Exemplo n.º 1
0
    def forward(self):
        """Build the input-embedding part of the model graph.

        Character ids (``self.ch`` / ``self.qh``) are embedded with
        ``self.char_mat``, passed through one convolution whose weights are
        shared between context and question, and max-reduced over axis 1.
        Word ids (``self.c`` / ``self.q``) are embedded with ``self.word_mat``.
        The two embeddings are concatenated on the last axis and fed through a
        highway network that is likewise shared between context and question.

        NOTE(review): ``regularizer``, ``conv`` and ``highway`` are not defined
        in this method; presumably they are module-level names -- confirm.
        """
        config = self.config
        # N: batch size (forced to 1 in demo mode), PL/QL: context/question
        # max length, CL: char limit, d: hidden size, dc: char embedding dim,
        # nh: number of heads (not used in the embedding layer below).
        N, PL, QL, CL, d, dc, nh = config.batch_size if not self.demo else 1, \
                                   self.c_maxlen, \
                                   self.q_maxlen, \
                                   config.char_limit, \
                                   config.hidden, \
                                   config.char_dim, \
                                   config.num_heads

        with tf.variable_scope('Input_Embedding_Layer', regularizer=regularizer):
            # ******************** char embedding *********************
            # Look up char vectors and fold the word axis into the batch axis:
            # result is [N * PL/QL, CL, dc].
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                shape=[N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                shape=[N * QL, CL, dc])
            # Char embeddings use half the dropout rate of the word embeddings.
            ch_emb = tf.nn.dropout(ch_emb, keep_prob=1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, keep_prob=1.0 - 0.5 * self.dropout)

            # BiDAF-style conv encoder. The second call passes reuse=True with
            # the same name, so context and question share the conv weights.
            # Presumably the output is [N * PL/QL, CL, d] -- confirm in conv().
            ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5, name='char_conv', reuse=None)
            qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5, name='char_conv', reuse=True)

            # [N * PL/QL, d]: max over axis 1 (the character-position axis).
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            # Unfold back to [N, PL/QL, d].
            # NOTE(review): the question reshape reads its last dimension from
            # ch_emb rather than qh_emb; both come out of the same shared conv,
            # so the sizes presumably match -- confirm.
            ch_emb = tf.reshape(ch_emb, shape=[N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, shape=[N, QL, ch_emb.shape[-1]])

            # *********************** Word embedding ************************
            # [N, PL/QL, dw]
            c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c),
                                  keep_prob=1.0 - self.dropout)
            q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q),
                                  keep_prob=1.0 - self.dropout)

            # Concatenate word and char embeddings on the feature axis:
            # [N, PL/QL, dw + d]
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            # Highway network, weights shared via scope='highway' + reuse=True.
            c_emb = highway(c_emb, size=d, scope='highway', dropout=self.dropout, reuse=None)
            q_emb = highway(q_emb, size=d, scope='highway', dropout=self.dropout, reuse=True)

            # Debug output of the static shapes at graph-construction time.
            print('highway, q_emb.shape: {}'.format(q_emb.shape))
            print('highway, c_emb.shape: {}'.format(c_emb.shape))

        """ *************************************Encoer ****************************************"""
		with tf.variable_scope('Encoder_Layer', regularizer=regularizer):
Exemplo n.º 2
0
    def _embed(self):
        """Create the word-embedding matrix and the highway-projected inputs.

        Under the ``word_char_embedding`` scope this sets ``self.word_mat``:

        * when ``config.fix_pretrained_vector`` is truthy, rows ``2:`` of
          ``vocab.word_embeddings`` become a non-trainable variable and rows
          ``:2`` a trainable one; the two are concatenated with the trainable
          rows first;
        * otherwise a single trainable variable is initialised from all of
          ``vocab.word_embeddings``.

        Under the ``Input_Embedding_Layer`` scope the context (``self.c``) and
        question (``self.q``) ids are looked up, dropped out and passed
        through one shared highway network; the results are stored in
        ``self.c_emb`` and ``self.q_emb``.
        """
        vocab = self.vocab

        with tf.variable_scope('word_char_embedding'):
            if not self.config.fix_pretrained_vector:
                # One fully trainable table initialised from the vocabulary.
                self.word_mat = tf.get_variable(
                    'word_embeddings',
                    shape=[vocab.word_size(), vocab.word_embed_dim],
                    initializer=tf.constant_initializer(vocab.word_embeddings),
                    trainable=True)
            else:
                # Rows from index 2 onward are kept frozen.
                frozen_init = tf.constant_initializer(
                    vocab.word_embeddings[2:], dtype=tf.float32)
                self.pretrained_word_mat = tf.get_variable(
                    "word_emb_mat",
                    shape=[vocab.word_size() - 2, vocab.word_embed_dim],
                    dtype=tf.float32,
                    initializer=frozen_init,
                    trainable=False)

                # The first two rows stay trainable (the variable name
                # suggests pad/unk tokens -- confirm against the vocabulary).
                special_init = tf.constant_initializer(
                    vocab.word_embeddings[:2], dtype=tf.float32)
                self.word_pad_unk_mat = tf.get_variable(
                    "word_unk_pad",
                    shape=[2, self.pretrained_word_mat.get_shape()[1]],
                    dtype=tf.float32,
                    initializer=special_init,
                    trainable=True)

                self.word_mat = tf.concat(
                    [self.word_pad_unk_mat, self.pretrained_word_mat], axis=0)

        # Only the hidden size is needed from the parameter tuple.
        _, _, _, d, _, _ = self._params()

        with tf.variable_scope("Input_Embedding_Layer"):
            c_lookup = tf.nn.embedding_lookup(self.word_mat, self.c)
            c_emb = tf.nn.dropout(c_lookup, keep_prob=1.0 - self.dropout)
            q_lookup = tf.nn.embedding_lookup(self.word_mat, self.q)
            q_emb = tf.nn.dropout(q_lookup, keep_prob=1.0 - self.dropout)

            # Same scope name plus reuse=True on the second call shares the
            # highway weights between context and question.
            shared = dict(size=d, scope="highway", dropout=self.dropout)
            self.c_emb = highway(c_emb, reuse=None, **shared)
            self.q_emb = highway(q_emb, reuse=True, **shared)
Exemplo n.º 3
0
def test_charcnn():
    """Smoke-run ``Char_CNN`` followed by ``HighwayMLP`` on a random tensor.

    Prints the shape of the final output; nothing is asserted.
    """
    config = {
        'batch_size': 8,
        'word_maxlen': 10,
        'char_emb_size': 5,
        'dropout_cnn': 0.1,
    }
    # Modules are built before the input is sampled, as in the original, so
    # the random-number stream is consumed in the same order.
    char_cnn = Char_CNN(config)
    highway_mlp = HighwayMLP(300)
    # NOTE(review): presumably (batch, words, chars per word, char emb) --
    # confirm against Char_CNN.
    batch = torch.rand((8, 7, 10, 5))
    projected = highway_mlp(char_cnn(batch))
    print(projected.shape)
Exemplo n.º 4
0
    def forward(self):
        """Build the model graph: embeddings, encoders, attention and output.

        Stages, each under its own variable scope:

        1. ``Input_Embedding_Layer`` -- char embeddings go through a shared
           convolution and a max over axis 1, are concatenated with the word
           embeddings and passed through a shared highway network.
        2. ``Embedding_Encoder_Layer`` -- ``QAnetEmbedding`` is applied to the
           embedded context and question.
        3. ``Context_to_Query_Attention_Layer`` -- ``DcnLayer`` is applied to
           the encoder output.
        4. ``Model_Encoder_Layer`` -- ``QaModelBlock`` is applied to the
           attended context.
        5. ``Output_Layer`` -- ``QAOutputLayer`` yields start/end logits.

        Side effects: sets ``self.yp1``, ``self.yp2`` and ``self.loss``; when
        ``config.decay`` is not None also sets ``self.var_ema`` and
        ``self.assign_vars``.

        NOTE(review): ``regularizer``, ``conv``, ``highway``, ``user_params``,
        ``QAnetEmbedding``, ``QaModelBlock``, ``QAOutputLayer`` and
        ``xavier_initializer`` are not defined here; presumably they are
        module-level names -- confirm.
        """
        config = self.config
        # N: batch size (1 in demo mode), PL/QL: context/question max length,
        # CL: char limit, d: hidden size, dc: char embedding dim, nh: heads.
        N = 1 if self.demo else config.batch_size
        PL, QL = self.c_maxlen, self.q_maxlen
        CL, d, dc, nh = (config.char_limit, config.hidden, config.char_dim,
                         config.num_heads)

        with tf.variable_scope("Input_Embedding_Layer"):
            # Fold the word axis into the batch axis: [N * PL/QL, CL, dc].
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            # Char embeddings use half the dropout rate of word embeddings.
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # BiDAF-style conv encoder; same name + reuse=True shares weights.
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)

            # Max over axis 1 (character positions).
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            # Unfold back to [N, PL/QL, features]. Each tensor reads its own
            # last dimension (the original read ch_emb's for both).
            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            # Word and char features side by side on the last axis.
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            # Highway network shared between context and question.
            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer",
                               initializer=xavier_initializer()):

            # NOTE(review): the imported name is not referenced below;
            # presumably the import is needed for its side effects -- confirm.
            from networks import QAnet_contextual_embedding
            # Keyword spellings (``transfromer_*``, ``tansformer_d_model``)
            # follow the callee's signature and must not be "corrected" here.
            params = user_params(procedure=None,
                                 label_name=None,
                                 learning_rate=None,
                                 embed_size=None,
                                 embedding_file_path=None,
                                 context_name="context",
                                 question_name="question",
                                 rnn_hidden_size=None,
                                 data_dir=None,
                                 model_dir=None,
                                 batch_size=None,
                                 drop_out_rate=self.dropout,
                                 p1=None,
                                 p2=None,
                                 feature_voc_file_path=None,
                                 gpu_cores_list=None,
                                 transfromer_conv_layers=4,
                                 transfromer_conv_kernel_size=7,
                                 transfromer_head_number=nh,
                                 tansformer_d_model=d,
                                 clip_norm=None,
                                 use_char_embedding=None,
                                 char_embedding_size=None,
                                 char_feature_name=None,
                                 char_question_name=None,
                                 example_max_length=None,
                                 enable_ema=None,
                                 ema_decay=None,
                                 char_filters=None,
                                 ans_limit=None)
            encoder = QAnetEmbedding(params, d, self.trainable)
            # Masks are sign(ids) with a trailing unit axis: 0.0 where the id
            # is 0 and 1.0 where it is positive (presumably 0 is the padding
            # id -- confirm).
            encoder_inputs = {
                params.context_name:
                c_emb,
                params.question_name:
                q_emb,
                "context_mask":
                tf.cast(tf.expand_dims(tf.sign(self.c), -1), tf.float32),
                "question_mask":
                tf.cast(tf.expand_dims(tf.sign(self.q), -1), tf.float32)
            }
            output = encoder(encoder_inputs)
            c = output[params.context_name]

        with tf.variable_scope("Context_to_Query_Attention_Layer",
                               initializer=xavier_initializer()):

            from networks.dcn import DcnLayer
            # DcnLayer reads these extra fields from ``params``.
            params.q_seq_len = self.q_maxlen
            params.sent_number = 1
            params.c_seq_len = self.c_maxlen
            params.cur_batch_size = tf.shape(c)[0]
            dcn = DcnLayer(params, d, self.trainable)
            output = dcn(output)

        with tf.variable_scope("Model_Encoder_Layer",
                               initializer=xavier_initializer()):

            block = QaModelBlock(params, 4 * d, self.trainable)
            block_inputs = {
                params.context_name:
                output[params.context_name],
                "context_mask":
                tf.cast(tf.expand_dims(tf.sign(self.c), -1), tf.float32),
                "question_mask":
                tf.cast(tf.expand_dims(tf.sign(self.q), -1), tf.float32)
            }
            output = block(block_inputs)

        with tf.variable_scope("Output_Layer",
                               initializer=xavier_initializer()):
            outlayer = QAOutputLayer(params,
                                     feature_size=d,
                                     is_trainning=self.trainable)
            logits1, logits2 = outlayer(output)
            # outer[b, i, j] = P(start = i) * P(end = j).
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            # Keep only the band i <= j <= i + ans_limit.
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        if config.l2_norm is not None:
            # NOTE(review): this applies ``regularizer`` to the entries of the
            # REGULARIZATION_LOSSES collection; confirm that is intended
            # rather than summing the already-collected losses.
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            # Evaluating the loss now also triggers the EMA update.
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                # NOTE(review): these assign ops are created inside the
                # control-dependency block, so they also depend on ema_op --
                # confirm that is intended.
                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # average() returns None for variables without a shadow;
                    # compare against None instead of relying on the
                    # truthiness of a Variable object.
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))
Exemplo n.º 5
0
Arquivo: model.py Projeto: txye/QANet
    def forward(self):
        """Build the QANet graph: embeddings, encoders, attention and output.

        Stages, each under its own variable scope:

        1. ``Input_Embedding_Layer`` -- char embeddings go through a shared
           convolution and a max over axis 1, are concatenated with the word
           embeddings and passed through a shared highway network.
        2. ``Embedding_Encoder_Layer`` -- one ``residual_block`` whose weights
           are shared between context and question (``reuse=True``).
        3. ``Context_to_Query_Attention_Layer`` -- a similarity matrix ``S``
           is softmax-normalised along two different axes to produce
           ``self.c2q`` and ``self.q2c``.
        4. ``Model_Encoder_Layer`` -- the same ``residual_block`` is applied
           three times in sequence; results are collected in ``self.enc``.
        5. ``Output_Layer`` -- start logits from ``enc[1], enc[2]`` and end
           logits from ``enc[1], enc[3]``.

        Side effects: sets ``self.c2q``, ``self.q2c``, ``self.enc``,
        ``self.logits``, ``self.yp1``, ``self.yp2`` and ``self.loss``; when
        ``config.decay`` is not None also sets ``self.var_ema`` and
        ``self.assign_vars``.

        NOTE(review): ``regularizer``, ``conv``, ``highway``,
        ``residual_block``, ``optimized_trilinear_for_attention`` and
        ``mask_logits`` are not defined here; presumably they are module-level
        names -- confirm.
        """
        config = self.config
        # N: batch size (1 in demo mode), PL/QL: context/question max length,
        # CL: char limit, d: hidden size, dc: char embedding dim, nh: heads.
        N = 1 if self.demo else config.batch_size
        PL, QL = self.c_maxlen, self.q_maxlen
        CL, d, dc, nh = (config.char_limit, config.hidden, config.char_dim,
                         config.num_heads)

        with tf.variable_scope("Input_Embedding_Layer"):
            # Fold the word axis into the batch axis: [N * PL/QL, CL, dc].
            ch_emb = tf.reshape(tf.nn.embedding_lookup(
                self.char_mat, self.ch), [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(
                self.char_mat, self.qh), [N * QL, CL, dc])
            # Char embeddings use half the dropout rate of word embeddings.
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # BiDAF-style conv encoder; same name + reuse=True shares weights.
            ch_emb = conv(ch_emb, d,
                          bias=True, activation=tf.nn.relu, kernel_size=5,
                          name="char_conv", reuse=None)
            qh_emb = conv(qh_emb, d,
                          bias=True, activation=tf.nn.relu, kernel_size=5,
                          name="char_conv", reuse=True)

            # Max over axis 1 (character positions).
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            # Unfold back to [N, PL/QL, features]. Each tensor reads its own
            # last dimension (the original read ch_emb's for both).
            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            # Word and char features side by side on the last axis.
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            # Highway network shared between context and question.
            c_emb = highway(c_emb, size=d, scope="highway",
                            dropout=self.dropout, reuse=None)
            q_emb = highway(q_emb, size=d, scope="highway",
                            dropout=self.dropout, reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(q_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.q_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.q_len,
                               scope="Encoder_Residual_Block",
                               # Share the weights between passage and question
                               reuse=True,
                               bias=False,
                               dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # Similarity matrix; presumably [batch, c_len, q_len] -- confirm
            # against optimized_trilinear_for_attention.
            S = optimized_trilinear_for_attention(
                [c, q], self.c_maxlen, self.q_maxlen,
                input_keep_prob=1.0 - self.dropout)
            # Softmax over the last axis with the question mask applied.
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            # Softmax over axis 1 with the context mask, then transposed.
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                # The first pass creates the weights; later passes reuse them.
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope("Output_Layer"):
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1, bias=False, name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1, bias=False, name="end_pointer"), -1)
            self.logits = [mask_logits(start_logits, mask=self.c_mask),
                           mask_logits(end_logits, mask=self.c_mask)]

            logits1, logits2 = self.logits

            # outer[b, i, j] = P(start = i) * P(end = j).
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            # Keep only the band i <= j <= i + ans_limit.
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits1, labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits2, labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        if config.l2_norm is not None:
            # NOTE(review): this applies ``regularizer`` to the entries of the
            # REGULARIZATION_LOSSES collection; confirm that is intended
            # rather than summing the already-collected losses.
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            # Evaluating the loss now also triggers the EMA update.
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                # NOTE(review): these assign ops are created inside the
                # control-dependency block, so they also depend on ema_op --
                # confirm that is intended.
                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # average() returns None for variables without a shadow;
                    # compare against None instead of relying on the
                    # truthiness of a Variable object.
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))
Exemplo n.º 6
0
    def forward(self):
        config = self.config
        N, PL, QL, CL, d, dc, nh = config.batch_size if not self.demo else 1, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.num_heads
        d_cell = tf.contrib.rnn.BasicLSTMCell(d,
                                              forget_bias=1.0,
                                              state_is_tuple=True)
        with tf.variable_scope("Input_Embedding_Layer"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)

            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            print "ch_emb before", ch_emb.shape[-1]
            print "qh_emb before", qh_emb.shape[-1]

            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])
            print "N", N, "PL", PL, "QL", QL
            print "ch_emb", ch_emb.shape
            print "qh_emb", qh_emb.shape
            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)
            print "c_emb high", c_emb.shape
            print "q_emb high", q_emb.shape

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c_tmp = residual_block(c_emb,
                                   num_blocks=1,
                                   num_conv_layers=4,
                                   kernel_size=7,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Encoder_Residual_Block",
                                   bias=False,
                                   dropout=self.dropout)

            # c_cell = tf.contrib.rnn.BasicLSTMCell(d, forget_bias=1.0, state_is_tuple=True)
            c = drnn(d_cell, c_tmp, d)

            q_tmp = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)
            # q_cell = tf.contrib.rnn.BasicLSTMCell(d, forget_bias=1.0, state_is_tuple=True)
            q = drnn(d_cell, q_tmp, d)
            print "embd enc output c", c.shape
            print "embd enc output q", q.shape
            # exit()

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]
            print "enc len", len(self.enc)
            # print self.ch_len.shape
            # print self.qh_len.shape
            # print self.c_len.shape
            # print self.q_len.shape

            # print ip_len.shape
            print "qh shape", self.qh.shape
            print "qh type", self.qh.dtype
            print "ip shape", inputs.shape
            print "ip type", inputs.dtype
            ip_len = tf.reshape(
                tf.reduce_sum(tf.cast(tf.cast(inputs, tf.bool), tf.float32),
                              axis=2), [-1])
            print "ip_len", ip_len.shape

            # fw0 = drnn(d_cell, self.enc[0], d)
            # f_cell = tf.contrib.rnn.BasicLSTMCell(fw0[2], forget_bias=1.0, state_is_tuple=True)
            # fw1 = drnn(d_cell, fw0, d)
            # fw2 = drnn(d_cell, fw1, d)
            # self.enc.append(fw0)
            # self.enc.append(fw1)
            # self.enc.append(fw2)
            # print "fw1 shape", fw1
            #
            # (fw0, bw0), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
            #                   initial_state_fw=None, initial_state_bw=None,
            #                   dtype=None, parallel_iterations=None,
            #                   swap_memory=False, time_major=False, scope=None):

            # bw_cell = tf.contrib.rnn.BasicLSTMCell(d, forget_bias=1.0, state_is_tuple=True)
            # g0 = bidirlstm(fw_cell, bw_cell, inputs, d)
            # g1 = bidirlstm(fw_cell, bw_cell, g0, d)
            # g2 = bidirlstm(fw_cell, bw_cell, g1, d)
            # fw0 = bidirlstm(d_cell, d_cell, inputs, d)
            # d_cell1 = tf.contrib.rnn.BasicLSTMCell(fw0[1], forget_bias=1.0, state_is_tuple=True)
            # fw1 = bidirlstm(d_cell1, d_cell1, fw0, d)
            # (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(d_cell, d_cell, self.enc[0], dtype='float', scope='g0')  # [N, M, JX, 2d]
            # g0 = tf.concat([fw_g0, bw_g0], 4)

            # (fw_g1, bw_g1) = bidirectional_dynamic_rnn(d_cell, d_cell, fw_g0, dtype='float', scope='g1')  # [N, M, JX, 2d]
            # print "fw_g0", fw_g0.shape
            # print "bw_g0", bw_g0.shape

            # print g0.shape
            # (fw_g1, bw_g1), _ = bidirlstm(d_cell, d_cell, g0, dtype='float', scope='g1')  # [N, M, JX, 2d]
            # g1 = tf.concat([fw_g1, bw_g1], 3)
            # flat_output_fw = nest.flatten(fw_g0)
            # flat_output_bw = nest.flatten(bw_g0)

            # flat_outputs = tuple(array_ops.concat(1, [fw, bw])
            #                     for fw, bw in zip(flat_output_fw, flat_output_bw))

            # outputs = nest.pack_sequence_as(structure=output_fw,
            # flat_sequence=flat_outputs)
            # print "output", outputs.shape

            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    drnn(
                        d_cell,
                        residual_block(self.enc[i],
                                       num_blocks=7,
                                       num_conv_layers=2,
                                       kernel_size=5,
                                       mask=self.c_mask,
                                       num_filters=d,
                                       num_heads=nh,
                                       seq_len=self.c_len,
                                       scope="Model_Encoder",
                                       bias=False,
                                       reuse=True if i > 0 else None,
                                       dropout=self.dropout), d))
            # print "enc[0] shape", self.enc[0].shape
            print "chalala"
            # exit()

        with tf.variable_scope("Output_Layer"):
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)
            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = [l for l in self.logits]

            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    if v:
                        self.assign_vars.append(tf.assign(var, v))
Exemplo n.º 7
0
    def forward(self):
        """Build the candidate-scoring graph with ELMo-augmented embeddings.

        Graph stages, in construction order:

        1. ``Input_Embedding_Layer``: ELMo weighted layers, a character CNN
           and word-embedding lookups for context (``c``), question (``q``)
           and candidates (``x``), concatenated and passed through a shared
           highway network.
        2. ``Embedding_Encoder_Layer``: one shared residual encoder block
           applied to context and question.
        3. ``Context_to_Query_Attention_Layer``: trilinear similarity and the
           two attention directions.
        4. ``Output_Layer``: candidate-to-context attention, optional
           candidate condensing, and either a softmax cross-entropy loss or
           a max-margin loss, selected by ``config.max_margin``.
        5. Optional L2 regularisation (``config.l2_norm``) and an exponential
           moving average over trainable variables (``config.decay``).

        Sets ``self.c2q``, ``self.q2c``, ``self.x2c``, ``self.cand_condense``,
        ``self.cand_logits`` and ``self.loss``; when ``config.decay`` is not
        None also sets ``self.var_ema`` and ``self.assign_vars``.  Appends
        intermediate tensors to ``self.debug_ops``.

        Raises:
            NotImplementedError: if both ``config.cand_condense_vector`` and
                ``config.cand_fuse_vector`` are truthy.
        """
        config = self.config
        N = config.batch_size if not self.demo else 1
        PL = self.c_maxlen
        QL = self.q_maxlen
        XL = self.x_maxlen

        # DEBUG
        self.debug_ops.extend([PL, QL, XL])

        CL = config.char_limit  # 16
        d = config.hidden       # 96
        dc = config.char_dim    # 64
        nh = config.num_heads   # 1

        with tf.variable_scope("Input_Embedding_Layer"):
            # Character-id inputs (shapes as recorded by the original author):
            #     self.ch : (N, c_maxlen, 16)
            #     self.qh : (N, q_maxlen, 16)
            #     self.xh : (N, x_maxlen, 16)

            ######################################
            # get elmo embeddings
            ######################################
            # NOTE(review): the ELMo model directory is hard-coded; consider
            # moving it into config.
            datadir = "/data/elmo_experiment_20180906/20180906_model"
            vocab_file = os.path.join(datadir, 'vocab-2016-09-10.txt')
            options_file = os.path.join(datadir, 'options.json')
            weight_file = os.path.join(datadir, 'weights.hdf5')
            print(vocab_file)
            print(options_file)
            print(weight_file)

            # Create a Batcher to map text to character ids.
            # NOTE(review): `batcher` is not used anywhere below. It is kept
            # because constructing it presumably loads vocab_file; confirm
            # whether it can be dropped.
            batcher = Batcher(vocab_file, 50)

            # Input placeholders to the biLM.
            #context_character_ids = tf.placeholder('int32', shape=(None, None, 50))
            #question_character_ids = tf.placeholder('int32', shape=(None, None, 50))

            # Build the biLM graph.
            bilm = BidirectionalLanguageModel(options_file, weight_file)

            # Get ops to compute the LM embeddings.
            print(self.c)
            print(self.c.shape)
            #print(self.ch)
            #print(self.ch.shape)
            print(self.c_elmo)
            print(self.c_elmo.shape)
            print(self.q_elmo)
            print(self.q_elmo.shape)
            print(self.x_elmo)
            print(self.x_elmo.shape)

            context_embeddings_op = bilm(self.c_elmo)
            question_embeddings_op = bilm(self.q_elmo)
            candidate_embeddings_op = bilm(self.x_elmo)

            # Get an op to compute ELMo (weighted average of the internal biLM layers)
            # Our SQuAD model includes ELMo at both the input and output layers
            # of the task GRU, so we need 4x ELMo representations for the question
            # and context at each of the input and output.
            # We use the same ELMo weights for both the question and context
            # at each of the input and output.
            #
            # NOTE(review): only the 'output' weighted ops are consumed below;
            # the 'input' ones are built but never read in this method. They
            # are kept so the set of created variables does not change.
            elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)
            with tf.variable_scope('', reuse=True):
                # the reuse=True scope reuses the context weights for the
                # question and the candidates
                elmo_question_input = weight_layers(
                    'input', question_embeddings_op, l2_coef=0.0
                )
                elmo_candidate_input = weight_layers(
                    'input', candidate_embeddings_op, l2_coef=0.0
                )

            elmo_context_output = weight_layers(
                'output', context_embeddings_op, l2_coef=0.0
            )
            with tf.variable_scope('', reuse=True):
                # the reuse=True scope reuses the context weights for the
                # question and the candidates
                elmo_question_output = weight_layers(
                    'output', question_embeddings_op, l2_coef=0.0
                )
                elmo_candidate_output = weight_layers(
                    'output', candidate_embeddings_op, l2_coef=0.0
                )

            # Character embeddings, flattened so that every word is one row.
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc]) #(N*PL,16,64)
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc]) #(N*QL,16,64)
            xh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.xh), [N * XL, CL, dc]) #(N*XL,16,64)

            # Character embeddings use half the dropout rate of the word embeddings.
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)
            xh_emb = tf.nn.dropout(xh_emb, 1.0 - 0.5 * self.dropout)

            # BiDAF style conv-highway encoder: conv over chars in each word in a batch of passages.
            # The first call creates "char_conv"; the other two reuse its weights.
            ch_emb = conv(ch_emb, d, bias = True, activation = tf.nn.relu, kernel_size = 5,
                          name = "char_conv", reuse = None) # (N*c_maxlen, 16-5+1, 96)
            qh_emb = conv(qh_emb, d, bias = True, activation = tf.nn.relu, kernel_size = 5,
                          name = "char_conv", reuse = True) # (N*q_maxlen, 16-5+1, 96)
            xh_emb = conv(xh_emb, d, bias = True, activation = tf.nn.relu, kernel_size = 5,
                          name="char_conv", reuse=True)  # (N*x_maxlen, 16-5+1, 96)

            # Max Pooling
            ch_emb = tf.reduce_max(ch_emb, axis = 1) # (N*c_maxlen, 96)
            qh_emb = tf.reduce_max(qh_emb, axis = 1) # (N*q_maxlen, 96)
            xh_emb = tf.reduce_max(xh_emb, axis = 1) # (N*x_maxlen, 96)

            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]]) # (N, c_maxlen, 96)
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]]) # (N, q_maxlen, 96)
            xh_emb = tf.reshape(xh_emb, [N, XL, xh_emb.shape[-1]]) # (N, x_maxlen, 96)

            # Word-id inputs (shapes as recorded by the original author):
            #     self.c : (N, c_maxlen)
            #     self.q : (N, q_maxlen)
            #     self.x : (N, x_maxlen)
            #print(self.c)
            #print(self.q)
            #print(self.x)

            c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout)#(N,c_maxlen,300)
            q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout)#(N,q_maxlen,300)
            x_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.x), 1.0 - self.dropout)#(N,x_maxlen,300)

            #c_emb_elmo =
            #q_emb_elmo =
            #x_emb_elmo =

            # Word embedding followed by character embedding.
            c_emb = tf.concat([c_emb, ch_emb], axis=2) # (N, c_maxlen, 396)
            q_emb = tf.concat([q_emb, qh_emb], axis=2) # (N, q_maxlen, 396)
            x_emb = tf.concat([x_emb, xh_emb], axis=2) # (N, x_maxlen, 396)

            print(c_emb)
            print(c_emb.shape)

            # Prepend the ELMo 'output' representation.
            c_emb = tf.concat([elmo_context_output['weighted_op'], c_emb], axis=2) # (N, c_maxlen, 1024 + 396)
            q_emb = tf.concat([elmo_question_output['weighted_op'], q_emb], axis=2) # (N, q_maxlen, 1024 + 396)
            x_emb = tf.concat([elmo_candidate_output['weighted_op'], x_emb], axis=2) # (N, x_maxlen, 1024 + 396)

            print(c_emb)
            print(c_emb.shape)

            # Shared highway network: created for the context, reused for question and candidates.
            c_emb = highway(c_emb, size = d, scope = "highway", dropout = self.dropout, reuse = None)#(N,c_maxlen,96)
            q_emb = highway(q_emb, size = d, scope = "highway", dropout = self.dropout, reuse = True)#(N,q_maxlen,96)
            x_emb = highway(x_emb, size = d, scope = "highway", dropout = self.dropout, reuse = True)#(N,x_maxlen,96)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            # Per the original author's notes, each block is:
            #     -> positional encoding
            #     -> layer_normalization
            #     -> depth-wise separable convolution
            #     -> self attention
            #     -> feed forward network
            # In the paper: The total number of encoder blocks is 1

            # (N, c_maxlen, 96)
            c = residual_block(c_emb,
                num_blocks = 1,
                num_conv_layers = 4,
                kernel_size = 7,
                mask = self.c_mask,
                num_filters = d,
                num_heads = nh,
                seq_len = self.c_len,
                scope = "Encoder_Residual_Block",
                bias = False,
                dropout = self.dropout)
            # (N, q_maxlen, 96)
            q = residual_block(q_emb,
                num_blocks = 1,
                num_conv_layers = 4,
                kernel_size = 7,
                mask = self.q_mask,
                num_filters = d,
                num_heads = nh,
                seq_len = self.q_len,
                scope = "Encoder_Residual_Block",
                reuse = True, # Share the weights between passage and question
                bias = False,
                dropout = self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # tf.tile(input, multiples, name=None): creates a new tensor by replicating input multiples times.
            #     The output tensor's i'th dimension has input.dims(i) * multiples[i] elements,
            #     and the values of input are replicated multiples[i] times along the 'i'th dimension.
            # Paper: The layer parameters are the same as the Embedding Encoder Layer
            #        except that convolution layer number is 2 within a block
            #        and the total number of blocks is 7
            #
            # Shapes (as recorded by the original author):
            #     c:        (N, c_maxlen, d)
            #     q:        (N, q_maxlen, d)
            #     ch_emb:   (N, c_maxlen, d)
            #     qh_emb:   (N, q_maxlen, d)
            #     C:        (N, c_maxlen, q_maxlen, d)
            #     Q:        (N, c_maxlen, q_maxlen, d)
            #     S:        (N, c_maxlen, q_maxlen)
            #     mask_q:   (N, 1, q_maxlen)
            #     mask_c:   (N, c_maxlen, 1)
            #     S_:       (N, c_maxlen, q_maxlen)
            #     S_T:      (N, q_maxlen, c_maxlen)
            #     self.c2q: (N, c_maxlen, d) = tf.matmul(S_, q)
            #     self.q2c: (N, c_maxlen, d) = tf.matmul(tf.matmul(S_, S_T), c)

            # change from commit f0c79cc93dc1dfdad2bc8abb712a53d078814a56 by Min on 27 Apr 18
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)

            # optimization from jasonwbw
            S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                                  input_keep_prob=1.0 - self.dropout)

            # Softmax over the question axis (the default last axis), question mask applied.
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask = mask_q))
            # Softmax over the context axis, context mask applied, then transposed.
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask = mask_c), axis = 1),(0,2,1))

            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)

            # change from commit f0c79cc93dc1dfdad2bc8abb712a53d078814a56 by Min on 27 Apr 18
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]
            # if config.q2c:
            #     attention_outputs.append(c * self.q2c)

        # The QANet model-encoder stack is disabled in this variant; the
        # attention outputs feed the candidate-scoring output layer directly.
        # with tf.variable_scope("Model_Encoder_Layer"):
        #     inputs = tf.concat(attention_outputs, axis = -1)
        #
        #     # same as a dxd MLP layer
        #     self.enc = [conv(inputs, d, name = "input_projection")] # d=hidden=96
        #
        #     for i in range(3):
        #         if i % 2 == 0: # dropout every 2 blocks
        #             self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
        #         self.enc.append(
        #             residual_block(self.enc[i],
        #                 num_blocks = 7,
        #                 num_conv_layers = 2,
        #                 kernel_size = 5,
        #                 mask = self.c_mask,
        #                 num_filters = d,
        #                 num_heads = nh,
        #                 seq_len = self.c_len,
        #                 scope = "Model_Encoder",
        #                 bias = False,
        #                 reuse = True if i > 0 else None,
        #                 dropout = self.dropout)
        #             )

            # DEBUG
            # self.debug_ops.append(inputs)
            # self.debug_ops.extend(self.enc)

        with tf.variable_scope("Output_Layer"):
            # broadcasting: dimensions with size 1 are stretched or "copied" to match the other
            #
            # Shapes (as recorded by the original author):
            #     x_emb:              (N, x_maxlen, d)
            #     inputs:             (N, c_maxlen, 4*d)
            #     mask_x:             (N, x_maxlen, 1)
            #     c_proj:             (N, c_maxlen, d)
            #     S_xc/S_xc_:         (N, x_maxlen, c_maxlen)
            #     x2c:                (N, x_maxlen, d)
            #     xp_exp:             (N, x_maxlen, c_maxlen, 1)
            #     c_proj_exp:         (N, 1, c_maxlen, d)
            #     cand_context:       (N, x_maxlen, c_maxlen, d)
            #     cand_context_pool:  (N, x_maxlen, d)
            #     cand_condense:      (N, x_maxlen, d*2)
            #     self.cand_condense: (N, x_maxlen, d)
            #     self.cand_logits:   (N, x_maxlen, 1)
            inputs = tf.concat(attention_outputs, axis = -1)

            # masking candidate embedding
            mask_x = tf.expand_dims(self.x_mask, 2)
            c_proj = conv(inputs, d, name="context_projection")

            # Candidate-to-context similarity, softmax over the last axis.
            # NOTE(review): the mask applied here is the candidate mask
            # (mask_x), not the context mask; confirm this is intended.
            S_xc = optimized_trilinear_for_attention([x_emb, c_proj], self.x_maxlen, self.c_maxlen,
                                                  input_keep_prob=1.0 - self.dropout)
            S_xc_ = tf.nn.softmax(mask_logits(S_xc, mask = mask_x))

            self.x2c = tf.matmul(S_xc_, c_proj)

            # Default candidate representation; replaced below when condensing is enabled.
            self.cand_condense = self.x2c

            if self.config.cand_condense_vector:
                # Weight the projected context by self.xp, broadcasting over candidates.
                xp_exp = tf.expand_dims(self.xp, axis=-1)
                c_proj_exp = tf.expand_dims(c_proj, axis=1)
                cand_context = tf.multiply(c_proj_exp, xp_exp)

                if self.config.cand_condense_conv:
                    # Fold candidates into the batch axis so conv runs over the context axis.
                    cand_context = tf.reshape(cand_context, [N*XL, PL, d])
                    cand_context = conv(cand_context, d, bias=True, activation=tf.nn.relu,
                                        kernel_size=3, name="candidate_from_context")
                    cand_context = tf.reshape(cand_context, [N, XL, -1, d])

                # Pool over the context axis: max or mean.
                if self.config.cand_condense_pool:
                    cand_context_pool = tf.reduce_max(cand_context, axis=-2)
                else:
                    cand_context_pool = tf.reduce_mean(cand_context, axis=-2)

                cand_condense = tf.concat([self.x2c, cand_context_pool], axis = -1)
                self.cand_condense = conv(cand_condense, d, name="candidate_projection")

                if self.config.cand_fuse_vector:
                    raise NotImplementedError

                # DEBUG
                self.debug_ops.extend([xp_exp, c_proj_exp, cand_context, cand_context_pool,
                                       cand_condense, self.cand_condense])

            if not config.max_margin:
                # Softmax cross-entropy over the masked candidate logits.
                cand_logits = tf.squeeze(conv(self.cand_condense, 1, bias=False, name="candidate_logits_1"), -1)
                self.cand_logits = mask_logits(cand_logits, mask=self.x_mask)
                loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.cand_logits, labels=self.yx)
                # DEBUG
                self.debug_ops.extend([loss, x_emb, c_proj, S_xc, S_xc_, self.x2c,
                                       self.x_mask, self.cand_logits, self.yx])
            else:
                # Max-margin: sigmoid scores from a two-layer scorer, then a
                # hinge loss between the best positive (self.yx) and the best
                # negative (self.yx_inv): max(neg - pos + margin, 0).
                cand_logits = conv(self.cand_condense, 1, bias=False, name="candidate_logits_1")
                cand_logits = tf.tanh(cand_logits)
                cand_logits = tf.squeeze(conv(cand_logits, 1, bias=False, name="candidate_logits_2"), -1)
                self.cand_logits = tf.sigmoid(cand_logits)
                pos = tf.multiply(self.cand_logits, self.yx)
                pos = tf.reduce_max(pos, axis=-1)
                negs = tf.multiply(self.cand_logits, self.yx_inv)
                neg = tf.reduce_max(negs, axis=-1)
                loss = tf.maximum(tf.add(tf.subtract(neg, pos), config.margin), 0.0)
                # DEBUG (self.yx appears twice; left as-is so positions in
                # debug_ops do not shift)
                self.debug_ops.extend([loss, x_emb, c_proj, S_xc, S_xc_, self.x2c,
                                       self.x_mask, self.cand_logits, self.yx,
                                       pos, negs, neg, self.yx, self.yx_inv])

            self.loss = tf.reduce_mean(loss)

        # The span-pointer output layer is disabled in this variant.
        # with tf.variable_scope("Output_Layer"):
        #     '''
        #         tf.matrix_band_part: Copy a tensor setting everything outside a central band
        #                              in each innermost matrix to zero.
        #         self.enc[i]:  (N, c_maxlen, d)
        #         start_logits: (N, c_maxlen)
        #         end_logits:   (N, c_maxlen)
        #         logits1:      (N, c_maxlen)
        #         logits2:      (N, c_maxlen)
        #         outer:        (N, c_maxlen, c_maxlen)
        #         self.c_mask:  (N, c_maxlen)
        #         yp1, yp2, losses, losses2: (N,)
        #     '''
        #
        #     # map vectors to scalars
        #     start_logits = tf.squeeze(conv(tf.concat([self.enc[1], self.enc[2]],axis = -1),1,
        #                                    bias = False, name = "start_pointer"),-1)
        #     end_logits = tf.squeeze(conv(tf.concat([self.enc[1], self.enc[3]],axis = -1),1,
        #                                  bias = False, name = "end_pointer"), -1)
        #     self.logits = [mask_logits(start_logits, mask = self.c_mask), mask_logits(end_logits, mask = self.c_mask)]
        #
        #     logits1, logits2 = [l for l in self.logits]
        #
        #     losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1)
        #     losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2)
        #     self.loss = tf.reduce_mean(losses + losses2)
        #
        #     # find max-score span
        #     outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
        #                       tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        #     # change from commit f0c79cc93dc1dfdad2bc8abb712a53d078814a56 by Min on 27 Apr 18
        #     outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        #     self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        #     self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        #
        #     # DEBUG
        #     self.debug_ops.extend([start_logits, end_logits, logits1, logits2,
        #                            outer, self.yp1, self.yp2, losses, losses2, self.loss])

        if config.l2_norm is not None:
            # Add the regularisation term built from the REGULARIZATION_LOSSES collection.
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            # Everything created inside this context depends on ema_op, so
            # evaluating self.loss also updates the moving averages.
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                # change from commit f0c79cc93dc1dfdad2bc8abb712a53d078814a56 by Min on 27 Apr 18
                self.assign_vars = []
                # self.shadow_vars = []
                # self.global_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # average() returns None for variables without a moving
                    # average; test identity rather than truthiness.
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))
Exemplo n.º 8
0
    def forward(self):
        """Build the QANet graph: embeddings, encoders, attention, span output.

        Graph stages, in construction order:

        1. ``Input_Embedding_Layer``: character CNN plus word-embedding
           lookup, concatenated and passed through a shared highway network.
        2. ``Embedding_Encoder_Layer``: one shared residual encoder block
           applied to context and question.
        3. ``Context_to_Query_Attention_Layer``: trilinear similarity and the
           two attention directions.
        4. ``Model_Encoder_Layer``: three passes through one weight-shared
           stack of encoder blocks.
        5. ``Output_Layer``: start/end logits, best-span prediction and the
           cross-entropy loss.
        6. Optional L2 regularisation (``config.l2_norm``) and an exponential
           moving average over trainable variables (``config.decay``).

        Sets ``self.c2q``, ``self.q2c``, ``self.enc``, ``self.logits``,
        ``self.yp1``, ``self.yp2`` and ``self.loss``; when ``config.decay`` is
        not None also sets ``self.var_ema`` and ``self.assign_vars``.
        """
        config = self.config
        # N:  batch size (forced to 1 in demo mode)
        # PL: maximum passage length
        # QL: maximum question length
        # CL: maximum number of characters per word
        # d:  number of output channels (hidden size)
        # dc: character embedding dimension
        # nh: number of self-attention heads
        N = config.batch_size if not self.demo else 1
        PL, QL = self.c_maxlen, self.q_maxlen
        CL = config.char_limit
        d, dc, nh = config.hidden, config.char_dim, config.num_heads

        # Embedding layer: concatenation of word vectors and character vectors.
        with tf.variable_scope("Input_Embedding_Layer"):
            # Character embedding.
            # 1. Embed every character of every word; each word becomes one
            #    row of shape [CL, dc].
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            # Character embeddings use half the dropout rate of the word embeddings.
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # 2. Encode each word's character matrix with a convolution.
            #    Per the original notes: ch_emb -> [N * PL, CL-5+1, d],
            #    qh_emb -> [N * QL, CL-5+1, d].
            #    The first call creates "char_conv"; the second reuses it.
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)

            # Max-pool over the character axis:
            # ch_emb -> [N * PL, d], qh_emb -> [N * QL, d]
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)
            # Restore the batch axis: ch_emb -> [N, PL, d], qh_emb -> [N, QL, d]
            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            # Use qh_emb's own channel size (same value as ch_emb's).
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

            # Word embedding: lookup in self.word_mat (GloVe according to the
            # original comment).
            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            # Concatenate word and character vectors along the feature axis.
            # c_emb: [batch, n_c, word_dim + d]
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            # q_emb: [batch, n_q, word_dim + d]
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            # Shared highway network: created for the context, reused for the
            # question.
            # c_emb: [batch, n_c, d]
            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            # q_emb: [batch, n_q, d]
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)

        # Embedding encoder: 1 encoder block with 4 conv layers, kernel size 7
        # and d filters.
        with tf.variable_scope("Embedding_Encoder_Layer"):
            # c: [batch, n_c, d]
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            # q: [batch, n_q, d]
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # share the encoder weights between passage and question
                bias=False,
                dropout=self.dropout)

        # Context-query attention.
        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
            # S: [batch, n_c, n_q], q: [batch, n_q, d], c: [batch, n_c, d]
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            # Softmax along the question axis (n_q).
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q), dim=-1)
            mask_c = tf.expand_dims(self.c_mask, 2)
            # Softmax along the context axis (n_c), then transpose.
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            # c2q: [batch, n_c, d]
            self.c2q = tf.matmul(S_, q)
            # q2c: [batch, n_c, d]
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            # Four tensors, each [batch, n_c, d].
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        # Model encoder: a stack of 7 encoder blocks with 2 conv layers each,
        # kernel size 5 and d filters.
        with tf.variable_scope("Model_Encoder_Layer"):
            # Concatenate c, c2q, c * c2q and c * q2c along the channel axis.
            # inputs: [batch, n_c, 4d]
            inputs = tf.concat(attention_outputs, axis=-1)
            # Each self.enc[i]: [batch, n_c, d]
            self.enc = [conv(inputs, d, name="input_projection")]
            # Three passes through the same stack.
            for i in range(3):
                if i % 2 == 0:  # dropout on every second pass
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(
                        self.enc[i],
                        num_blocks=7,
                        num_conv_layers=2,
                        kernel_size=5,
                        mask=self.c_mask,
                        num_filters=d,
                        num_heads=nh,
                        seq_len=self.c_len,
                        scope="Model_Encoder",
                        bias=False,
                        # The first pass creates the weights; later passes reuse them.
                        reuse=True if i > 0 else None,
                        dropout=self.dropout))

        # Output layer.
        with tf.variable_scope("Output_Layer"):
            # Start pointer: concatenate the first and second model-encoder
            # outputs along channels and project to one channel.
            # start_logits after squeeze: [batch, n_c]
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            # End pointer: same, with the first and third model-encoder outputs.
            # end_logits after squeeze: [batch, n_c]
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)

            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = [l for l in self.logits]

            # outer[b, i, j] = p_start[b, i] * p_end[b, j]; [batch, n_c, n_c]
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            # Keep only entries with 0 <= j - i <= ans_limit (end not before
            # start, span no longer than ans_limit); zero everything else.
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            # Row index of the maximum: predicted start position.
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            # Column index of the maximum: predicted end position.
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        # L2 regularisation.
        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            # Ops created inside this context run after ema_op, so evaluating
            # self.loss also updates the moving averages.
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # average() returns None for variables without a moving
                    # average; test identity rather than truthiness.
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))
Exemplo n.º 9
0
    def forward(self):
        """Build the model graph: embeddings, encoders, attention, output and loss.

        Reads the input tensors already stored on ``self`` (``ch``, ``qh``,
        ``c``, ``q``, the masks and lengths, ``y1``/``y2``, ``no_answer``,
        ``weights1``/``weights2``) and sets ``self.c2q``, ``self.q2c``,
        ``self.enc``, ``self.logits``, ``self.yp1``, ``self.yp2`` and
        ``self.loss``.  When ``config.decay`` is set it additionally creates
        ``self.var_ema`` and ``self.assign_vars``.
        """
        config = self.config
        N = 1 if self.demo else config.batch_size
        PL, QL = self.c_maxlen, self.q_maxlen
        CL, d, dc, nh = (config.char_limit, config.hidden, config.char_dim,
                         config.num_heads)

        with tf.variable_scope("Input_Embedding_Layer"):
            # Character embeddings, flattened to [N * PL (or QL), CL, dc].
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])

            # Character embeddings get half of the configured dropout rate.
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # BiDAF-style character convolution; the question branch reuses
            # the "char_conv" weights created by the context branch.
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)

            # Max over the character axis gives one vector per word.
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            # Back to [N, PL (or QL), feature]. The question branch reads the
            # feature size from ch_emb, as the original code did.
            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

            # Word embeddings with the full dropout rate.
            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            # Concatenate word-level and character-level features.
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # Similarity matrix between every context and question position.
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            # Softmax over the last axis with the question mask applied.
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            # Softmax over axis 1 with the context mask applied, transposed.
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]
            # Three passes through the same encoder stack (weights are reused
            # after the first pass); each pass appends its output to self.enc.
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope("Output_Layer"):
            # One logit per context position for the span start ...
            conv1 = conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                         1,
                         bias=False,
                         name="start_pointer")
            start_logits = tf.squeeze(conv1, -1)
            # ... and for the span end.
            conv2 = conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                         1,
                         bias=False,
                         name="end_pointer")
            end_logits = tf.squeeze(conv2, -1)

            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]
            logits1, logits2 = self.logits

            # Outer product of start and end probabilities; the band keeps
            # only pairs with start <= end <= start + ans_limit.
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)

            # Best joint score per start position / per end position.
            reduced1 = tf.reduce_max(outer, axis=2)
            reduced2 = tf.reduce_max(outer, axis=1)

            # Pad the sequence axis (axis 1) up to MAX_PL, presumably so the
            # scores match the first dimension of self.weights1/self.weights2.
            # The amount must come from the sequence axis of `reduced1`, not
            # from its batch axis. It is clamped at zero so that an over-long
            # batch is truncated by the slice below instead of requesting a
            # negative padding.
            pad_len = tf.maximum(self.MAX_PL - tf.shape(reduced1)[1], 0)
            paddings = [[0, 0], [0, pad_len]]

            reduced1 = tf.pad(reduced1, paddings, "CONSTANT")
            reduced2 = tf.pad(reduced2, paddings, "CONSTANT")

            # The slice also pins the result to the size [N, MAX_PL].
            reduced1 = tf.slice(reduced1, [0, 0], [N, self.MAX_PL])
            reduced2 = tf.slice(reduced2, [0, 0], [N, self.MAX_PL])

            # no answer flag: (no_answer, answer_exist)
            # TODO add additional layer
            # TODO dimension between reduced and weight
            na_flag1 = tf.cast(
                tf.argmax(tf.matmul(reduced1, self.weights1), axis=1),
                tf.float32)
            na_flag2 = tf.cast(
                tf.argmax(tf.matmul(reduced2, self.weights2), axis=1),
                tf.float32)

            self.yp1 = tf.argmax(reduced1, axis=1)
            self.yp2 = tf.argmax(reduced2, axis=1)

            # Examples flagged as unanswerable use the flag-weighted score
            # mass as their loss; all others use the usual cross-entropy
            # against the gold start / end positions.
            has_no_answer = self.no_answer > 0
            losses = tf.where(
                has_no_answer,
                tf.multiply(na_flag1, tf.reduce_sum(reduced1, axis=1)),
                tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1,
                                                           labels=self.y1))
            losses2 = tf.where(
                has_no_answer,
                tf.multiply(na_flag2, tf.reduce_sum(reduced2, axis=1)),
                tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2,
                                                           labels=self.y2))

            self.loss = tf.reduce_mean(losses + losses2)

        # Optional L2 term built from the REGULARIZATION_LOSSES collection.
        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        # Optional exponential moving average of the trainable variables.
        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            # Evaluating self.loss now also triggers the EMA update.
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                # NOTE(review): the assign ops are created inside the same
                # control_dependencies block, so they presumably depend on
                # ema_op as well - confirm this is intended.
                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # average() yields None for variables without a shadow
                    # copy; compare explicitly instead of relying on the
                    # truthiness of a TensorFlow object.
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))
    def __init__(self, config, batch, word_mat=None, char_mat=None,
                 filter_sizes=None, embedding_size=None, num_filters=None,
                 trainable=True, l2_reg_lambda=0.0, keep_prob=0.9, graph=None):
        """Build a three-way classifier that combines two conv/max-pool branches.

        One branch pools over a residual-block encoder ("cnn_predict"), the
        other over a stacked GRU encoder ("encoding" + "attention"); both
        pooled feature vectors are concatenated and fed to a linear layer
        with three outputs.

        Args:
            config: settings object; the attributes read here are use_cudnn,
                batch_size, para_limit, ques_limit, test_para_limit,
                test_ques_limit, char_limit, hidden, char_dim, char_hidden,
                num_heads, keep_prob, init_lr and grad_clip.
            batch: iterator whose ``get_next()`` yields the input tensors
                (seven when ``trainable`` is true, five otherwise).
            word_mat: initial word embedding matrix.
            char_mat: initial character embedding matrix.
            filter_sizes: iterable of convolution filter heights.
            embedding_size: unused; kept for interface compatibility.
            num_filters: number of filters per filter size.
            trainable: when true, labels are read from ``batch`` and the
                loss, accuracy, optimizer, train op and saver are created.
            l2_reg_lambda: weight of the L2 penalty added to the loss.
            keep_prob: keep probability of the dropout on pooled features.
            graph: graph to remember on ``self.graph``; a new ``tf.Graph``
                is created when omitted.
        """
        self.config = config
        self.graph = graph if graph is not None else tf.Graph()
        self.trainable = trainable
        gru = cudnn_gru if config.use_cudnn else native_gru

        def conv_maxpool(inputs, pool_len, l2_acc):
            """Apply one conv + max-pool per filter size and concatenate.

            ``inputs`` is a rank-3 tensor; ``pool_len`` is the sequence
            length the pooling window is derived from; ``l2_acc`` is the
            running L2 penalty.  Returns the concatenated pooled tensor, its
            flattened form and the updated L2 penalty.
            """
            feature_dim = inputs.get_shape().as_list()[-1]
            expanded = tf.expand_dims(inputs, -1)
            pooled_outputs = []
            for filter_size in filter_sizes:
                with tf.name_scope("conv-maxpool-%s" % filter_size):
                    # The filter spans the whole feature axis.
                    filter_shape = [filter_size, feature_dim, 1, num_filters]
                    W = tf.Variable(
                        tf.truncated_normal(filter_shape, stddev=0.1),
                        name="W")
                    l2_acc += tf.nn.l2_loss(W)
                    b = tf.Variable(tf.constant(0.1, shape=[num_filters]),
                                    name="b")
                    l2_acc += tf.nn.l2_loss(b)
                    conv_output = tf.nn.conv2d(
                        expanded,
                        W,
                        strides=[1, 1, 1, 1],
                        padding="VALID",
                        name="conv")
                    h = tf.nn.relu(tf.nn.bias_add(conv_output, b), name="relu")
                    # Pool over every position the VALID convolution produced.
                    pooled = tf.nn.max_pool(
                        h,
                        ksize=[1, pool_len - filter_size + 1, 1, 1],
                        strides=[1, 1, 1, 1],
                        padding='VALID',
                        name="pool")
                    pooled_outputs.append(pooled)
            h_pool = tf.concat(pooled_outputs, 3)
            h_pool_flat = tf.reshape(
                h_pool, [-1, num_filters * len(filter_sizes)])
            return h_pool, h_pool_flat, l2_acc

        # A boolean mode flag carries no gradient, so it must not be part of
        # the trainable variable collection the optimizer iterates over.
        self.is_train = tf.get_variable("is_train", shape=[], dtype=tf.bool,
                                        trainable=False)
        # The training iterator additionally yields the labels and the
        # question id.
        if trainable:
            (self.input_x, self.input_x1, self.ch, self.qh, self.input_y,
             self.qa_id, self.alternatives_tokens) = batch.get_next()
        else:
            (self.input_x, self.input_x1, self.ch, self.qh,
             self.alternatives_tokens) = batch.get_next()
        self.dropout_keep_prob = keep_prob
        self.global_step = tf.get_variable(
            'global_step', shape=[], dtype=tf.int32,
            initializer=tf.constant_initializer(0), trainable=False)
        self.dropout = tf.placeholder_with_default(0.5, (), name="dropout")
        # Running L2 penalty over the conv and output weights.
        l2_loss = tf.constant(0.0)
        # Nonzero ids count as real tokens (the original comments state that
        # sequences are padded with 0).
        self.c_mask = tf.cast(self.input_x, tf.bool)
        self.q_mask = tf.cast(self.input_x1, tf.bool)
        # Number of real tokens per context / question.
        self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
        self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)
        # Number of real characters per word, flattened over all words.
        self.ch_len = tf.reshape(
            tf.reduce_sum(tf.cast(tf.cast(self.ch, tf.bool), tf.int32),
                          axis=2), [-1])
        self.qh_len = tf.reshape(
            tf.reduce_sum(tf.cast(tf.cast(self.qh, tf.bool), tf.int32),
                          axis=2), [-1])

        N, PL, QL, CL = (config.batch_size, config.para_limit,
                         config.ques_limit, config.char_limit)
        d, dc, dg, nh = (config.hidden, config.char_dim, config.char_hidden,
                         config.num_heads)

        with tf.variable_scope("Input_Embedding_Layer"):
            self.char_mat = tf.get_variable(
                "char_mat", initializer=tf.constant(char_mat, dtype=tf.float32),
                trainable=True)
            # One character sequence per word: [N * PL (or QL), CL, dc].
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # The same pair of GRU cells encodes context and question words.
            cell_fw = tf.contrib.rnn.GRUCell(dg)
            cell_bw = tf.contrib.rnn.GRUCell(dg)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
            ch_emb = tf.concat([state_fw, state_bw], axis=1)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
            qh_emb = tf.concat([state_fw, state_bw], axis=1)

            # The final forward/backward states act as the character-level
            # embedding of each word: [N, QL (or PL), 2 * dg].
            # (Original author's note: pinyin might serve as the
            # character-level signal for Chinese text - untested idea.)
            qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
            ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

        with tf.name_scope("embedding"):
            self.W = tf.get_variable(
                "word_mat", initializer=tf.constant(word_mat, dtype=tf.float32),
                trainable=True)
            if trainable:
                self.c_maxlen, self.q_maxlen = (config.para_limit,
                                                config.ques_limit)
            else:
                self.c_maxlen, self.q_maxlen = (config.test_para_limit,
                                                config.test_ques_limit)
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            self.embedded_chars1 = tf.nn.embedding_lookup(self.W,
                                                          self.input_x1)
            # Word-level and character-level features side by side.
            c_emb = tf.concat([self.embedded_chars, ch_emb], axis=2)
            q_emb = tf.concat([self.embedded_chars1, qh_emb], axis=2)

        with tf.variable_scope("cnn_predict"):
            # Highway projection to size d, shared between context and
            # question.
            c_emb = highway(c_emb, size=d, scope="highway",
                            dropout=self.dropout, reuse=None)
            q_emb = highway(q_emb, size=d, scope="highway",
                            dropout=self.dropout, reuse=True)
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(q_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.q_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.q_len,
                               scope="Encoder_Residual_Block",
                               reuse=True,  # Share the weights between passage and question
                               bias=False,
                               dropout=self.dropout)
            # Question-aware context representation (the original comments
            # map this to equations (4) and (3) of the reference paper).
            qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                                   keep_prob=config.keep_prob,
                                   is_train=self.is_train)
            rnn = gru(num_layers=1, num_units=d, batch_size=N,
                      input_size=qc_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob, is_train=self.is_train)
            att = rnn(qc_att, seq_len=self.c_len)
            # This branch derives the pooling window from the static
            # sequence length of `att`.
            _, self.h_pool_flat_cnn, l2_loss = conv_maxpool(
                att, att.get_shape().as_list()[1], l2_loss)

        with tf.variable_scope("encoding"):
            # Three-layer GRU encoder applied to both context and question.
            rnn = gru(num_layers=3, num_units=d, batch_size=N,
                      input_size=c_emb.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob, is_train=self.is_train)
            c = rnn(c_emb, seq_len=self.c_len)
            q = rnn(q_emb, seq_len=self.q_len)

        with tf.variable_scope("attention"):
            qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                                   keep_prob=config.keep_prob,
                                   is_train=self.is_train)
            rnn = gru(num_layers=1, num_units=d, batch_size=N,
                      input_size=qc_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob, is_train=self.is_train)
            att = rnn(qc_att, seq_len=self.c_len)

        # Second conv/max-pool branch; its pooling window is derived from
        # config.para_limit, as in the original code.
        self.h_pool, self.h_pool_flat, l2_loss = conv_maxpool(
            att, config.para_limit, l2_loss)
        num_filters_total = num_filters * len(filter_sizes)

        # Dropout on each branch, then concatenate the two feature vectors.
        with tf.name_scope("dropout"):
            self.h_drop_lstm = tf.nn.dropout(self.h_pool_flat,
                                             self.dropout_keep_prob)
            self.h_drop_cnn = tf.nn.dropout(self.h_pool_flat_cnn,
                                            self.dropout_keep_prob)
            self.h_drop = tf.concat([self.h_drop_lstm, self.h_drop_cnn],
                                    axis=-1)

        # Final (unnormalized) scores and predictions over three classes.
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total * 2, 3],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[3]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        if trainable:
            # Mean cross-entropy plus the weighted L2 penalty.
            with tf.name_scope("loss"):
                losses = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.scores, labels=self.input_y)
                self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
            with tf.name_scope("accuracy"):
                correct_predictions = tf.equal(self.predictions,
                                               tf.argmax(self.input_y, 1))
                self.accuracy = tf.reduce_mean(
                    tf.cast(correct_predictions, "float"), name="accuracy")
            # Logarithmic warm-up capped at config.init_lr.
            self.lr = tf.minimum(
                config.init_lr,
                0.001 / tf.log(999.) *
                tf.log(tf.cast(self.global_step, tf.float32) + 1))
            self.opt = tf.train.AdamOptimizer(learning_rate=self.lr,
                                              beta1=0.8, beta2=0.999,
                                              epsilon=1e-7)
            # Clip by global norm before applying the gradients.
            grads = self.opt.compute_gradients(self.loss)
            gradients, variables = zip(*grads)
            capped_grads, _ = tf.clip_by_global_norm(
                gradients, config.grad_clip)
            self.train_op = self.opt.apply_gradients(
                zip(capped_grads, variables), global_step=self.global_step)
            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
Exemplo n.º 11
0
    def forward(self):
        config = self.config
        N, PL, QL, CL, d, dc, nh, AL1,AL2,AL3= config.batch_size,self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.num_heads,self.aletr1_maxlen, \
                                               self.aletr2_maxlen,self.aletr3_maxlen

        with tf.variable_scope("Input_Embedding_Layer"):
            ch_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.ch),
                [N * PL, CL, dc])  #[一个句子共有多少单词,每个单词的字符个数,每一个字符的维度]
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            self.alternati_emb1 = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.alter1h),
                [N * AL1, CL, dc])  # (875, 25, 20)
            self.alternati_emb2 = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.alter2h),
                [N * AL2, CL, dc])  # (768, 16, 300)
            self.alternati_emb3 = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.alter3h),
                [N * AL3, CL, dc])  # (768, 16, 300)

            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)
            alternati_emb1 = tf.nn.dropout(self.alternati_emb1,
                                           1.0 - 0.5 * self.dropout)
            alternati_emb2 = tf.nn.dropout(self.alternati_emb2,
                                           1.0 - 0.5 * self.dropout)
            alternati_emb3 = tf.nn.dropout(self.alternati_emb3,
                                           1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder以下是得到卷积之后的特征输出
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)  #[batch,feature_len,d]
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)
            alternati_emb1 = conv(alternati_emb1,
                                  d,
                                  bias=True,
                                  activation=tf.nn.relu,
                                  kernel_size=5,
                                  name="char_conv",
                                  reuse=True)
            alternati_emb2 = conv(alternati_emb2,
                                  d,
                                  bias=True,
                                  activation=tf.nn.relu,
                                  kernel_size=5,
                                  name="char_conv",
                                  reuse=True)
            alternati_emb3 = conv(alternati_emb3,
                                  d,
                                  bias=True,
                                  activation=tf.nn.relu,
                                  kernel_size=5,
                                  name="char_conv",
                                  reuse=True)

            ch_emb = tf.reduce_max(
                ch_emb, axis=1)  #求出横向唯独的最大特征,这里可以用k_max尝试,而没有用max_pooling
            qh_emb = tf.reduce_max(qh_emb, axis=1)
            alternati_emb1 = tf.reduce_max(alternati_emb1, axis=1)
            alternati_emb2 = tf.reduce_max(alternati_emb2, axis=1)
            alternati_emb3 = tf.reduce_max(alternati_emb3, axis=1)

            ch_emb = tf.reshape(ch_emb,
                                [N, PL, ch_emb.shape[-1]])  #最终转变为句子长度对应的维度,
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])
            alternati_emb1 = tf.reshape(alternati_emb1,
                                        [N, AL1, qh_emb.shape[-1]])
            alternati_emb2 = tf.reshape(alternati_emb2,
                                        [N, AL2, qh_emb.shape[-1]])
            alternati_emb3 = tf.reshape(alternati_emb3,
                                        [N, AL3, qh_emb.shape[-1]])

            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)
            alter_embedding1 = tf.nn.embedding_lookup(self.word_mat,
                                                      self.alter1)  # 上下文
            alter_embedding2 = tf.nn.embedding_lookup(self.word_mat,
                                                      self.alter2)  # 上下文
            alter_embedding3 = tf.nn.embedding_lookup(self.word_mat,
                                                      self.alter3)  # 上下文

            c_emb = tf.concat(
                [c_emb, ch_emb],
                axis=2)  #把字符与对应的特征进行连接[batch,sequence_len,对应的输出维度]
            q_emb = tf.concat([q_emb, qh_emb], axis=2)
            alter_embedding1 = tf.concat([alter_embedding1, alternati_emb1],
                                         axis=2)
            alter_embedding2 = tf.concat([alter_embedding2, alternati_emb2],
                                         axis=2)
            alter_embedding3 = tf.concat([alter_embedding3, alternati_emb3],
                                         axis=2)

            c_emb = highway(
                c_emb,
                size=d,
                scope="highway",
                dropout=self.dropout,
                reuse=None)  #相当于对信息进行一次筛选,并且让表示的维度降低到75,[batch,sql_len,75]
            self.alter_embedding1 = c_emb

            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)
            alter_embedding1 = highway(alter_embedding1,
                                       size=d,
                                       scope="highway",
                                       dropout=self.dropout,
                                       reuse=True)
            alter_embedding2 = highway(alter_embedding2,
                                       size=d,
                                       scope="highway",
                                       dropout=self.dropout,
                                       reuse=True)
            alter_embedding3 = highway(alter_embedding3,
                                       size=d,
                                       scope="highway",
                                       dropout=self.dropout,
                                       reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)
            alter1 = residual_block(
                alter_embedding1,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.alter1_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.alterh1_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)
            alter2 = residual_block(
                alter_embedding2,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.alter2_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.alter2_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)
            alter3 = residual_block(
                alter_embedding3,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.alter3_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.alter3_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)
        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]
        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]

            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))
        with tf.variable_scope('question_rnn'):
            self.gru = tf.contrib.rnn.GRUCell(d)
            initstate = self.gru.zero_state(batch_size=N, dtype=tf.float32)
            output, state = tf.nn.dynamic_rnn(self.gru,
                                              q,
                                              initial_state=initstate)
            # self.qandc=tf.concat([self.q2c,self.c2q],axis=2)
            # self.qandc=dense(self.qandc,d)
            # output,state=tf.nn.dynamic_rnn(self.gru,self.qandc,initial_state=initstate)#(32,?,75)
            output1, state1 = tf.nn.dynamic_rnn(self.gru,
                                                alter1,
                                                initial_state=state)
            output2, state2 = tf.nn.dynamic_rnn(self.gru,
                                                alter2,
                                                initial_state=state)
            output3, state3 = tf.nn.dynamic_rnn(self.gru,
                                                alter3,
                                                initial_state=state)

            state = tf.expand_dims(state, axis=2)
            weight1 = tf.matmul(self.enc[1], state)
            weight2 = tf.matmul(self.enc[2], state)
            weight3 = tf.matmul(self.enc[3], state)

            weight_enc1 = tf.multiply(self.enc[1], weight1)
            weight_enc1 = tf.reduce_sum(weight_enc1, axis=1)

            weight_enc2 = tf.multiply(self.enc[2], weight2)
            weight_enc2 = tf.reduce_sum(weight_enc2, axis=1)

            weight_enc3 = tf.multiply(self.enc[3], weight3)
            weight_enc3 = tf.reduce_sum(weight_enc3, axis=1)

        with tf.variable_scope("Output_Layer"):
            # start_logits = tf.squeeze(
            #     conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1, bias=False, name="start_pointer"), -1)
            # end_logits = tf.squeeze(
            #     conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1, bias=False, name="end_pointer"), -1)
            # self.logits = [mask_logits(start_logits, mask=self.c_mask),
            #                mask_logits(end_logits, mask=self.c_mask)]
            #
            # logits1, logits2 = [l for l in self.logits]
            #
            # outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
            #                   tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            # outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            # self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            # self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

            similary1 = tf.expand_dims(self.cos_sine(weight_enc1, state1),
                                       axis=1)
            similary2 = tf.expand_dims(self.cos_sine(weight_enc2, state2),
                                       axis=1)
            similary3 = tf.expand_dims(self.cos_sine(weight_enc3, state3),
                                       axis=1)
            self.logits1 = tf.nn.softmax(
                tf.concat([similary1, similary2, similary3], axis=1))
            print(self.logits1, "lllllllllllllllllllllllllllllllllllll")
Exemplo n.º 12
0
    def _embed(self):
        """Create the embedding tables and the highway-projected inputs.

        Word table rows, in order: a frozen zero "pad" row, a trainable
        zero-initialised "unk" row, then the frozen regular rows (initialised
        from ``self.word_vocab.embeddings[2:]`` when embeddings are provided,
        otherwise from a random normal). The char table is a frozen zero pad
        row followed by trainable regular rows.

        Sets ``self.word_emb_mat``, ``self.char_emb_mat``, ``self.emb_cc``,
        ``self.emb_qc``, ``self.conv_emb_cc``, ``self.conv_emb_qc`` and the
        final ``self.emb_c`` / ``self.emb_q``.
        """
        word_vocab = self.word_vocab
        char_vocab = self.char_vocab
        word_dim = word_vocab.embed_dim
        char_dim = char_vocab.embed_dim

        # Tables, lookups and their dropout are pinned to the CPU device.
        with tf.device('/cpu:0'):
            # ---- word table: [pad | unk | regular rows] ----
            pad_word_row = tf.get_variable(
                'word_pad_embedding',
                shape=(1, word_dim),
                initializer=tf.zeros_initializer,
                trainable=False)
            unk_word_row = tf.get_variable(
                'word_unk_embedding',
                shape=(1, word_dim),
                initializer=tf.zeros_initializer,
                trainable=True)
            if word_vocab.embeddings is not None:
                # Rows 0 and 1 are covered by the pad/unk variables above.
                word_init = tf.constant_initializer(word_vocab.embeddings[2:])
            else:
                word_init = tf.random_normal_initializer()
            regular_word_rows = tf.get_variable(
                'normal_word_embeddings',
                shape=(word_vocab.size() - 2, word_dim),
                initializer=word_init,
                trainable=False)
            self.word_emb_mat = tf.concat(
                [pad_word_row, unk_word_row, regular_word_rows], 0)

            # ---- char table: [pad | regular rows] ----
            pad_char_row = tf.get_variable(
                'char_pad_embedding',
                shape=(1, char_dim),
                initializer=tf.zeros_initializer,
                trainable=False)
            if char_vocab.embeddings is not None:
                # Row 0 is covered by the pad variable above.
                char_init = tf.constant_initializer(char_vocab.embeddings[1:])
            else:
                char_init = tf.random_normal_initializer()
            regular_char_rows = tf.get_variable(
                'normal_char_embeddings',
                shape=(char_vocab.size() - 1, char_dim),
                initializer=char_init,
                trainable=True)
            self.char_emb_mat = tf.concat([pad_char_row, regular_char_rows], 0)

            # ---- lookups; char lookups use half the dropout rate ----
            looked_up = tf.nn.embedding_lookup(self.word_emb_mat, self.c)
            self.emb_c = tf.nn.dropout(looked_up, 1.0 - self.dropout)
            looked_up = tf.nn.embedding_lookup(self.word_emb_mat, self.q)
            self.emb_q = tf.nn.dropout(looked_up, 1.0 - self.dropout)

            looked_up = tf.nn.embedding_lookup(self.char_emb_mat, self.cc)
            self.emb_cc = tf.nn.dropout(looked_up, 1.0 - 0.5 * self.dropout)
            looked_up = tf.nn.embedding_lookup(self.char_emb_mat, self.qc)
            self.emb_qc = tf.nn.dropout(looked_up, 1.0 - 0.5 * self.dropout)

        # NOTE: the original author flagged that the paper seems to use a
        # different operation here (a char-level conv, currently disabled);
        # instead we max-reduce over axis 2 and apply a linear layer.
        # Presumably axis 2 is the per-word character axis -- TODO confirm
        # against the shape of self.cc / self.qc.
        pooled_cc = tf.reduce_max(self.emb_cc, 2)
        pooled_qc = tf.reduce_max(self.emb_qc, 2)
        self.conv_emb_cc = fc(pooled_cc, char_dim, activation_fn=None)
        self.conv_emb_qc = fc(pooled_qc, char_dim, activation_fn=None)

        # Word + char features go through a 2-layer highway network; the
        # second call reuses the variables created by the first.
        context_features = tf.concat([self.emb_c, self.conv_emb_cc], axis=2)
        self.emb_c = highway(context_features,
                             size=self.hidden_size,
                             dropout=self.dropout,
                             num_layers=2,
                             scope='highway',
                             reuse=None)
        question_features = tf.concat([self.emb_q, self.conv_emb_qc], axis=2)
        self.emb_q = highway(question_features,
                             size=self.hidden_size,
                             dropout=self.dropout,
                             num_layers=2,
                             scope='highway',
                             reuse=True)
Exemplo n.º 13
0
    def forward(self, trainable):
        """Build the encoder graph and a 3-way classification head.

        Pipeline (one variable scope per stage): input embedding (char conv +
        word lookup + highway), embedding encoder, context-to-query attention,
        a three-pass model encoder, a GRU over the encoded question, and a
        linear output layer whose scores are averaged over the three model
        encoder outputs.

        Args:
            trainable: when truthy, additionally builds the loss, accuracy,
                optional EMA ops, learning-rate schedule, optimizer, train op
                and saver.

        Side effects:
            Sets ``self.c2q``, ``self.q2c``, ``self.enc``, ``self.gru``,
            ``self.scores1..3``, ``self.scores`` and ``self.predictions``, and
            adds the output layer's L2 terms to ``self.l2_loss`` (which must
            already exist). With ``trainable`` it also sets ``self.loss``,
            ``self.accuracy``, ``self.lr``, ``self.opt``, ``self.train_op``,
            ``self.saver`` and, if ``config.decay`` is not None,
            ``self.var_ema`` and ``self.assign_vars``.
        """
        config = self.config
        N, PL, QL, CL, d, dc, nh = (config.batch_size, self.c_maxlen,
                                    self.q_maxlen, config.char_limit,
                                    config.hidden, config.char_dim,
                                    config.num_heads)

        with tf.variable_scope("Input_Embedding_Layer"):
            # [total number of words, chars per word, char embedding dim]
            ch_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.ch),
                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])

            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # BiDAF-style conv-highway encoder: the convolution below yields
            # the per-word character features ([batch, feature_len, d] per the
            # original note). The question branch reuses the passage weights.
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)

            # Keep the maximum feature along axis 1. The original author notes
            # that k-max pooling could be tried here instead.
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            # Fold the flattened word axis back into [N, sequence length, ...].
            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            # Concatenate word embeddings with their char features:
            # [batch, sequence_len, combined feature dim].
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            # The highway layer acts as a filter over the concatenated
            # features and projects them down to size d (the original note
            # mentions 75, i.e. [batch, seq_len, 75] -- depends on config).
            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            # Softmax over the last axis with the question mask, and over
            # axis 1 with the passage mask (then transposed).
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]

            # Three passes through the same 7-block stack; passes after the
            # first reuse its variables.
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    # Replaces enc[i] in place, so later readers of enc[0] and
                    # enc[2] see the dropped-out tensors.
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope('question_rnn'):
            self.gru = tf.contrib.rnn.GRUCell(d)
            initstate = self.gru.zero_state(batch_size=N, dtype=tf.float32)
            output, state = tf.nn.dynamic_rnn(self.gru,
                                              q,
                                              initial_state=initstate)

            # Score every position of each model-encoder output against the
            # final GRU state, then use the raw (un-normalised) scores to
            # weight the positions and sum over axis 1.
            # Assumes enc[k] is [N, PL, d] and state is [N, d] -- TODO confirm.
            state = tf.expand_dims(state, axis=2)
            weight1 = tf.matmul(self.enc[1], state)
            weight2 = tf.matmul(self.enc[2], state)
            weight3 = tf.matmul(self.enc[3], state)

            weight_enc1 = tf.multiply(self.enc[1], weight1)
            weight_enc1 = tf.reduce_sum(weight_enc1, axis=1)

            weight_enc2 = tf.multiply(self.enc[2], weight2)
            weight_enc2 = tf.reduce_sum(weight_enc2, axis=1)

            weight_enc3 = tf.multiply(self.enc[3], weight3)
            weight_enc3 = tf.reduce_sum(weight_enc3, axis=1)

        with tf.variable_scope("Output_Layer"):
            inputs_shape = weight_enc1.get_shape().as_list()
            # One linear layer (3 output classes) shared by all three inputs.
            W = tf.get_variable(
                "W",
                shape=[inputs_shape[-1], 3],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[3]), name="b")
            self.l2_loss += tf.nn.l2_loss(W)
            self.l2_loss += tf.nn.l2_loss(b)
            self.scores1 = tf.nn.xw_plus_b(weight_enc1, W, b, name="scores")
            self.scores2 = tf.nn.xw_plus_b(weight_enc2, W, b, name="scores")
            self.scores3 = tf.nn.xw_plus_b(weight_enc3, W, b, name="scores")
            self.scores = (self.scores1 + self.scores2 + self.scores3) / 3.0
            self.predictions = tf.argmax(self.scores, 1, name="predictions")
            if trainable:
                with tf.name_scope("loss"):
                    losses = tf.nn.softmax_cross_entropy_with_logits(
                        logits=self.scores, labels=self.input_y)
                    self.loss = tf.reduce_mean(
                        losses) + self.l2_reg_lambda * self.l2_loss
                with tf.name_scope("accuracy"):
                    correct_predictions = tf.equal(self.predictions,
                                                   tf.argmax(self.input_y, 1))
                    self.accuracy = tf.reduce_mean(tf.cast(
                        correct_predictions, "float"),
                                                   name="accuracy")
                if config.decay is not None:
                    self.var_ema = tf.train.ExponentialMovingAverage(
                        config.decay)
                    ema_op = self.var_ema.apply(tf.trainable_variables())
                    # Evaluating the loss now also triggers the EMA update.
                    with tf.control_dependencies([ema_op]):
                        self.loss = tf.identity(self.loss)

                        self.assign_vars = []
                        for var in tf.global_variables():
                            v = self.var_ema.average(var)
                            # average() returns None for variables that are
                            # not tracked. Compare against None explicitly:
                            # truth-testing a variable is not reliable (it
                            # raises for resource variables in graph mode).
                            if v is not None:
                                self.assign_vars.append(tf.assign(var, v))
                # Logarithmic warm-up, capped at config.init_lr.
                self.lr = tf.minimum(
                    config.init_lr, 0.001 / tf.log(999.) *
                    tf.log(tf.cast(self.global_step, tf.float32) + 1))
                self.opt = tf.train.AdamOptimizer(learning_rate=self.lr,
                                                  beta1=0.8,
                                                  beta2=0.999,
                                                  epsilon=1e-7)
                grads = self.opt.compute_gradients(self.loss)
                gradients, variables = zip(*grads)
                capped_grads, _ = tf.clip_by_global_norm(
                    gradients, config.grad_clip)
                self.train_op = self.opt.apply_gradients(
                    zip(capped_grads, variables), global_step=self.global_step)
                self.saver = tf.train.Saver(tf.global_variables(),
                                            max_to_keep=3)
Exemplo n.º 14
0
    def build_model(self):
        """Assemble the span-prediction graph, its loss and the EMA ops.

        Stages, each in its own variable scope: input embedding (char conv +
        word lookup + optional ELMo features + highway), embedding encoder,
        context-to-query attention, a three-pass model encoder and the output
        layer.

        Sets ``self.logits1`` / ``self.logits2`` (start / end logits with a
        learned "unanswerable" bias prepended as column 0), ``self.loss``,
        ``self.output1`` / ``self.output2`` (argmax indices shifted by -1, so
        the bias column maps to -1) and, when ``self.decay`` is not None,
        ``self.var_ema`` and ``self.assign_vars``.
        """
        passage_len = self.c_maxlen
        question_len = self.q_maxlen
        char_limit = self.char_limit
        filters = self.filters
        char_dim = self.char_dim
        heads = self.num_heads

        with tf.variable_scope("Input_Embedding_Layer"):
            # Flatten all words so the char conv runs once per word.
            ctx_chars = tf.nn.embedding_lookup(self.char_mat, self.contc_input)
            ctx_chars = tf.reshape(ctx_chars, [-1, char_limit, char_dim])
            qry_chars = tf.nn.embedding_lookup(self.char_mat, self.quesc_input)
            qry_chars = tf.reshape(qry_chars, [-1, char_limit, char_dim])
            ctx_chars = tf.nn.dropout(ctx_chars, 1.0 - 0.5 * self.dropout)
            qry_chars = tf.nn.dropout(qry_chars, 1.0 - 0.5 * self.dropout)

            # BiDAF-style char conv; the question branch reuses the weights.
            char_conv_kwargs = dict(bias=True,
                                    activation=tf.nn.relu,
                                    kernel_size=5,
                                    name="char_conv")
            ctx_chars = conv(ctx_chars, filters, reuse=None,
                             **char_conv_kwargs)
            qry_chars = conv(qry_chars, filters, reuse=True,
                             **char_conv_kwargs)

            # Max over axis 1, then fold the word axis back per sequence.
            ctx_chars = tf.reduce_max(ctx_chars, axis=1)
            qry_chars = tf.reduce_max(qry_chars, axis=1)
            ctx_chars = tf.reshape(ctx_chars,
                                   [-1, passage_len, ctx_chars.shape[-1]])
            qry_chars = tf.reshape(qry_chars,
                                   [-1, question_len, ctx_chars.shape[-1]])

            ctx_words = tf.nn.embedding_lookup(self.word_mat,
                                               self.contw_input)
            ctx_words = tf.nn.dropout(ctx_words, 1.0 - self.dropout)
            qry_words = tf.nn.embedding_lookup(self.word_mat,
                                               self.quesw_input)
            qry_words = tf.nn.dropout(qry_words, 1.0 - self.dropout)

            ctx_feats = tf.concat([ctx_words, ctx_chars], axis=2)
            qry_feats = tf.concat([qry_words, qry_chars], axis=2)

            if self.use_elmo:
                ctx_feats = tf.concat([ctx_feats, self.cont_elmo], axis=-1)
                qry_feats = tf.concat([qry_feats, self.ques_elmo], axis=-1)

            highway_kwargs = dict(size=filters,
                                  scope="highway",
                                  dropout=self.dropout)
            ctx_feats = highway(ctx_feats, reuse=None, **highway_kwargs)
            qry_feats = highway(qry_feats, reuse=True, **highway_kwargs)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            encoder_kwargs = dict(num_blocks=1,
                                  num_conv_layers=4,
                                  kernel_size=7,
                                  num_filters=filters,
                                  num_heads=heads,
                                  scope="Encoder_Residual_Block",
                                  bias=False,
                                  dropout=self.dropout)
            # The first call leaves `reuse` at residual_block's default; the
            # second shares the variables it created.
            context = residual_block(ctx_feats,
                                     mask=self.c_mask,
                                     seq_len=self.cont_len,
                                     **encoder_kwargs)
            question = residual_block(qry_feats,
                                      mask=self.q_mask,
                                      seq_len=self.ques_len,
                                      reuse=True,
                                      **encoder_kwargs)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            similarity = optimized_trilinear_for_attention(
                [context, question],
                self.c_maxlen,
                self.q_maxlen,
                input_keep_prob=1.0 - self.dropout)
            question_mask = tf.expand_dims(self.q_mask, 1)
            over_question = tf.nn.softmax(
                mask_logits(similarity, mask=question_mask))
            context_mask = tf.expand_dims(self.c_mask, 2)
            over_context = tf.nn.softmax(
                mask_logits(similarity, mask=context_mask), dim=1)
            over_context_t = tf.transpose(over_context, (0, 2, 1))
            c2q = tf.matmul(over_question, question)
            q2c = tf.matmul(tf.matmul(over_question, over_context_t), context)
            attention_outputs = [context, c2q, context * c2q, context * q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            attention_inputs = tf.concat(attention_outputs, axis=-1)
            enc = [conv(attention_inputs, filters, name="input_projection")]
            for step in range(3):
                apply_dropout = (step % 2 == 0)  # dropout every 2 blocks
                if apply_dropout:
                    # In-place replacement: enc[0] and enc[2] hold the
                    # dropped-out tensors from here on.
                    enc[step] = tf.nn.dropout(enc[step], 1.0 - self.dropout)
                stacked = residual_block(enc[step],
                                         num_blocks=7,
                                         num_conv_layers=2,
                                         kernel_size=5,
                                         mask=self.c_mask,
                                         num_filters=filters,
                                         num_heads=heads,
                                         seq_len=self.cont_len,
                                         scope="Model_Encoder",
                                         bias=False,
                                         reuse=True if step > 0 else None,
                                         dropout=self.dropout)
                enc.append(stacked)

        with tf.variable_scope("Output_Layer"):
            start_feats = tf.concat([enc[1], enc[2]], axis=-1)
            end_feats = tf.concat([enc[1], enc[3]], axis=-1)
            if self.use_elmo:
                start_feats = tf.concat((start_feats, self.cont_elmo),
                                        axis=-1)
                end_feats = tf.concat((end_feats, self.cont_elmo), axis=-1)

            start_scores = conv(start_feats, 1, bias=False,
                                name="start_pointer")
            start_scores = tf.squeeze(start_scores, -1)
            end_scores = conv(end_feats, 1, bias=False, name="end_pointer")
            end_scores = tf.squeeze(end_scores, -1)

            # A single learned scalar, tiled to one value per example and
            # prepended as column 0 of both logit tensors.
            bias_var = tf.get_variable(
                "unanswer_bias", [1],
                regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
                initializer=tf.zeros_initializer())
            bias_col = tf.tile(bias_var, [self.batch_size])
            bias_col = tf.reshape(bias_col, [-1, 1])

            masked_start = mask_logits(start_scores, mask=self.c_mask)
            self.logits1 = tf.concat((bias_col, masked_start), axis=-1)
            masked_end = mask_logits(end_scores, mask=self.c_mask)
            self.logits2 = tf.concat((bias_col, masked_end), axis=-1)

            start_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits1, labels=self.y_start)
            end_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits2, labels=self.y_end)
            self.loss = tf.reduce_mean(start_loss + end_loss)
            if self.l2_norm is not None:
                # `regularizer` is not defined in this method; it has to come
                # from the enclosing module.
                reg_terms = tf.get_collection(
                    tf.GraphKeys.REGULARIZATION_LOSSES)
                self.loss += tf.contrib.layers.apply_regularization(
                    regularizer, reg_terms)

            # Joint start/end probabilities, restricted to the band
            # 0 <= end - start <= ans_limit; the -1 undoes the bias column.
            start_probs = tf.expand_dims(tf.nn.softmax(self.logits1), axis=2)
            end_probs = tf.expand_dims(tf.nn.softmax(self.logits2), axis=1)
            joint = tf.matmul(start_probs, end_probs)
            joint = tf.matrix_band_part(joint, 0, self.ans_limit)
            self.output1 = tf.argmax(tf.reduce_max(joint, axis=2), axis=1) - 1
            self.output2 = tf.argmax(tf.reduce_max(joint, axis=1), axis=1) - 1

            if self.decay is not None:
                self.var_ema = tf.train.ExponentialMovingAverage(self.decay)
                ema_update = self.var_ema.apply(tf.trainable_variables())
                # Evaluating the loss now also triggers the EMA update.
                with tf.control_dependencies([ema_update]):
                    self.loss = tf.identity(self.loss)
                    self.assign_vars = []
                    for variable in tf.global_variables():
                        averaged = self.var_ema.average(variable)
                        if averaged is None:
                            continue
                        self.assign_vars.append(tf.assign(variable, averaged))
Exemplo n.º 15
0
    def forward(self):
        config = self.config
        N = config.batch_size if not self.demo else 1
        PL = self.c_maxlen
        QL = self.q_maxlen
        CL = config.char_limit  # 16
        d = config.hidden  # 96
        dc = config.char_dim  # 64
        nh = config.num_heads  # 1

        with tf.variable_scope("Input_Embedding_Layer"):
            '''
                self.ch : (N, c_maxlen, 16)
                self.qh : (N, q_maxlen, 16)
            '''
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])  # (N*c_maxlen, 16, 64)
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])  # (N*q_maxlen, 16, 64)
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # BiDAF style conv-highway encoder: conv over chars in each word in a batch of passages
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)  # (N*c_maxlen, 16-5+1, 96)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)  # (N*q_maxlen, 16-5+1, 96)

            ch_emb = tf.reduce_max(ch_emb, axis=1)  # (N*c_maxlen, 96)
            qh_emb = tf.reduce_max(qh_emb, axis=1)  # (N*q_maxlen, 96)

            ch_emb = tf.reshape(ch_emb,
                                [N, PL, ch_emb.shape[-1]])  # (N, c_maxlen, 96)
            qh_emb = tf.reshape(qh_emb,
                                [N, QL, ch_emb.shape[-1]])  # (N, q_maxlen, 96)
            '''
                self.c : (N, c_maxlen)
                self.q : (N, q_maxlen)
            '''
            c_emb = tf.nn.dropout(tf.nn.embedding_lookup(
                self.word_mat, self.c),
                                  1.0 - self.dropout)  # (N, c_maxlen, 300)
            q_emb = tf.nn.dropout(tf.nn.embedding_lookup(
                self.word_mat, self.q),
                                  1.0 - self.dropout)  # (N, q_maxlen, 300)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)  # (N, c_maxlen, 396)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)  # (N, q_maxlen, 396)

            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)  # (N, c_maxlen, 96)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)  # (N, q_maxlen, 96)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            '''
                -> positional encoding 
                -> layer_normalization 
                -> depth-wise separable convolution 
                -> self attention 
                -> feed forward network
                In the paper: The total number of encoder blocks is 1
            '''
            # (N, c_maxlen, 96)
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            # (N, q_maxlen, 96)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            '''
                tf.tile(input, multiples, name=None): creates a new tensor by replicating input multiples times. 
                    The output tensor's i'th dimension has input.dims(i) * multiples[i] elements, 
                    and the values of input are replicated multiples[i] times along the 'i'th dimension.
                Paper: The layer parameters are the same as the Embedding Encoder Layer 
                       except that convolution layer number is 2 within a block 
                       and the total number of blocks is 7
            '''
            '''
                c:        (N, c_maxlen, d)
                q:        (N, q_maxlen, d)
                ch_emb:   (N, c_maxlen, d)
                qh_emb:   (N, q_maxlen, d)
                C:        (N, c_maxlen, q_maxlen, d)
                Q:        (N, c_maxlen, q_maxlen, d)
                S:        (N, c_maxlen, q_maxlen)
                mask_q:   (N, 1, q_maxlen)
                mask_c:   (N, c_maxlen, 1)
                S_:       (N, c_maxlen, q_maxlen)
                S_T:      (N, q_maxlen, c_maxlen)
                self.c2q: (N, c_maxlen, d) = tf.matmul(S_, q)
                self.q2c: (N, c_maxlen, d) = tf.matmul(tf.matmul(S_, S_T), c)
            '''
            C = tf.tile(tf.expand_dims(c, 2), [1, 1, self.q_maxlen, 1])
            Q = tf.tile(tf.expand_dims(q, 1), [1, self.c_maxlen, 1, 1])
            S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)

            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))

            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)

            attention_outputs = [c, self.c2q, c * self.c2q]
            if config.q2c:
                attention_outputs.append(c * self.q2c)

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d,
                             name="input_projection")]  # d=hidden=96
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope("Output_Layer"):
            '''
                tf.matrix_band_part: Copy a tensor setting everything outside a central band 
                                     in each innermost matrix to zero.
                self.enc[i]:  (N, c_maxlen, d)
                start_logits: (N, c_maxlen)
                end_logits:   (N, c_maxlen)
                logits1:      (N, c_maxlen)
                logits2:      (N, c_maxlen)
                outer:        (N, c_maxlen, c_maxlen)
                yp1, yp2, losses, losses2: (N,)  
            '''
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)
            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = [l for l in self.logits]

            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

            # find max-score span
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, 15)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

            #DEBUG
            self.debug_ops.extend([
                self.enc[1], start_logits, end_logits, logits1, logits2, outer,
                self.yp1, self.yp2, losses, losses2, self.loss
            ])

        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                self.shadow_vars = []
                self.global_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    if v:
                        self.shadow_vars.append(v)
                        self.global_vars.append(var)
                self.assign_vars = []
                for g, v in zip(self.global_vars, self.shadow_vars):
                    self.assign_vars.append(tf.assign(g, v))
Exemplo n.º 16
0
    def build_model(self):
        """Build the span-prediction graph, its losses and the EMA assign ops.

        The graph is assembled scope by scope:

        * ``Input_Embedding_Layer``: char-conv + word embeddings (optionally
          concatenated with ``self.cont_elmo`` / ``self.ques_elmo``) passed
          through a shared highway network.
        * ``Embedding_Encoder_Layer``: one residual block, weights shared
          between context and question.
        * ``Context_to_Query_Attention_Layer``: trilinear similarity and the
          context-to-query / query-to-context attention products.
        * ``Model_Encoder_Layer``: three passes through a shared stack of
          seven residual blocks.
        * ``Output_Layer``: start/end logits, cross-entropy loss and the
          arg-max span limited to ``self.ans_limit``.
        * ``Topk_Layer`` (only when ``self.use_topk``): re-scores the top-3
          candidate spans and mixes that loss into ``self.loss``.

        Attributes set: ``self.logits1``, ``self.logits2``, ``self.loss``,
        ``self.output1``, ``self.output2``; with ``use_topk`` also
        ``self.yp1``, ``self.yp2``, ``self.yp3``, ``self.y_mask``,
        ``self.y_d``, ``self.Lambda1`` (and ``self.Lambda2`` when
        ``self.diversity_loss``); with ``self.decay`` also ``self.var_ema``
        and ``self.assign_vars``.

        Raises:
            ValueError: if ``self.use_topk`` is set and ``self.topk_loss`` is
                neither ``'f1'`` nor ``'em'``.
        """
        PL, QL, CL, d, dc, nh = (self.c_maxlen, self.q_maxlen, self.char_limit,
                                 self.filters, self.char_dim, self.num_heads)

        with tf.variable_scope("Input_Embedding_Layer"):
            # Flatten (batch, seq) into one axis so the conv runs over the
            # characters of each word independently.
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.contc_input), [-1, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.quesc_input), [-1, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder (weights shared via reuse=True)
            ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5, name="char_conv", reuse=None)
            qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5, name="char_conv", reuse=True)

            # Max-pool over the character axis, then restore the sequence axis.
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            ch_emb = tf.reshape(ch_emb, [-1, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [-1, QL, ch_emb.shape[-1]])

            c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.contw_input), 1.0 - self.dropout)
            q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.quesw_input), 1.0 - self.dropout)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            if self.use_elmo:
                c_emb = tf.concat([c_emb, self.cont_elmo], axis=-1)
                q_emb = tf.concat([q_emb, self.ques_elmo], axis=-1)

            c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout, reuse=None)
            q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout, reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.cont_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            # Same scope with reuse=True: the question shares the context's weights.
            q = residual_block(q_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.q_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.ques_len,
                               scope="Encoder_Residual_Block",
                               reuse=True,
                               bias=False,
                               dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                                  input_keep_prob=1.0 - self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            # `axis` replaces the deprecated `dim` keyword; the rest of this
            # method already uses `axis` with tf.nn.softmax.
            S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
            c2q = tf.matmul(S_, q)
            q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, c2q, c * c2q, c * q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            attention_inputs = tf.concat(attention_outputs, axis=-1)
            enc = [conv(attention_inputs, d, name="input_projection")]
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    enc[i] = tf.nn.dropout(enc[i], 1.0 - self.dropout)
                enc.append(residual_block(enc[i],
                                          num_blocks=7,
                                          num_conv_layers=2,
                                          kernel_size=5,
                                          mask=self.c_mask,
                                          num_filters=d,
                                          num_heads=nh,
                                          seq_len=self.cont_len,
                                          scope="Model_Encoder",
                                          bias=False,
                                          reuse=True if i > 0 else None,
                                          dropout=self.dropout))

        with tf.variable_scope("Output_Layer"):
            start_logits = tf.concat([enc[1], enc[2]], axis=-1)
            end_logits = tf.concat([enc[1], enc[3]], axis=-1)
            if self.use_elmo:
                start_logits = tf.concat((start_logits, self.cont_elmo), axis=-1)
                end_logits = tf.concat((end_logits, self.cont_elmo), axis=-1)
            start_logits = tf.squeeze(conv(start_logits, 1, bias=False, name="start_pointer"), -1)
            end_logits = tf.squeeze(conv(end_logits, 1, bias=False, name="end_pointer"), -1)
            # 2.0 Dataset
            # unanswer_bias = tf.get_variable("unanswer_bias", [1],
            #                                 regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
            #                                 initializer=tf.zeros_initializer())
            # unanswer_bias = tf.reshape(tf.tile(unanswer_bias, [self.batch_size]), [-1, 1])
            # self.logits1 = tf.concat((unanswer_bias, mask_logits(start_logits, mask=self.c_mask)), axis=-1)
            # self.logits2 = tf.concat((unanswer_bias, mask_logits(end_logits, mask=self.c_mask)), axis=-1)
            self.logits1 = mask_logits(start_logits, mask=self.c_mask)
            self.logits2 = mask_logits(end_logits, mask=self.c_mask)
            start_loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits1, labels=self.y_start)
            end_loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits2, labels=self.y_end)
            self.loss = tf.reduce_mean(start_loss + end_loss)

            # output: outer[b, i, j] = p_start[b, i] * p_end[b, j], restricted
            # to 0 <= j - i <= ans_limit by the band mask.
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, self.ans_limit)
            self.output1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.output2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

        if self.use_topk:
            with tf.variable_scope("Topk_Layer"):
                top_size = 3
                # Flatten the (start, end) score matrix so top_k ranks spans;
                # row index = start position, column index = end position.
                outer = tf.reshape(outer, [self.batch_size, -1])
                outer_inds = tf.nn.top_k(outer, top_size).indices  # [N,top_size]
                self.yp1 = outer_inds // tf.shape(self.logits1)[-1]
                self.yp2 = outer_inds % tf.shape(self.logits2)[-1]

                def sen_mask(tensor):
                    """Map each (start, end_exclusive) pair to a 0/1 row mask."""
                    def sen_mask_(a, b, filters):
                        # Rows [a, b) are ones, everything else zeros.
                        try:
                            mata = tf.zeros([a, filters], tf.int32)
                        except (TypeError, ValueError):
                            # Narrowed from a bare `except:` so interrupts and
                            # unrelated errors are no longer swallowed.
                            mata = []
                        matb = tf.ones([b - a, filters], tf.int32)
                        matc = tf.zeros([tf.shape(self.logits1)[-1] - b, filters], tf.int32)
                        mat = tf.concat((mata, matb, matc), axis=0)
                        return mat

                    return tf.map_fn(lambda x: sen_mask_(x[0], x[1], self.filters), tensor)

                self.yp3 = self.yp2 + 1  # exclusive end index of each span
                self.yp1 = tf.expand_dims(self.yp1, -1)
                self.yp2 = tf.expand_dims(self.yp2, -1)
                self.yp3 = tf.expand_dims(self.yp3, -1)
                self.y_mask = tf.concat([self.yp1, self.yp3], axis=-1)
                self.y_mask = tf.map_fn(sen_mask, self.y_mask)

                # answer
                # Named separately so the encoded context `c` is not shadowed.
                c2q_tiled = tf.tile(tf.expand_dims(c2q, 1), [1, top_size, 1, 1])
                c_topk = tf.multiply(tf.cast(self.y_mask, tf.float32), c2q_tiled)
                W1 = tf.get_variable("W1", initializer=tf.ones([1, 1, 1, self.filters]))
                W1 = tf.tile(W1, [self.batch_size, top_size, 1, 1])
                # NOTE(review): the matmul result is [N, top_size, 1, positions],
                # so axis=2 has size 1 and this softmax yields all ones (sum
                # pooling). Presumably axis=-1 was intended; left unchanged so
                # the numerics of existing models do not shift - TODO confirm.
                alpha1 = tf.nn.softmax(tf.matmul(W1, c_topk, transpose_b=True), axis=2)
                answer = tf.matmul(alpha1, c_topk)  # [32,top_size,1,128]

                # question
                W2 = tf.get_variable("W2", initializer=tf.ones([1, 1, self.filters]))
                W2 = tf.tile(W2, [self.batch_size, 1, 1])
                # NOTE(review): same concern as alpha1 - the softmax axis (1)
                # appears to be the size-1 axis of the matmul result; confirm.
                alpha2 = tf.nn.softmax(tf.matmul(W2, q, transpose_b=True), axis=1)
                ques = tf.matmul(alpha2, q)
                ques = tf.tile(tf.expand_dims(ques, 1), [1, top_size, 1, 1])  # [32,top_size,1,128]

                # question & answer
                W3 = tf.get_variable("W3", initializer=tf.ones([1, 1, self.filters, self.filters]))
                W3 = tf.tile(W3, [self.batch_size, top_size, 1, 1])
                y_topk_logits = tf.nn.sigmoid(tf.matmul(ques, tf.matmul(W3, answer, transpose_b=True))) # [32,top_size,1,1]
                # Squeeze only the two trailing singleton axes; an axis-less
                # squeeze would also drop the batch axis when batch_size == 1.
                y_topk_logits = tf.squeeze(y_topk_logits, axis=[2, 3])  # [32,top_size]

                # Undo the expand_dims above without touching the batch axis.
                self.yp1 = tf.squeeze(self.yp1, axis=-1)
                self.yp2 = tf.squeeze(self.yp2, axis=-1)
                coeff1_topk = tf.one_hot(self.yp1, self.c_maxlen, axis=-1) # [32,top_size,400] one-hot
                coeff2_topk = tf.one_hot(self.yp2, self.c_maxlen, axis=-1)
                # [0,1,0,0,0][0,0,0,1,0]->[0,1,1,1,1]-[0,0,0,1,1]->[0,1,1,0,0]+[0,0,0,1,0]->[0,1,1,1,0]
                coeff1_topk_cumsum = tf.cumsum(coeff1_topk, axis=-1)
                coeff2_topk_cumsum = tf.cumsum(coeff2_topk, axis=-1)
                self.y_d = coeff1_topk_cumsum - coeff2_topk_cumsum + coeff2_topk # [32, top_size, 400]

                def clip_for_sigmoid(output):
                    """Convert probabilities back to logits (clipped inverse sigmoid)."""
                    _epsilon = tf.convert_to_tensor(1e-7, dtype=output.dtype.base_dtype)
                    output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
                    output = tf.log(output / (1 - output))
                    return output

                if self.topk_loss == 'f1':
                    # f1 loss
                    y_start_ind = tf.cumsum(self.y_start, axis=-1)
                    y_end_ind = tf.cumsum(self.y_end, axis=-1)
                    y_gtd = y_start_ind - y_end_ind + self.y_end # [32, 400]
                    # Token overlap between each candidate span and the gold
                    # span. Broadcasting over the top_size axis replaces the
                    # former nested tf.map_fn and gives the same counts.
                    overlap = tf.logical_and(
                        tf.cast(self.y_d, tf.bool),
                        tf.cast(tf.expand_dims(y_gtd, axis=1), tf.bool))
                    num_same = tf.reduce_sum(tf.cast(overlap, tf.float32), axis=-1)  # [32, top_size]
                    y_precision = num_same / (tf.cast(tf.reduce_sum(self.y_d, axis=-1),tf.float32) + 1e-8) # [32, top_size]
                    y_recall = num_same / tf.expand_dims(tf.cast(tf.reduce_sum(y_gtd, axis=-1),tf.float32) + 1e-8, axis=-1) # [32, top_size]
                    y_f1 = (2.0 * y_precision * y_recall) / (tf.cast(y_precision + y_recall,tf.float32) + 1e-8) # [32, top_size]
                    topk_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=clip_for_sigmoid(y_topk_logits), labels=y_f1))

                elif self.topk_loss == 'em':
                    # em loss
                    start_em = tf.equal(tf.cast(tf.expand_dims(tf.argmax(self.y_start, axis=-1), axis=1), tf.int32),
                                        tf.cast(self.yp1, tf.int32))  # [32, top_size]
                    end_em = tf.equal(tf.cast(tf.expand_dims(tf.argmax(self.y_end, axis=-1), axis=1), tf.int32),
                                      tf.cast(self.yp2, tf.int32))  # [32, top_size]
                    y_em = tf.cast(tf.logical_and(start_em, end_em), tf.float32) # [32, top_size]
                    topk_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=clip_for_sigmoid(y_topk_logits), labels=y_em))

                else:
                    # Previously fell through and failed below with an
                    # unbound `topk_loss`; fail early with a clear message.
                    raise ValueError(
                        "Unsupported topk_loss: {!r} (expected 'f1' or 'em')".format(self.topk_loss))

                # final loss
                self.Lambda1 = tf.get_variable("Lambda1", initializer=tf.constant([0.9]), trainable=False)
                self.loss = tf.reduce_mean(self.Lambda1 * (start_loss + end_loss) + (1 - self.Lambda1) * topk_loss)

                # output (recomputed because `outer` was flattened above)
                outer_topk = tf.matmul(tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
                                  tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))
                outer_topk = tf.matrix_band_part(outer_topk, 0, self.ans_limit)
                self.output1 = tf.argmax(tf.reduce_max(outer_topk, axis=2), axis=1)
                self.output2 = tf.argmax(tf.reduce_max(outer_topk, axis=1), axis=1)

                # diversity loss
                if self.diversity_loss:
                    self.Lambda2 = tf.get_variable("Lambda2", initializer=tf.constant([0.1]),trainable=False)
                    diversity_loss = tf.reduce_mean(tf.reduce_prod(self.y_d, axis=1),axis=-1) # [32,top_size,400]->[32,400]->[32,]
                    self.loss = self.loss + tf.reduce_mean(self.Lambda2 * diversity_loss)


        if self.l2_norm is not None:
            # `regularizer` is not defined in this method; it is expected to
            # exist at module level.
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
            self.loss += l2_loss

        if self.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(self.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            # Evaluating self.loss now also triggers the EMA update.
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)
                # Ops that copy each shadow (averaged) value back onto its variable.
                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))
Exemplo n.º 17
0
    def BuildModel(self):
        """Create the placeholders, matching network, loss and train op.

        Each context utterance is matched against the response twice: once on
        the highway-projected word embeddings and once on the GRU states
        (through the bilinear matrix ``A_matrix_v``). The two similarity maps
        go through conv -> max-pool -> dense to give one matching vector per
        utterance; a final GRU accumulates those vectors and a dense layer
        produces two-class logits.

        Attributes set: the input placeholders, ``self.embedding_init``,
        ``self.char_embeddings_init``, ``self.dropout``,
        ``self.response_embedding_save``, ``self.y_pred``,
        ``self.total_loss`` and ``self.train_op``.
        """
        print("preprocessing build model....")

        # ---- word-level inputs -------------------------------------------
        self.utterance_ph = tf.placeholder(
            tf.int32,
            shape=(None, self.max_num_utterance, self.max_sentence_len))
        self.response_ph = tf.placeholder(
            tf.int32, shape=(None, self.max_sentence_len))
        self.y_true = tf.placeholder(tf.int32, shape=(None, ))
        self.embedding_ph = tf.placeholder(
            tf.float32, shape=(self.total_words, self.word_embedding_size))
        self.response_len = tf.placeholder(tf.int32, shape=(None, ))
        self.all_utterance_len_ph = tf.placeholder(
            tf.int32, shape=(None, self.max_num_utterance))

        # Frozen embedding table, filled at run time through embedding_init.
        word_table = tf.get_variable(
            'word_embeddings_v',
            shape=(self.total_words, self.word_embedding_size),
            dtype=tf.float32,
            trainable=False)
        self.embedding_init = word_table.assign(self.embedding_ph)
        utt_word_emb = tf.nn.embedding_lookup(word_table, self.utterance_ph)
        resp_word_emb = tf.nn.embedding_lookup(word_table, self.response_ph)

        sentence_cell = tf.nn.rnn_cell.GRUCell(
            self.rnn_units, kernel_initializer=tf.orthogonal_initializer())

        # Split along the utterance axis: one tensor per context turn.
        utt_word_emb_per_turn = tf.unstack(
            utt_word_emb, num=self.max_num_utterance, axis=1)
        utt_len_per_turn = tf.unstack(
            self.all_utterance_len_ph, num=self.max_num_utterance, axis=1)

        # ---- char-level inputs -------------------------------------------
        self.response_cph = tf.placeholder(
            tf.int32, shape=(None, self.max_sentence_len, self.wordlen))
        self.embedding_cph = tf.placeholder(
            tf.float32, shape=(self.total_chars, self.char_embedding_dim))
        char_table = tf.get_variable(
            'char_embeddings_v',
            shape=(self.total_chars, self.char_embedding_dim),
            dtype=tf.float32,
            trainable=False)
        self.char_embeddings_init = char_table.assign(self.embedding_cph)
        resp_char_emb = tf.nn.embedding_lookup(char_table, self.response_cph)

        self.utterance_cph = tf.placeholder(
            tf.int32,
            shape=(None, self.max_num_utterance, self.max_sentence_len,
                   self.wordlen))
        utt_char_emb = tf.nn.embedding_lookup(char_table, self.utterance_cph)
        utt_char_emb_per_turn = tf.unstack(
            utt_char_emb, num=self.max_num_utterance, axis=1)

        # ---- response: char + word embedding -----------------------------
        self.N = tf.placeholder(tf.int32, shape=None)
        hidden = 96
        # The original note reads "2 means (negative_samples + 1)";
        # presumably sample_numbers carries that value - TODO confirm.
        self.sample_numbers = tf.placeholder(tf.int32, shape=None)
        resp_flat_shape = [
            self.sample_numbers * self.N * self.max_sentence_len,
            self.wordlen,
            self.char_embedding_dim,
        ]
        resp_chars = tf.reshape(resp_char_emb, resp_flat_shape)
        self.dropout = tf.placeholder_with_default(0.0, (), name="dropout")
        resp_chars = tf.nn.dropout(resp_chars, 1.0 - 0.5 * self.dropout)
        resp_chars = conv(resp_chars,
                          hidden,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
        resp_chars = tf.reduce_max(resp_chars, axis=1)
        resp_seq_shape = [
            self.sample_numbers * self.N,
            self.max_sentence_len,
            int(resp_chars.shape[-1]),
        ]
        resp_chars = tf.reshape(resp_chars, resp_seq_shape)

        # NOTE: the char features above are built but not merged in; the
        # concat with the word embedding is disabled:
        #   c_emb = tf.concat([c_emb, ch_emb], axis=2)
        resp_emb = tf.nn.dropout(resp_word_emb, 1.0 - self.dropout)
        resp_emb = highway(resp_emb,
                           size=hidden,
                           scope="highway",
                           dropout=self.dropout,
                           reuse=None)

        # Bilinear matrix applied between utterance and response GRU states.
        bilinear = tf.get_variable(
            'A_matrix_v',
            shape=(self.rnn_units, self.rnn_units),
            initializer=tf.contrib.layers.xavier_initializer(),
            dtype=tf.float32)
        final_cell = tf.nn.rnn_cell.GRUCell(
            self.rnn_units, kernel_initializer=tf.orthogonal_initializer())
        # None on the first utterance (creates the conv/dense variables),
        # True afterwards (shares them).
        layer_reuse = None

        resp_states, _ = tf.nn.dynamic_rnn(sentence_cell,
                                           resp_emb,
                                           sequence_length=self.response_len,
                                           dtype=tf.float32,
                                           scope='sentence_GRU')
        self.response_embedding_save = resp_states
        # Swap the last two axes so the response is the right-hand operand
        # of the matmuls below.
        resp_emb = tf.transpose(resp_emb, perm=[0, 2, 1])
        resp_states = tf.transpose(resp_states, perm=[0, 2, 1])

        matching_vectors = []
        per_turn_inputs = zip(utt_word_emb_per_turn, utt_len_per_turn,
                              utt_char_emb_per_turn)
        for turn_word_emb, turn_len, turn_char_emb in per_turn_inputs:
            # Char features of this utterance (same shared "char_conv").
            turn_flat_shape = [
                self.sample_numbers * self.N * self.max_sentence_len,
                self.wordlen,
                self.char_embedding_dim,
            ]
            turn_chars = tf.reshape(turn_char_emb, turn_flat_shape)
            turn_chars = tf.nn.dropout(turn_chars, 1.0 - 0.5 * self.dropout)
            turn_chars = conv(turn_chars,
                              hidden,
                              bias=True,
                              activation=tf.nn.relu,
                              kernel_size=5,
                              name="char_conv",
                              reuse=True)
            turn_chars = tf.reduce_max(turn_chars, axis=1)
            turn_seq_shape = [
                self.sample_numbers * self.N,
                self.max_sentence_len,
                int(turn_chars.shape[-1]),
            ]
            turn_chars = tf.reshape(turn_chars, turn_seq_shape)

            # As for the response, the char/word concat is disabled:
            #   utt_emb = tf.concat([utt_emb, utt_ch_emb], axis=2)
            turn_emb = tf.nn.dropout(turn_word_emb, 1.0 - self.dropout)
            turn_emb = highway(turn_emb,
                               size=hidden,
                               scope="highway",
                               dropout=self.dropout,
                               reuse=True)

            # Similarity map 1: embedding-level dot products.
            word_sim = tf.matmul(turn_emb, resp_emb)
            turn_states, _ = tf.nn.dynamic_rnn(sentence_cell,
                                               turn_emb,
                                               sequence_length=turn_len,
                                               dtype=tf.float32,
                                               scope='sentence_GRU')
            # Similarity map 2: GRU states through the bilinear matrix.
            state_sim = tf.einsum('aij,jk->aik', turn_states,
                                  bilinear)  # TODO:check this
            state_sim = tf.matmul(state_sim, resp_states)

            sim_maps = tf.stack([word_sim, state_sim],
                                axis=3,
                                name='matrix_stack')
            conv_out = tf.layers.conv2d(
                sim_maps,
                filters=8,
                kernel_size=(3, 3),
                padding='VALID',
                kernel_initializer=tf.contrib.keras.initializers.he_normal(),
                activation=tf.nn.relu,
                reuse=layer_reuse,
                name='conv')  # TODO: check other params
            pooled = tf.layers.max_pooling2d(
                conv_out, (3, 3),
                strides=(3, 3),
                padding='VALID',
                name='max_pooling')  # TODO: check other params
            flat = tf.contrib.layers.flatten(pooled)
            turn_vector = tf.layers.dense(
                flat,
                50,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                activation=tf.tanh,
                reuse=layer_reuse,
                name='matching_v')  # TODO: check whether this is correct
            layer_reuse = True
            matching_vectors.append(turn_vector)

        # Utterances become the time axis (time-major) of the final GRU.
        stacked_vectors = tf.stack(matching_vectors,
                                   axis=0,
                                   name='matching_stack')
        _, last_hidden = tf.nn.dynamic_rnn(
            final_cell,
            stacked_vectors,
            dtype=tf.float32,
            time_major=True,
            scope='final_GRU')  # TODO: check time_major
        logits = tf.layers.dense(
            last_hidden,
            2,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name='final_v')
        self.y_pred = tf.nn.softmax(logits)
        per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.y_true, logits=logits)
        self.total_loss = tf.reduce_mean(per_example_loss)
        tf.summary.scalar('loss', self.total_loss)
        optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
        self.train_op = optimizer.minimize(self.total_loss)
Exemplo n.º 18
0
    def forward(self):
        """Assemble the model graph and attach its tensors to ``self``.

        Stages, in order: char + word input embedding, a weight-shared
        embedding encoder for passage and question, context-query attention,
        three passes through the model encoder, an attention decoder unrolled
        over ``self.a`` (driven by ``self.loop_function`` for beam search when
        one is set), and start/end pointer logits.

        Attributes written: ``c2q``, ``q2c``, ``enc``, ``gen_loss``,
        ``symbols``, ``logits``, ``yp1``, ``yp2``, ``loss`` and, when
        ``config.decay`` is set, ``var_ema`` and ``assign_vars``.

        Note: the final ``self.loss`` is the generation loss (plus the optional
        L2 term). The pointer cross-entropy built in ``Output_Layer`` is
        overwritten right after it is computed and does not contribute.
        """
        config = self.config
        # Beam-search decoding (loop_function set) uses the test batch size.
        N = config.test_batch_size if self.loop_function else config.batch_size
        PL, QL = self.c_maxlen, self.q_maxlen
        CL, d, dc = config.char_limit, config.hidden, config.char_dim
        nh, dw = config.num_heads, config.glove_dim

        with tf.variable_scope("Input_Embedding_Layer"):
            # Flatten (batch, tokens) so every token is one sequence of chars.
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            # self.dropout is a drop rate; chars are dropped at half that rate.
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder; the question reuses the
            # passage's "char_conv" weights.
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)

            # Max over the char axis leaves one vector per token.
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            # Restore the (batch, tokens, features) layout.
            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            # Word embedding followed by char-derived embedding per token.
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=2,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=2,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # Similarity between every passage position and question position.
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            # Softmax over the question axis (masked by q_mask) ...
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            # ... and over the passage axis (masked by c_mask), transposed.
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]
            # Three passes through the same "Model_Encoder" weights; enc[i + 1]
            # is the output of pass i.
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=2,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope("Decoder_Layer"):
            memory = tf.concat([self.enc[1], self.enc[2], self.enc[3]],
                               axis=-1)
            # One slice of self.a per decoding step.
            oups = tf.split(self.a, [1] * self.a_maxlen, 1)
            # Both halves of the initial cell state come from the memory
            # averaged over axis 1, through separate projections.
            h = tf.tanh(
                _linear(tf.reduce_mean(memory, axis=1),
                        output_size=d,
                        bias=False,
                        scope="h_initial"))
            # NOTE: from here on `c` is the decoder cell state, no longer the
            # encoded passage used above.
            c = tf.tanh(
                _linear(tf.reduce_mean(memory, axis=1),
                        output_size=d,
                        bias=False,
                        scope="c_initial"))
            state = (c, h)
            outputs = []
            prev = None
            prev_probs = [0.0]
            symbols = []
            for i, inp in enumerate(oups):
                einp = tf.reshape(tf.nn.embedding_lookup(self.word_mat, inp),
                                  [N, dw])
                if i > 0:
                    # Every step after the first shares the decoder variables.
                    tf.get_variable_scope().reuse_variables()

                if self.loop_function is not None and prev is not None:
                    # Decoding mode: the step input comes from the previous
                    # output via loop_function rather than from self.a, and
                    # everything kept so far is re-gathered with the returned
                    # index.
                    with tf.variable_scope("loop_function", reuse=True):
                        einp, prev_probs, index, prev_symbol = self.loop_function(
                            prev, prev_probs, self.beam_size, i)
                        h = tf.gather(h, index)  # update prev state
                        state = tuple(tf.gather(s, index)
                                      for s in state)  # update prev state
                        for j, symbol in enumerate(symbols):
                            symbols[j] = tf.gather(
                                symbol, index)  # update prev symbols
                        for j, output in enumerate(outputs):
                            outputs[j] = tf.gather(
                                output, index)  # update prev outputs
                        symbols.append(prev_symbol)

                # Attend over the encoder memory using the current hidden state.
                attn = tf.reshape(
                    multihead_attention(tf.expand_dims(h, 1),
                                        units=d,
                                        num_heads=nh,
                                        memory=memory,
                                        mask=self.c_mask,
                                        bias=False), [-1, nh * d])

                cinp = tf.concat([einp, attn], 1)
                h, state = self.cell(cinp, state)

                with tf.variable_scope("AttnOutputProjection"):
                    output = _linear([h] + [cinp],
                                     output_size=dw * 2,
                                     bias=False,
                                     scope="output")
                    output = tf.reshape(output, [-1, dw, 2])
                    output = tf.reduce_max(output, 2)  # maxout
                    outputs.append(output)

                if self.loop_function is not None:
                    prev = output

            if self.loop_function is not None:
                # process the last symbol
                einp, prev_probs, index, prev_symbol = self.loop_function(
                    prev, prev_probs, self.beam_size, i + 1)
                for j, symbol in enumerate(symbols):
                    symbols[j] = tf.gather(symbol,
                                           index)  # update prev symbols
                for j, output in enumerate(outputs):
                    outputs[j] = tf.gather(output,
                                           index)  # update prev outputs
                symbols.append(prev_symbol)

                # output the final best result of beam search
                for k, symbol in enumerate(symbols):
                    symbols[k] = tf.gather(symbol, 0)
                for k, output in enumerate(outputs):
                    outputs[k] = tf.expand_dims(tf.gather(output, 0), 0)

            self.gen_loss = self._compute_loss(outputs, oups, N)
            self.symbols = symbols

        with tf.variable_scope("Output_Layer"):
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)
            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = self.logits

            # Joint start/end probabilities, restricted to spans with
            # start <= end <= start + config.ans_limit.
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        # The generation loss replaces the pointer loss computed just above.
        self.loss = self.gen_loss

        if config.l2_norm is not None:
            # NOTE(review): `regularizer` is a module-level name, applied here
            # to the tensors already in REGULARIZATION_LOSSES - confirm this is
            # the intended L2 term.
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            # Evaluating the loss also runs the moving-average update.
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                # Ops that copy each averaged value back into its variable.
                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # average() yields None for untracked variables; compare
                    # against None instead of truth-testing a TF variable.
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))
Exemplo n.º 19
0
    def _embed(self):
        """Create the embedding tables and the highway-encoded token inputs.

        Under the ``word_char_embedding`` scope this defines ``self.word_mat``
        and ``self.char_mat`` (either a frozen pretrained part stacked under two
        trainable leading rows, or one fully trainable table initialised from
        the vocab) plus ``self.ch_len`` / ``self.qh_len``. Under
        ``Input_Embedding_Layer`` it builds the char-conv + word embeddings and
        stores the highway outputs in ``self.c_emb`` and ``self.q_emb``.
        """
        vocab = self.vocab
        freeze_pretrained = self.config.fix_pretrained_vector

        def _two_part_table(table, n_rows, n_cols, frozen_name, head_name):
            # Rows 2.. are kept fixed; rows 0-1 stay trainable. Returns both
            # variables and their concatenation (trainable rows first).
            frozen = tf.get_variable(
                frozen_name,
                [n_rows - 2, n_cols],
                dtype=tf.float32,
                initializer=tf.constant_initializer(table[2:],
                                                    dtype=tf.float32),
                trainable=False)
            head = tf.get_variable(
                head_name,
                [2, frozen.get_shape()[1]],
                dtype=tf.float32,
                initializer=tf.constant_initializer(table[:2],
                                                    dtype=tf.float32),
                trainable=True)
            return frozen, head, tf.concat([head, frozen], axis=0)

        def _chars_per_token(char_ids):
            # Count of non-zero char ids along axis 2, flattened to 1-D.
            present = tf.cast(tf.cast(char_ids, tf.bool), tf.int32)
            return tf.reshape(tf.reduce_sum(present, axis=2), [-1])

        with tf.variable_scope('word_char_embedding'):
            if freeze_pretrained:
                (self.pretrained_word_mat, self.word_pad_unk_mat,
                 self.word_mat) = _two_part_table(vocab.word_embeddings,
                                                  vocab.word_size(),
                                                  vocab.word_embed_dim,
                                                  "word_emb_mat",
                                                  "word_unk_pad")
                (self.pretrained_char_mat, self.char_pad_unk_mat,
                 self.char_mat) = _two_part_table(vocab.char_embeddings,
                                                  vocab.char_size(),
                                                  vocab.char_embed_dim,
                                                  "char_emb_mat",
                                                  "char_unk_pad")
            else:
                word_init = tf.constant_initializer(vocab.word_embeddings)
                self.word_mat = tf.get_variable(
                    'word_embeddings',
                    shape=[vocab.word_size(), vocab.word_embed_dim],
                    initializer=word_init,
                    trainable=True)

                char_init = tf.constant_initializer(vocab.char_embeddings)
                self.char_mat = tf.get_variable(
                    'char_embeddings',
                    shape=[vocab.char_size(), vocab.char_embed_dim],
                    initializer=char_init,
                    trainable=True)

            self.ch_len = _chars_per_token(self.ch)
            self.qh_len = _chars_per_token(self.qh)

        N, PL, QL, CL, d, dc, nh = self._params()
        if freeze_pretrained:
            # Take the char width from the assembled table in this mode.
            dc = self.char_mat.get_shape()[-1]

        with tf.variable_scope("Input_Embedding_Layer"):
            # One row per token, one column per character.
            passage_chars = tf.nn.embedding_lookup(self.char_mat, self.ch)
            ch_emb = tf.reshape(passage_chars,
                                shape=[N * PL * self.max_p_num, CL, dc])
            question_chars = tf.nn.embedding_lookup(self.char_mat, self.qh)
            qh_emb = tf.reshape(question_chars,
                                shape=[N * QL * self.max_p_num, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb,
                                   keep_prob=1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb,
                                   keep_prob=1.0 - 0.5 * self.dropout)

            # Passage and question go through the same "char_conv" weights.
            ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                          kernel_size=5, name="char_conv", reuse=None)
            qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                          kernel_size=5, name="char_conv", reuse=True)

            # Max over the char axis, then back to (rows, tokens, features).
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)
            ch_emb = tf.reshape(ch_emb, shape=[N * self.max_p_num, PL, -1])
            qh_emb = tf.reshape(qh_emb, shape=[N * self.max_p_num, QL, -1])

            passage_words = tf.nn.embedding_lookup(self.word_mat, self.c)
            c_emb = tf.nn.dropout(passage_words,
                                  keep_prob=1.0 - self.dropout)
            question_words = tf.nn.embedding_lookup(self.word_mat, self.q)
            q_emb = tf.nn.dropout(question_words,
                                  keep_prob=1.0 - self.dropout)

            # Word vector followed by char-derived vector for each token.
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            self.c_emb = highway(c_emb, size=d, scope="highway",
                                 dropout=self.dropout, reuse=None)
            self.q_emb = highway(q_emb, size=d, scope="highway",
                                 dropout=self.dropout, reuse=True)
Exemplo n.º 20
0
    def __init__(self,
                 config,
                 batch,
                 word_mat=None,
                 char_mat=None,
                 filter_sizes=None,
                 embedding_size=None,
                 num_filters=None,
                 trainable=True,
                 l2_reg_lambda=0.0,
                 keep_prob=0.9,
                 graph=None):
        """Build a 3-way classifier that averages two conv/max-pool branches.

        Both branches share the word embeddings but differ in the char-level
        token features: one uses the final states of a bidirectional GRU, the
        other a char convolution with max-pooling. ``self.scores`` is the mean
        of the two branch scores.

        Args:
            config: Hyper-parameter object. Read here: ``para_limit``,
                ``ques_limit``, ``test_para_limit``, ``test_ques_limit``,
                ``batch_size``, ``char_limit``, ``hidden``, ``char_dim``,
                ``num_heads``, ``char_hidden``, ``decay``, ``init_lr``,
                ``grad_clip``.
            batch: Object whose ``get_next()`` yields seven tensors when
                ``trainable`` is truthy, five otherwise.
            word_mat: Initial value of the word embedding table.
            char_mat: Initial value of the char embedding table.
            filter_sizes: Sized iterable of convolution heights.
            embedding_size: Unused; kept for interface compatibility.
            num_filters: Output channels of each convolution.
            trainable: Build the loss, accuracy, optimizer and saver as well.
            l2_reg_lambda: Weight of the accumulated L2 term in the loss.
            keep_prob: Keep probability for dropout on the pooled features.
            graph: Stored as ``self.graph``; a new ``tf.Graph`` if ``None``.
        """
        self.config = config
        self.graph = graph if graph is not None else tf.Graph()
        self.trainable = trainable
        # Training batches additionally carry the labels and question ids.
        # NOTE(review): the original comment gave the labels as (64, 3) -
        # confirm against the input pipeline.
        if trainable:
            (self.input_x, self.input_x1, self.ch, self.qh, self.input_y,
             self.qa_id, self.alternatives_tokens) = batch.get_next()
        else:
            (self.input_x, self.input_x1, self.ch, self.qh,
             self.alternatives_tokens) = batch.get_next()
        self.dropout_keep_prob = keep_prob
        self.global_step = tf.get_variable(
            'global_step',
            shape=[],
            dtype=tf.int32,
            initializer=tf.constant_initializer(0),
            trainable=False)
        # Drop rate (not keep probability): keep_prob = 1 - self.dropout.
        self.dropout = tf.placeholder_with_default(0.5, (), name="dropout")
        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)
        # Embedding layer
        with tf.name_scope("embedding"):
            self.char_mat = tf.get_variable("char_mat",
                                            initializer=tf.constant(
                                                char_mat, dtype=tf.float32),
                                            trainable=True)
            self.W = tf.get_variable("word_mat",
                                     initializer=tf.constant(word_mat,
                                                             dtype=tf.float32),
                                     trainable=True)
            # Inputs are padded to a common length with 0, so non-zero ids
            # mark real tokens.
            self.c_mask = tf.cast(self.input_x, tf.bool)
            self.q_mask = tf.cast(self.input_x1, tf.bool)
            if trainable:
                self.c_maxlen = config.para_limit
                self.q_maxlen = config.ques_limit
            else:
                self.c_maxlen = config.test_para_limit
                self.q_maxlen = config.test_ques_limit
            # Number of non-zero char ids per token, flattened to 1-D.
            self.ch_len = tf.reshape(
                tf.reduce_sum(tf.cast(tf.cast(self.ch, tf.bool), tf.int32),
                              axis=2), [-1])
            self.qh_len = tf.reshape(
                tf.reduce_sum(tf.cast(tf.cast(self.qh, tf.bool), tf.int32),
                              axis=2), [-1])
            N, PL, QL = config.batch_size, self.c_maxlen, self.q_maxlen
            CL, d, dc = config.char_limit, config.hidden, config.char_dim
            nh, dg = config.num_heads, config.char_hidden
            # One row per token, one column per character.
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)
            with tf.variable_scope("cnn_char_Embedding_Layer"):
                # Bidaf style conv-highway encoder
                ch_emb_cnn = conv(ch_emb,
                                  d,
                                  bias=True,
                                  activation=tf.nn.relu,
                                  kernel_size=5,
                                  name="char_conv",
                                  reuse=None)
                qh_emb_cnn = conv(qh_emb,
                                  d,
                                  bias=True,
                                  activation=tf.nn.relu,
                                  kernel_size=5,
                                  name="char_conv",
                                  reuse=True)

                # Max over the char axis; k-max pooling could be tried here.
                ch_emb_cnn = tf.reduce_max(ch_emb_cnn, axis=1)
                qh_emb_cnn = tf.reduce_max(qh_emb_cnn, axis=1)

                ch_emb_cnn = tf.reshape(ch_emb_cnn,
                                        [N, PL, ch_emb_cnn.shape[-1]])
                qh_emb_cnn = tf.reshape(qh_emb_cnn,
                                        [N, QL, qh_emb_cnn.shape[-1]])
            with tf.variable_scope('lstm_char_embedding'):
                # Despite the scope name these are GRU cells with dg units;
                # the same two cells encode passage and question tokens.
                cell_fw = tf.contrib.rnn.GRUCell(dg)
                cell_bw = tf.contrib.rnn.GRUCell(dg)
                # self.ch_len supplies the real char count of each of the
                # N * PL flattened tokens as the sequence length.
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
                # Final forward and backward states, side by side.
                ch_emb_lstm = tf.concat([state_fw, state_bw], axis=1)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, qh_emb, self.qh_len,
                    dtype=tf.float32)
                qh_emb_lstm = tf.concat([state_fw, state_bw], axis=1)

                # Back to (batch, tokens, 2 * dg): each token's characters are
                # now summarised by the final GRU states, which become its
                # char-level embedding. (Original author's idea to try: use
                # pinyin as the char-level signal for Chinese.)
                qh_emb_lstm = tf.reshape(qh_emb_lstm, [N, QL, 2 * dg])
                ch_emb_lstm = tf.reshape(ch_emb_lstm, [N, PL, 2 * dg])
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            self.embedded_chars1 = tf.nn.embedding_lookup(
                self.W, self.input_x1)
            # self.dropout is a drop rate, so convert it to a keep probability
            # (passing it directly yields keep_prob == 0 when dropout is fed
            # as 0.0).
            ch_emb_cnn = tf.nn.dropout(ch_emb_cnn, 1.0 - self.dropout)
            ch_emb_lstm = tf.nn.dropout(ch_emb_lstm, 1.0 - self.dropout)
            qh_emb_cnn = tf.nn.dropout(qh_emb_cnn, 1.0 - self.dropout)
            qh_emb_lstm = tf.nn.dropout(qh_emb_lstm, 1.0 - self.dropout)
            with tf.variable_scope("lstm_output"):
                self.lstm_scores, l2_loss = self._highway_cnn_scores(
                    ch_emb_lstm, qh_emb_lstm, filter_sizes, num_filters, PL,
                    d, l2_loss)
            with tf.variable_scope("cnn_output"):
                self.cnn_scores, l2_loss = self._highway_cnn_scores(
                    ch_emb_cnn, qh_emb_cnn, filter_sizes, num_filters, PL, d,
                    l2_loss)
        # Ensemble: plain average of the two branches' scores.
        self.scores = tf.add(self.lstm_scores, self.cnn_scores) / 2.0
        self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Calculate mean cross-entropy loss
        if trainable:
            with tf.name_scope("loss"):
                losses = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.scores, labels=self.input_y)
                self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

            # Accuracy
            with tf.name_scope("accuracy"):
                correct_predictions = tf.equal(self.predictions,
                                               tf.argmax(self.input_y, 1))
                self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                       "float"),
                                               name="accuracy")
            if config.decay is not None:
                self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
                ema_op = self.var_ema.apply(tf.trainable_variables())
                # Evaluating the loss also runs the moving-average update.
                with tf.control_dependencies([ema_op]):
                    self.loss = tf.identity(self.loss)

                    # Ops that copy each averaged value into its variable.
                    self.assign_vars = []
                    for var in tf.global_variables():
                        v = self.var_ema.average(var)
                        # average() yields None for untracked variables;
                        # compare against None instead of truth-testing a
                        # TF variable.
                        if v is not None:
                            self.assign_vars.append(tf.assign(var, v))
            # Learning rate grows with log(step + 1), reaching 0.001 at
            # step 998, and is capped at config.init_lr.
            self.lr = tf.minimum(
                config.init_lr, 0.001 / tf.log(999.) *
                tf.log(tf.cast(self.global_step, tf.float32) + 1))
            self.opt = tf.train.AdamOptimizer(learning_rate=self.lr,
                                              beta1=0.8,
                                              beta2=0.999,
                                              epsilon=1e-7)
            grads = self.opt.compute_gradients(self.loss)
            gradients, variables = zip(*grads)
            capped_grads, _ = tf.clip_by_global_norm(gradients,
                                                     config.grad_clip)
            self.train_op = self.opt.apply_gradients(
                zip(capped_grads, variables), global_step=self.global_step)
            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)

    def _highway_cnn_scores(self, c_char_emb, q_char_emb, filter_sizes,
                            num_filters, pool_len, d, l2_loss):
        """Build one highway + conv/max-pool + linear scoring branch.

        Must be called inside the branch's own variable scope so the two
        branches get separate variables. Overwrites
        ``embedded_chars_expanded``, ``embedded_chars_expanded1``, ``h_pool``,
        ``h_pool_flat`` and ``h_drop`` on ``self``.

        Args:
            c_char_emb: Char-level passage features, concatenated after
                ``self.embedded_chars`` on axis 2.
            q_char_emb: Char-level question features, concatenated after
                ``self.embedded_chars1`` on axis 2.
            filter_sizes: Sized iterable of convolution heights.
            num_filters: Output channels of each convolution.
            pool_len: Passage length; each pooling window spans
                ``pool_len - filter_size + 1`` rows, i.e. the full height of
                the VALID convolution output.
            d: Size passed to ``highway``.
            l2_loss: Running L2 tensor to extend.

        Returns:
            ``(scores, l2_loss)``: the branch's 3-column scores and the L2
            tensor extended with this branch's weights and biases.
        """
        c_emb = tf.concat([self.embedded_chars, c_char_emb], axis=2)
        q_emb = tf.concat([self.embedded_chars1, q_char_emb], axis=2)
        # NOTE(review): the original comment says highway filters the features
        # and reduces them to 75 dims - presumably d == 75; confirm.
        c_emb = highway(c_emb,
                        size=d,
                        scope="highway",
                        dropout=self.dropout,
                        reuse=None)
        q_emb = highway(q_emb,
                        size=d,
                        scope="highway",
                        dropout=self.dropout,
                        reuse=True)
        # Trailing channel axis so conv2d can treat the sequence as an image.
        self.embedded_chars_expanded = tf.expand_dims(c_emb, -1)
        self.embedded_chars_expanded1 = tf.expand_dims(q_emb, -1)

        # Create a convolution + maxpool layer for each filter size
        input_shape = c_emb.get_shape().as_list()
        pooled_outputs = []
        for filter_size in filter_sizes:
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer: spans the full feature width.
                filter_shape = [filter_size, input_shape[-1], 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1),
                                name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]),
                                name="b")
                l2_loss += tf.nn.l2_loss(W)
                l2_loss += tf.nn.l2_loss(b)
                conv_output = tf.nn.conv2d(self.embedded_chars_expanded,
                                           W,
                                           strides=[1, 1, 1, 1],
                                           padding="VALID",
                                           name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv_output, b), name="relu")
                # Max-pool over every position the convolution produced.
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, pool_len - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat,
                                        self.dropout_keep_prob)

        # Final (unnormalized) scores
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, 3],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[3]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
        return scores, l2_loss
Exemplo n.º 21
0
    def forward(self):
        """Build the forward graph, the span loss and the optional EMA ops.

        Stages, each in its own variable scope:

        1. ``Input_Embedding_Layer``: word + character embeddings when
           ``config.type == "all"``, or a single lookup in ``self.char_mat``
           when ``config.type == "char"``, followed by a highway network
           whose weights are shared between context and question.
        2. ``Embedding_Encoder_Layer``: one residual block, shared between
           context and question.
        3. ``Context_to_Query_Attention_Layer``: trilinear similarity and
           the two attention products stored in ``self.c2q`` / ``self.q2c``.
        4. ``Model_Encoder_Layer``: three passes through one shared stack of
           residual blocks; all intermediate results are kept in
           ``self.enc``.
        5. ``Output_Layer``: masked start/end logits (``self.logits``), the
           predicted indices ``self.yp1`` / ``self.yp2`` and ``self.loss``.

        When ``config.l2_norm`` is not None a regularization term is added to
        the loss. When ``config.decay`` is not None, ``self.var_ema`` and
        ``self.assign_vars`` are created as well.

        Raises:
            ValueError: if ``config.type`` is neither ``"all"`` nor
                ``"char"``.
        """
        config = self.config
        # The demo graph is built for a single example.
        N = 1 if self.demo else config.batch_size
        PL, QL = self.c_maxlen, self.q_maxlen
        CL, d, dc, nh = (config.char_limit, config.hidden,
                         config.tw_char_dim, config.num_heads)

        with tf.variable_scope("Input_Embedding_Layer"):
            if config.type == "all":
                # Flatten to [N * seq_len, CL, dc] so the convolution runs
                # over the characters of each word independently.
                ch_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.ch),
                    [N * PL, CL, dc])
                qh_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.qh),
                    [N * QL, CL, dc])
                # Character embeddings use half the dropout rate.
                ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
                qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

                # Bidaf style conv-highway encoder; the second call reuses
                # the "char_conv" weights created by the first.
                ch_emb = conv(ch_emb,
                              d,
                              bias=True,
                              activation=tf.nn.relu,
                              kernel_size=5,
                              name="char_conv",
                              reuse=None)
                qh_emb = conv(qh_emb,
                              d,
                              bias=True,
                              activation=tf.nn.relu,
                              kernel_size=5,
                              name="char_conv",
                              reuse=True)

                # Max over the character axis: one vector per word.
                ch_emb = tf.reduce_max(ch_emb, axis=1)
                qh_emb = tf.reduce_max(qh_emb, axis=1)

                # Restore the batch dimension: [N, seq_len, features].
                ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
                qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

                c_emb = tf.nn.dropout(
                    tf.nn.embedding_lookup(self.word_mat, self.c),
                    1.0 - self.dropout)
                q_emb = tf.nn.dropout(
                    tf.nn.embedding_lookup(self.word_mat, self.q),
                    1.0 - self.dropout)

                # Word and character features side by side.
                c_emb = tf.concat([c_emb, ch_emb], axis=2)
                q_emb = tf.concat([q_emb, qh_emb], axis=2)

            elif config.type == 'char':
                # Only self.char_mat is used: self.c / self.q are looked up
                # directly in the character matrix.
                c_emb = tf.nn.dropout(
                    tf.nn.embedding_lookup(self.char_mat, self.c),
                    1.0 - self.dropout)
                q_emb = tf.nn.dropout(
                    tf.nn.embedding_lookup(self.char_mat, self.q),
                    1.0 - self.dropout)

            else:
                # Fail here with a clear message instead of an
                # UnboundLocalError on c_emb further down.
                raise ValueError(
                    "Unsupported config.type: %r (expected 'all' or 'char')"
                    % (config.type,))

            # Both embedding modes finish with the same shared highway.
            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # Equivalent unoptimized formulation, kept for reference:
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            # Softmax over the last axis with the question mask applied.
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            # Softmax over axis 1 with the context mask applied, transposed
            # so it can be chained after S_.
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]
            # Three passes through the same encoder stack; weights are
            # created on the first pass and reused afterwards.
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope("Output_Layer"):
            # Start uses the outputs of passes 1 and 2, end those of 1 and 3.
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)
            # guess : mask the padding part pad in the end of the passage
            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = self.logits

            # Joint start/end probabilities; the band keeps only pairs with
            # start <= end <= start + config.ans_limit.
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1,
                                                                labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits2, labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        if config.l2_norm is not None:
            # NOTE(review): `regularizer` is not defined in this method;
            # presumably a module-level object - confirm.
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            # Evaluating self.loss now also triggers the EMA update.
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                # Ops that overwrite each tracked variable with its average.
                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # average() returns None for untracked variables; test
                    # for that explicitly rather than relying on the
                    # truthiness of a variable object.
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))
Exemplo n.º 22
0
    def forward(self):
        """Create the placeholders, model graph, loss, optimizer and EMA ops.

        Placeholders created here: ``self.c_words`` / ``self.c_chars``
        (context), ``self.q_words`` / ``self.q_chars`` (question) and the
        label indices ``self.start`` / ``self.end``.

        The graph embeds both inputs, runs two highway layers, a width-1
        convolution projection and a shared input encoder, applies
        bi-attention, and then runs three passes of the modeling encoder.
        Start and end logits come from dense layers over pairs of modeling
        outputs.

        Sets ``self.pred_start``, ``self.pred_end``, ``self.loss`` and
        ``self.optimize``; ``self.assign_vars`` is set only when
        ``self.config.ema_decay > 0``.
        """
        config = self.config

        self.c_words = tf.placeholder(tf.int32,
                                      [None, config.context_len],
                                      'context-words')
        self.c_chars = tf.placeholder(
            tf.int32,
            [None, config.context_len, config.max_char_len],
            'context-chars')
        # sign() maps every positive id to 1 and id 0 to 0.
        # NOTE(review): assumes id 0 is the padding token - confirm.
        self.c_mask = tf.sign(self.c_words)

        self.q_words = tf.placeholder(tf.int32,
                                      [None, config.question_len],
                                      'query-words')
        self.q_chars = tf.placeholder(
            tf.int32,
            [None, config.question_len, config.max_char_len],
            'query-chars')
        self.q_mask = tf.sign(self.q_words)

        # Per-example lengths are the number of set mask entries.
        self.c_len = tf.cast(tf.reduce_sum(self.c_mask, -1), tf.int32)
        self.q_len = tf.cast(tf.reduce_sum(self.q_mask, -1), tf.int32)

        self.start = tf.placeholder(tf.int32, [None], 'start-index')
        self.end = tf.placeholder(tf.int32, [None], 'end-index')

        with tf.variable_scope('input-embedding'):
            c_w = tf.nn.embedding_lookup(self.word_embed, self.c_words)
            q_w = tf.nn.embedding_lookup(self.word_embed, self.q_words)

            c_ch = layers.char_embed(self.c_chars,
                                     self.char_embed,
                                     dropout=self.dropout)
            q_ch = layers.char_embed(self.q_chars,
                                     self.char_embed,
                                     dropout=self.dropout,
                                     reuse=True)

            # Word and character features concatenated on the last axis.
            c = tf.concat([c_w, c_ch], -1)
            q = tf.concat([q_w, q_ch], -1)

        # In every shared stage below the context call creates the weights
        # and the question call reuses them.
        with tf.variable_scope('highway-1'):
            c = layers.highway(c, config.embed_size, dropout=self.dropout)
            q = layers.highway(q,
                               config.embed_size,
                               dropout=self.dropout,
                               reuse=True)

        with tf.variable_scope('highway-2'):
            c = layers.highway(c, config.embed_size, dropout=self.dropout)
            q = layers.highway(q,
                               config.embed_size,
                               dropout=self.dropout,
                               reuse=True)

        with tf.variable_scope('projection'):
            # Width-1 convolution: a per-position projection to
            # config.filters channels.
            c = tf.layers.conv1d(c, config.filters, 1, padding='same')
            q = tf.layers.conv1d(q,
                                 config.filters,
                                 1,
                                 padding='same',
                                 reuse=True)

        with tf.variable_scope('input-encoder'):
            c = layers.encoder_block(c,
                                     num_blocks=1,
                                     num_convolutions=4,
                                     kernel=7,
                                     mask=self.c_mask,
                                     dropout=self.dropout)

            q = layers.encoder_block(q,
                                     num_blocks=1,
                                     num_convolutions=4,
                                     kernel=7,
                                     mask=self.q_mask,
                                     dropout=self.dropout,
                                     reuse=True)

        with tf.variable_scope('attention'):
            attention = layers.bi_attention(c, q, layers.trilinear(c, q),
                                            self.c_mask, self.q_mask)
            attention = tf.layers.conv1d(attention,
                                         config.filters,
                                         1,
                                         padding='same')

        # modeling[0] is the attention output; modeling[1..3] are the
        # outputs of the three encoder passes.
        modeling = [attention]
        for i in range(3):
            reuse = i > 0  # weights are created on the first pass only
            m = layers.encoder_block(modeling[i],
                                     num_blocks=7,
                                     num_convolutions=2,
                                     kernel=5,
                                     mask=self.c_mask,
                                     dropout=self.dropout,
                                     reuse=reuse)
            if i % 2 == 0:
                # Extra dropout on the output of the first and third pass.
                m = tf.nn.dropout(m, 1.0 - self.dropout)
            modeling.append(m)

        with tf.variable_scope('start-index'):
            # Start logits use the first and second pass outputs.
            self.start_linear = tf.concat([modeling[-3], modeling[-2]], -1)
            self.start_linear = tf.squeeze(
                tf.layers.dense(self.start_linear, 1, use_bias=False), -1)
            self.pred_start = tf.nn.softmax(self.start_linear,
                                            name='pred-start')

        with tf.variable_scope('end-index'):
            # End logits use the first and third pass outputs.
            self.end_linear = tf.concat([modeling[-3], modeling[-1]], -1)
            self.end_linear = tf.squeeze(
                tf.layers.dense(self.end_linear, 1, use_bias=False), -1)
            self.pred_end = tf.nn.softmax(self.end_linear, name='pred-end')

        with tf.variable_scope('loss'):
            loss1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.start_linear, labels=self.start)
            loss2 = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.end_linear, labels=self.end)
            loss = tf.reduce_mean(loss1 + loss2)
            # L2 penalty over every trainable variable whose name does not
            # contain 'bias'.
            lossL2 = tf.add_n([
                tf.nn.l2_loss(v)
                for v in tf.trainable_variables() if 'bias' not in v.name
            ]) * config.l2
            self.loss = loss + lossL2

        with tf.variable_scope('optimizer'):
            optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
            # Fetch the variable list once so gradients and variables are
            # paired from the same snapshot.
            train_vars = tf.trainable_variables()
            grads = tf.gradients(self.loss, train_vars)
            grads, _ = tf.clip_by_global_norm(grads, config.grad_clip)
            self.optimize = optimizer.apply_gradients(
                zip(grads, train_vars), global_step=self.global_step)

        if config.ema_decay > 0:
            with tf.variable_scope('ema'):
                ema = tf.train.ExponentialMovingAverage(
                    decay=config.ema_decay)
                ema_op = ema.apply(tf.trainable_variables())
                # Evaluating self.loss now also triggers the EMA update.
                with tf.control_dependencies([ema_op]):
                    self.loss = tf.identity(self.loss)
                    assign_vars = []
                    for var in tf.global_variables():
                        v = ema.average(var)
                        # average() returns None for untracked variables;
                        # test for that explicitly rather than relying on
                        # the truthiness of a variable object.
                        if v is not None:
                            assign_vars.append(tf.assign(var, v))
                self.assign_vars = assign_vars