Example #1
    def _output(self):
        # TODO: check whether to use the encodes before dropout or after dropout
        self.start_logits = tf.squeeze(
            fc(tf.concat([self.model_encodes[-3], self.model_encodes[-2]],
                         axis=-1),
               1,
               activation_fn=None,
               biases_initializer=None,
               scope='start_pointer'), -1)
        self.end_logits = tf.squeeze(
            fc(tf.concat([self.model_encodes[-3], self.model_encodes[-1]],
                         axis=-1),
               1,
               activation_fn=None,
               biases_initializer=None,
               scope='end_pointer'), -1)
        self.start_logits = mask_logits(self.start_logits, mask=self.c_mask)
        self.end_logits = mask_logits(self.end_logits, mask=self.c_mask)
        self.start_probs = tf.nn.softmax(self.start_logits)
        self.end_probs = tf.nn.softmax(self.end_logits)

        self.outer_product = tf.matmul(
            tf.expand_dims(self.start_probs, axis=2),
            tf.expand_dims(self.end_probs, axis=1))
        self.outer_product = tf.matrix_band_part(
            self.outer_product, 0,
            tf.cast(
                tf.minimum(
                    tf.shape(self.outer_product)[2] - 1, self.max_answer_len),
                tf.int64))
        self.pred_start = tf.argmax(tf.reduce_max(self.outer_product, axis=2),
                                    axis=1)
        self.pred_end = tf.argmax(tf.reduce_max(self.outer_product, axis=1),
                                  axis=1)
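The span-selection trick used above (an outer product of start and end probabilities, clipped by tf.matrix_band_part so that start <= end <= start + max_answer_len) can be reproduced with a small NumPy sketch. This is an illustration of the idea only, not code from the project; the band mask is emulated with np.triu.

import numpy as np

def select_span(start_probs, end_probs, max_answer_len):
    # start_probs, end_probs: (seq_len,) softmax distributions over positions
    outer = np.outer(start_probs, end_probs)          # (seq_len, seq_len)
    # tf.matrix_band_part(outer, 0, k) keeps entries with start <= end <= start + k
    band = np.triu(outer) - np.triu(outer, k=max_answer_len + 1)
    start = int(np.argmax(band.max(axis=1)))
    end = int(np.argmax(band.max(axis=0)))
    return start, end

start_probs = np.array([0.1, 0.6, 0.2, 0.1])
end_probs = np.array([0.1, 0.1, 0.7, 0.1])
print(select_span(start_probs, end_probs, max_answer_len=2))  # (1, 2)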
Example #2
    def _decode(self):

        N, PL, QL, CL, d, dc, nh = self._params()

        if self.config.use_position_attn:
            start_logits = tf.squeeze(
                conv(self._attention(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                                     name="attn1"), 1, bias=False, name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(self._attention(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                                     name="attn2"), 1, bias=False, name="end_pointer"), -1)
        else:
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1,
                     bias=False, name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1,
                     bias=False, name="end_pointer"), -1)

        self.logits = [
            mask_logits(start_logits, mask=tf.reshape(self.c_mask, [N, -1])),
            mask_logits(end_logits, mask=tf.reshape(self.c_mask, [N, -1]))
        ]

        self.logits1, self.logits2 = [l for l in self.logits]

        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))

        outer = tf.matrix_band_part(outer, 0, self.max_a_len)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
Example #3
File: model2.py  Project: Yaozeng/MRC
    def _decode(self):

        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                 1,
                 bias=False,
                 name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                 1,
                 bias=False,
                 name="end_pointer"), -1)

        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask)
        ]

        self.logits1, self.logits2 = [l for l in self.logits]

        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))

        outer = tf.matrix_band_part(outer, 0, self.max_a_len)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
Example #4
    def _fuse(self):

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            C = tf.tile(tf.expand_dims(self.c_embed_encoding, 2),
                        [1, 1, self.max_q_len, 1])
            Q = tf.tile(tf.expand_dims(self.q_embed_encoding, 1),
                        [1, self.max_p_len, 1, 1])
            S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, self.q_embed_encoding)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), self.c_embed_encoding)
            self.attention_outputs = [
                self.c_embed_encoding, self.c2q,
                self.c_embed_encoding * self.c2q,
                self.c_embed_encoding * self.q2c
            ]

        # _params() returns, in order:
        #   N  = self.config.batch_size if not self.demo else 1
        #   PL = self.max_p_len
        #   QL = self.max_q_len
        #   CL = self.config.max_ch_len
        #   d  = self.config.hidden_size
        #   dc = self.config.char_embed_size
        #   nh = self.config.head_size
        N, PL, QL, CL, d, dc, nh = self._params()
        if self.config.fix_pretrained_vector:
            dc = self.char_mat.get_shape()[-1]
        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(self.attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]
            for i in range(3):
                if i % 2 == 0:
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=1,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=True,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

            for i, item in enumerate(self.enc):
                self.enc[i] = tf.reshape(self.enc[i],
                                         [N, -1, self.enc[i].get_shape()[-1]])
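The trilinear([C, Q, C * Q], ...) similarity used in the attention layer above is, conceptually, a dot product of one weight vector with [c; q; c * q], which splits into three terms. Below is a hedged NumPy sketch of that decomposition, with hypothetical weight names w_c, w_q and w_m (these names are not from the project):

import numpy as np

rng = np.random.default_rng(0)
N, c_len, q_len, d = 1, 4, 3, 5
C = rng.normal(size=(N, c_len, d))          # context vectors
Q = rng.normal(size=(N, q_len, d))          # question vectors
w_c, w_q, w_m = rng.normal(size=(3, d))     # hypothetical trilinear weights

S = ((C @ w_c)[:, :, None]                      # w_c . c_i          -> (N, c_len, 1)
     + (Q @ w_q)[:, None, :]                    # w_q . q_j          -> (N, 1, q_len)
     + np.einsum('bid,bjd,d->bij', C, Q, w_m))  # w_m . (c_i * q_j)  -> (N, c_len, q_len)
print(S.shape)                              # (1, 4, 3): one score per (context, question) pair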
Example #5
    def _attention(self):
        sim_mat = trilinear_similarity(self.enc_c, self.enc_q)
        c2q_attn_weights = tf.nn.softmax(
            mask_logits(sim_mat, mask=tf.expand_dims(self.q_mask, 1)), 2)
        q2c_attn_weights = tf.nn.softmax(
            mask_logits(sim_mat, mask=tf.expand_dims(self.c_mask, 2)), 1)
        self.c2q = tf.matmul(c2q_attn_weights, self.enc_q)
        self.q2c = tf.matmul(
            tf.matmul(c2q_attn_weights, q2c_attn_weights, transpose_b=True),
            self.enc_c)
        self.attn_out = tf.concat(
            [self.enc_c, self.c2q, self.enc_c * self.c2q, self.enc_c * self.q2c],
            axis=-1)
        self.attn_out = tf.nn.dropout(self.attn_out, 1 - self.dropout)
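The same context-to-query / query-to-context attention can be written out in NumPy. A minimal sketch, assuming mask_logits simply pushes masked positions to a large negative value before the softmax (a common implementation, re-stated here as a stand-in rather than the project's helper):

import numpy as np

def softmax(x, axis):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def mask_logits(logits, mask, very_negative=-1e30):
    return logits * mask + (1.0 - mask) * very_negative

rng = np.random.default_rng(0)
c_len, q_len, d = 5, 3, 4
enc_c = rng.normal(size=(1, c_len, d))              # context encodings
enc_q = rng.normal(size=(1, q_len, d))              # question encodings
c_mask = np.array([[1, 1, 1, 1, 0]], dtype=float)   # last context position is padding
q_mask = np.array([[1, 1, 0]], dtype=float)         # last question position is padding

sim = enc_c @ enc_q.transpose(0, 2, 1)                          # (1, c_len, q_len)
c2q_w = softmax(mask_logits(sim, q_mask[:, None, :]), axis=2)   # attend over the question
q2c_w = softmax(mask_logits(sim, c_mask[:, :, None]), axis=1)   # attend over the context
c2q = c2q_w @ enc_q                                             # (1, c_len, d)
q2c = (c2q_w @ q2c_w.transpose(0, 2, 1)) @ enc_c                # (1, c_len, d)
attn_out = np.concatenate([enc_c, c2q, enc_c * c2q, enc_c * q2c], axis=-1)
print(attn_out.shape)  # (1, 5, 16)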
Example #6
    def _decode(self):
        """
        Employs Pointer Network to get the the probs of each position
        to be the start or end of the predicted answer.
        Note that we concat the fuse_p_encodes for the passages in the same document.
        And since the encodes of queries in the same document is same, we select the first one.
        """
        N, PL, QL, CL, d, dc, nh = self._params()
        if self.use_position_attn:
            start_logits = tf.squeeze(
                conv(self._attention(tf.concat([self.enc[1], self.enc[2]],
                                               axis=-1),
                                     name="attn1"),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(self._attention(tf.concat([self.enc[1], self.enc[3]],
                                               axis=-1),
                                     name="attn2"),
                     1,
                     bias=False,
                     name="end_pointer"), -1)
        else:
            start_logits = tf.squeeze(
                conv(self.mul_anttion_p, 1, bias=False, name="start_pointer"),
                -1)
            end_logits = tf.squeeze(
                conv(self.mul_anttion_p, 1, bias=False, name="end_pointer"),
                -1)

        start_logits = tf.reshape(start_logits, [N, -1])
        self.sl = start_logits
        end_logits = tf.reshape(end_logits, [N, -1])
        self.el = end_logits
        self.logits = [
            mask_logits(start_logits, mask=tf.reshape(self.c_mask, [N, -1])),
            mask_logits(end_logits, mask=tf.reshape(self.c_mask, [N, -1]))
        ]

        self.logits1, self.logits2 = [l for l in self.logits]

        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))

        outer = tf.matrix_band_part(outer, 0, self.max_a_len)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
Example #7
File: model2.py  Project: Yaozeng/MRC
    def _fuse(self):

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            C = tf.tile(tf.expand_dims(self.c_embed_encoding, 2),
                        [1, 1, self.max_q_len, 1])
            Q = tf.tile(tf.expand_dims(self.q_embed_encoding, 1),
                        [1, self.max_p_len, 1, 1])
            S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, self.q_embed_encoding)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), self.c_embed_encoding)
            self.attention_outputs = [
                self.c_embed_encoding, self.c2q,
                self.c_embed_encoding * self.c2q,
                self.c_embed_encoding * self.q2c
            ]

        PL, QL, CL, d, dc, nh = self._params()
        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(self.attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]
            for i in range(3):
                if i % 2 == 0:
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=3,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))
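The Model_Encoder_Layer loop in the two _fuse variants above builds a list enc of four tensors: one input projection plus three passes of the same residual block (weights shared through reuse=True), so the decoder can later combine enc[1]/enc[2] for the start pointer and enc[1]/enc[3] for the end pointer. A toy NumPy sketch of that stacking pattern, with a single shared matrix standing in for the reused block (not the actual residual_block):

import numpy as np

rng = np.random.default_rng(0)
d = 8
W_proj = rng.normal(size=(4 * d, d)) * 0.1    # stands in for conv(..., name="input_projection")
W_block = rng.normal(size=(d, d)) * 0.1       # one shared matrix as a stand-in for residual_block

def block(x):                                 # same weights on every call, i.e. reuse=True
    return x + np.tanh(x @ W_block)           # residual connection

inputs = rng.normal(size=(2, 10, 4 * d))      # concatenated attention outputs
enc = [inputs @ W_proj]                       # enc[0]: input projection
for i in range(3):
    enc.append(block(enc[i]))                 # enc[1], enc[2], enc[3]
print(len(enc), enc[1].shape)                 # 4 (2, 10, 8)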
Example #8
    def forward(self):
        config = self.config
        N, PL, QL, CL, d, dc, nh = (config.batch_size if not self.demo else 1,
                                    self.c_maxlen, self.q_maxlen, config.char_limit,
                                    config.hidden, config.char_dim, config.num_heads)
        d_cell = tf.contrib.rnn.BasicLSTMCell(d,
                                              forget_bias=1.0,
                                              state_is_tuple=True)
        with tf.variable_scope("Input_Embedding_Layer"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)

            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            print "ch_emb before", ch_emb.shape[-1]
            print "qh_emb before", qh_emb.shape[-1]

            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])
            print "N", N, "PL", PL, "QL", QL
            print "ch_emb", ch_emb.shape
            print "qh_emb", qh_emb.shape
            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)
            print "c_emb high", c_emb.shape
            print "q_emb high", q_emb.shape

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c_tmp = residual_block(c_emb,
                                   num_blocks=1,
                                   num_conv_layers=4,
                                   kernel_size=7,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Encoder_Residual_Block",
                                   bias=False,
                                   dropout=self.dropout)

            # c_cell = tf.contrib.rnn.BasicLSTMCell(d, forget_bias=1.0, state_is_tuple=True)
            c = drnn(d_cell, c_tmp, d)

            q_tmp = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)
            # q_cell = tf.contrib.rnn.BasicLSTMCell(d, forget_bias=1.0, state_is_tuple=True)
            q = drnn(d_cell, q_tmp, d)
            print "embd enc output c", c.shape
            print "embd enc output q", q.shape
            # exit()

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]
            print "enc len", len(self.enc)
            # print self.ch_len.shape
            # print self.qh_len.shape
            # print self.c_len.shape
            # print self.q_len.shape

            # print ip_len.shape
            print "qh shape", self.qh.shape
            print "qh type", self.qh.dtype
            print "ip shape", inputs.shape
            print "ip type", inputs.dtype
            ip_len = tf.reshape(
                tf.reduce_sum(tf.cast(tf.cast(inputs, tf.bool), tf.float32),
                              axis=2), [-1])
            print "ip_len", ip_len.shape

            # fw0 = drnn(d_cell, self.enc[0], d)
            # f_cell = tf.contrib.rnn.BasicLSTMCell(fw0[2], forget_bias=1.0, state_is_tuple=True)
            # fw1 = drnn(d_cell, fw0, d)
            # fw2 = drnn(d_cell, fw1, d)
            # self.enc.append(fw0)
            # self.enc.append(fw1)
            # self.enc.append(fw2)
            # print "fw1 shape", fw1
            #
            # (fw0, bw0), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
            #                   initial_state_fw=None, initial_state_bw=None,
            #                   dtype=None, parallel_iterations=None,
            #                   swap_memory=False, time_major=False, scope=None):

            # bw_cell = tf.contrib.rnn.BasicLSTMCell(d, forget_bias=1.0, state_is_tuple=True)
            # g0 = bidirlstm(fw_cell, bw_cell, inputs, d)
            # g1 = bidirlstm(fw_cell, bw_cell, g0, d)
            # g2 = bidirlstm(fw_cell, bw_cell, g1, d)
            # fw0 = bidirlstm(d_cell, d_cell, inputs, d)
            # d_cell1 = tf.contrib.rnn.BasicLSTMCell(fw0[1], forget_bias=1.0, state_is_tuple=True)
            # fw1 = bidirlstm(d_cell1, d_cell1, fw0, d)
            # (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(d_cell, d_cell, self.enc[0], dtype='float', scope='g0')  # [N, M, JX, 2d]
            # g0 = tf.concat([fw_g0, bw_g0], 4)

            # (fw_g1, bw_g1) = bidirectional_dynamic_rnn(d_cell, d_cell, fw_g0, dtype='float', scope='g1')  # [N, M, JX, 2d]
            # print "fw_g0", fw_g0.shape
            # print "bw_g0", bw_g0.shape

            # print g0.shape
            # (fw_g1, bw_g1), _ = bidirlstm(d_cell, d_cell, g0, dtype='float', scope='g1')  # [N, M, JX, 2d]
            # g1 = tf.concat([fw_g1, bw_g1], 3)
            # flat_output_fw = nest.flatten(fw_g0)
            # flat_output_bw = nest.flatten(bw_g0)

            # flat_outputs = tuple(array_ops.concat(1, [fw, bw])
            #                     for fw, bw in zip(flat_output_fw, flat_output_bw))

            # outputs = nest.pack_sequence_as(structure=output_fw,
            # flat_sequence=flat_outputs)
            # print "output", outputs.shape

            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    drnn(
                        d_cell,
                        residual_block(self.enc[i],
                                       num_blocks=7,
                                       num_conv_layers=2,
                                       kernel_size=5,
                                       mask=self.c_mask,
                                       num_filters=d,
                                       num_heads=nh,
                                       seq_len=self.c_len,
                                       scope="Model_Encoder",
                                       bias=False,
                                       reuse=True if i > 0 else None,
                                       dropout=self.dropout), d))
            # print "enc[0] shape", self.enc[0].shape
            print "chalala"
            # exit()

        with tf.variable_scope("Output_Layer"):
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)
            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = [l for l in self.logits]

            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    if v:
                        self.assign_vars.append(tf.assign(var, v))
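The Output_Layer above trains the two pointers with independent softmax cross-entropy losses against one-hot start/end labels and averages their sum. A small NumPy sketch of that loss, written from the formula rather than taken from the project:

import numpy as np

def softmax_cross_entropy(logits, one_hot_labels):
    # numerically stable log-softmax followed by the cross-entropy sum
    logits = logits - logits.max(axis=-1, keepdims=True)
    log_probs = logits - np.log(np.exp(logits).sum(axis=-1, keepdims=True))
    return -(one_hot_labels * log_probs).sum(axis=-1)

start_logits = np.array([[2.0, 0.5, -1.0, 0.0]])
end_logits = np.array([[0.0, 1.5, 3.0, -2.0]])
y1 = np.array([[1.0, 0.0, 0.0, 0.0]])   # true start position: 0
y2 = np.array([[0.0, 0.0, 1.0, 0.0]])   # true end position: 2
loss = (softmax_cross_entropy(start_logits, y1) +
        softmax_cross_entropy(end_logits, y2)).mean()
print(float(loss))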
Example #9
    def forward(self):
        config = self.config
        N = config.batch_size if not self.demo else 1
        PL = self.c_maxlen
        QL = self.q_maxlen
        XL = self.x_maxlen

        # DEBUG
        self.debug_ops.extend([PL, QL, XL])

        CL = config.char_limit  # 16
        d = config.hidden       # 96
        dc = config.char_dim    # 64
        nh = config.num_heads   # 1

        with tf.variable_scope("Input_Embedding_Layer"):
            '''
                self.ch : (N, c_maxlen, 16)
                self.qh : (N, q_maxlen, 16)
                self.xh : (N, x_maxlen, 16)
            '''
            ######################################
            #get elmo embeddings
            ######################################
            datadir = "/data/elmo_experiment_20180906/20180906_model"
            vocab_file = os.path.join(datadir, 'vocab-2016-09-10.txt')
            options_file = os.path.join(datadir, 'options.json')
            weight_file = os.path.join(datadir, 'weights.hdf5')
            print(vocab_file)
            print(options_file)
            print(weight_file)
            
            # Create a Batcher to map text to character ids.
            batcher = Batcher(vocab_file, 50)
            
            # Input placeholders to the biLM.
            #context_character_ids = tf.placeholder('int32', shape=(None, None, 50))
            #question_character_ids = tf.placeholder('int32', shape=(None, None, 50))
            
            # Build the biLM graph.
            bilm = BidirectionalLanguageModel(options_file, weight_file)
            
            # Get ops to compute the LM embeddings.
            print(self.c)
            print(self.c.shape)
            #print(self.ch)
            #print(self.ch.shape)
            print(self.c_elmo)
            print(self.c_elmo.shape)
            print(self.q_elmo)
            print(self.q_elmo.shape)
            print(self.x_elmo)
            print(self.x_elmo.shape)
             
            context_embeddings_op = bilm(self.c_elmo)
            question_embeddings_op = bilm(self.q_elmo)
            candidate_embeddings_op = bilm(self.x_elmo)
            
            # Get an op to compute ELMo (weighted average of the internal biLM layers)
            # Our SQuAD model includes ELMo at both the input and output layers
            # of the task GRU, so we need 4x ELMo representations for the question
            # and context at each of the input and output.
            # We use the same ELMo weights for both the question and context
            # at each of the input and output.
            #context elmo
            elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)
            with tf.variable_scope('', reuse=True):
                # the reuse=True scope reuses weights from the context for the question
                elmo_question_input = weight_layers(
                    'input', question_embeddings_op, l2_coef=0.0
                )
                elmo_candidate_input = weight_layers(
                    'input', candidate_embeddings_op, l2_coef=0.0
                )
            
            elmo_context_output = weight_layers(
                'output', context_embeddings_op, l2_coef=0.0
            )
            with tf.variable_scope('', reuse=True):
                # the reuse=True scope reuses weights from the context for the question
                elmo_question_output = weight_layers(
                    'output', question_embeddings_op, l2_coef=0.0
                )
                elmo_candidate_output = weight_layers(
                    'output', candidate_embeddings_op, l2_coef=0.0
                )
            
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc]) #(N*PL,16,64)
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc]) #(N*QL,16,64)
            xh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.xh), [N * XL, CL, dc]) #(N*XL,16,64)

            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)
            xh_emb = tf.nn.dropout(xh_emb, 1.0 - 0.5 * self.dropout)

            # BiDAF style conv-highway encoder: conv over chars in each word in a batch of passages
            ch_emb = conv(ch_emb, d, bias = True, activation = tf.nn.relu, kernel_size = 5,
                          name = "char_conv", reuse = None) # (N*c_maxlen, 16-5+1, 96)
            qh_emb = conv(qh_emb, d, bias = True, activation = tf.nn.relu, kernel_size = 5,
                          name = "char_conv", reuse = True) # (N*q_maxlen, 16-5+1, 96)
            xh_emb = conv(xh_emb, d, bias = True, activation = tf.nn.relu, kernel_size = 5,
                          name="char_conv", reuse=True)  # (N*x_maxlen, 16-5+1, 96)

            # Max Pooling
            ch_emb = tf.reduce_max(ch_emb, axis = 1) # (N*c_maxlen, 96)
            qh_emb = tf.reduce_max(qh_emb, axis = 1) # (N*q_maxlen, 96)
            xh_emb = tf.reduce_max(xh_emb, axis = 1) # (N*x_maxlen, 96)

            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]]) # (N, c_maxlen, 96)
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]]) # (N, q_maxlen, 96)
            xh_emb = tf.reshape(xh_emb, [N, XL, xh_emb.shape[-1]]) # (N, x_maxlen, 96)

            '''
                self.c : (N, c_maxlen)
                self.q : (N, q_maxlen)
                self.x : (N, x_maxlen)
            '''
            #print(self.c)
            #print(self.q)
            #print(self.x)
            
            c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout)#(N,c_maxlen,300)
            q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout)#(N,q_maxlen,300)
            x_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.x), 1.0 - self.dropout)#(N,x_maxlen,300)

            #c_emb_elmo = 
            #q_emb_elmo = 
            #x_emb_elmo = 

            c_emb = tf.concat([c_emb, ch_emb], axis=2) # (N, c_maxlen, 396)
            q_emb = tf.concat([q_emb, qh_emb], axis=2) # (N, q_maxlen, 396)
            x_emb = tf.concat([x_emb, xh_emb], axis=2) # (N, x_maxlen, 396)
            
            print(c_emb)
            print(c_emb.shape)
            
            c_emb = tf.concat([elmo_context_output['weighted_op'], c_emb], axis=2) # (N, c_maxlen, 1024 + 396)
            q_emb = tf.concat([elmo_question_output['weighted_op'], q_emb], axis=2) # (N, q_maxlen, 1024 + 396)
            x_emb = tf.concat([elmo_candidate_output['weighted_op'], x_emb], axis=2) # (N, x_maxlen, 1024 + 396)
            
            print(c_emb)
            print(c_emb.shape)

            c_emb = highway(c_emb, size = d, scope = "highway", dropout = self.dropout, reuse = None)#(N,c_maxlen,96)
            q_emb = highway(q_emb, size = d, scope = "highway", dropout = self.dropout, reuse = True)#(N,q_maxlen,96)
            x_emb = highway(x_emb, size = d, scope = "highway", dropout = self.dropout, reuse = True)#(N,x_maxlen,96)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            '''
                -> positional encoding 
                -> layer_normalization 
                -> depth-wise separable convolution 
                -> self attention 
                -> feed forward network
                In the paper: The total number of encoder blocks is 1
            '''
            # (N, c_maxlen, 96)
            c = residual_block(c_emb,
                num_blocks = 1,
                num_conv_layers = 4,
                kernel_size = 7,
                mask = self.c_mask,
                num_filters = d,
                num_heads = nh,
                seq_len = self.c_len,
                scope = "Encoder_Residual_Block",
                bias = False,
                dropout = self.dropout)
            # (N, q_maxlen, 96)
            q = residual_block(q_emb,
                num_blocks = 1,
                num_conv_layers = 4,
                kernel_size = 7,
                mask = self.q_mask,
                num_filters = d,
                num_heads = nh,
                seq_len = self.q_len,
                scope = "Encoder_Residual_Block",
                reuse = True, # Share the weights between passage and question
                bias = False,
                dropout = self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            '''
                tf.tile(input, multiples, name=None): creates a new tensor by replicating input multiples times. 
                    The output tensor's i'th dimension has input.dims(i) * multiples[i] elements, 
                    and the values of input are replicated multiples[i] times along the 'i'th dimension.
                Paper: The layer parameters are the same as the Embedding Encoder Layer 
                       except that convolution layer number is 2 within a block 
                       and the total number of blocks is 7
            '''
            '''
                c:        (N, c_maxlen, d)
                q:        (N, q_maxlen, d)
                ch_emb:   (N, c_maxlen, d)
                qh_emb:   (N, q_maxlen, d)
                C:        (N, c_maxlen, q_maxlen, d)
                Q:        (N, c_maxlen, q_maxlen, d)
                S:        (N, c_maxlen, q_maxlen)
                mask_q:   (N, 1, q_maxlen)
                mask_c:   (N, c_maxlen, 1)
                S_:       (N, c_maxlen, q_maxlen)
                S_T:      (N, q_maxlen, c_maxlen)
                self.c2q: (N, c_maxlen, d) = tf.matmul(S_, q)
                self.q2c: (N, c_maxlen, d) = tf.matmul(tf.matmul(S_, S_T), c)
            '''
            # change from commit f0c79cc93dc1dfdad2bc8abb712a53d078814a56 by Min on 27 Apr 18
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)

            # optimization from jasonwbw
            S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                                  input_keep_prob=1.0 - self.dropout)

            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask = mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask = mask_c), axis = 1),(0,2,1))

            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)

            # change from commit f0c79cc93dc1dfdad2bc8abb712a53d078814a56 by Min on 27 Apr 18
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]
            # if config.q2c:
            #     attention_outputs.append(c * self.q2c)

        # with tf.variable_scope("Model_Encoder_Layer"):
        #     inputs = tf.concat(attention_outputs, axis = -1)
        #
        #     # same as a dxd MLP layer
        #     self.enc = [conv(inputs, d, name = "input_projection")] # d=hidden=96
        #
        #     for i in range(3):
        #         if i % 2 == 0: # dropout every 2 blocks
        #             self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
        #         self.enc.append(
        #             residual_block(self.enc[i],
        #                 num_blocks = 7,
        #                 num_conv_layers = 2,
        #                 kernel_size = 5,
        #                 mask = self.c_mask,
        #                 num_filters = d,
        #                 num_heads = nh,
        #                 seq_len = self.c_len,
        #                 scope = "Model_Encoder",
        #                 bias = False,
        #                 reuse = True if i > 0 else None,
        #                 dropout = self.dropout)
        #             )

            # DEBUG
            # self.debug_ops.append(inputs)
            # self.debug_ops.extend(self.enc)

        with tf.variable_scope("Output_Layer"):
            '''
                broadcasting:dimensions with size 1 are stretched or "copied" to match the other
            '''
            '''
                x_emb:              (N, x_maxlen, d)
                inputs:             (N, c_maxlen, 4*d)
                mask_x:             (N, x_maxlen, 1)
                c_proj:             (N, c_maxlen, d)
                S_xc/S_xc_:         (N, x_maxlen, c_maxlen)
                x2c:                (N, x_maxlen, d)
                xp_exp:             (N, x_maxlen, c_maxlen, 1)
                c_proj_exp:         (N, 1, c_maxlen, d)
                cand_context:       (N, x_maxlen, c_maxlen, d)
                cand_context_pool:  (N, x_maxlen, d)
                cand_condense:      (N, x_maxlen, d*2)
                self.cand_condense: (N, x_maxlen, d)
                self.cand_logits:   (N, x_maxlen, 1)
            '''
            inputs = tf.concat(attention_outputs, axis = -1)

            # masking candidate embedding
            mask_x = tf.expand_dims(self.x_mask, 2)
            c_proj = conv(inputs, d, name="context_projection")

            S_xc = optimized_trilinear_for_attention([x_emb, c_proj], self.x_maxlen, self.c_maxlen,
                                                  input_keep_prob=1.0 - self.dropout)
            S_xc_ = tf.nn.softmax(mask_logits(S_xc, mask = mask_x))

            self.x2c = tf.matmul(S_xc_, c_proj)

            self.cand_condense = self.x2c

            if self.config.cand_condense_vector:
                xp_exp = tf.expand_dims(self.xp, axis=-1)
                c_proj_exp = tf.expand_dims(c_proj, axis=1)
                cand_context = tf.multiply(c_proj_exp, xp_exp)

                if self.config.cand_condense_conv:
                    cand_context = tf.reshape(cand_context, [N*XL, PL, d])
                    cand_context = conv(cand_context, d, bias=True, activation=tf.nn.relu,
                                        kernel_size=3, name="candidate_from_context")
                    cand_context = tf.reshape(cand_context, [N, XL, -1, d])

                if self.config.cand_condense_pool:
                    cand_context_pool = tf.reduce_max(cand_context, axis=-2)
                else:
                    cand_context_pool = tf.reduce_mean(cand_context, axis=-2)

                cand_condense = tf.concat([self.x2c, cand_context_pool], axis = -1)
                self.cand_condense = conv(cand_condense, d, name="candidate_projection")

                if self.config.cand_fuse_vector:
                    raise NotImplementedError

                # DEBUG
                self.debug_ops.extend([xp_exp, c_proj_exp, cand_context, cand_context_pool,
                                       cand_condense, self.cand_condense])

            if not config.max_margin:
                cand_logits = tf.squeeze(conv(self.cand_condense, 1, bias=False, name="candidate_logits_1"), -1)
                self.cand_logits = mask_logits(cand_logits, mask=self.x_mask)
                loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.cand_logits, labels=self.yx)
                # DEBUG
                self.debug_ops.extend([loss, x_emb, c_proj, S_xc, S_xc_, self.x2c,
                                       self.x_mask, self.cand_logits, self.yx])
            else:
                cand_logits = conv(self.cand_condense, 1, bias=False, name="candidate_logits_1")
                cand_logits = tf.tanh(cand_logits)
                cand_logits = tf.squeeze(conv(cand_logits, 1, bias=False, name="candidate_logits_2"), -1)
                self.cand_logits = tf.sigmoid(cand_logits)
                pos = tf.multiply(self.cand_logits, self.yx)
                pos = tf.reduce_max(pos, axis=-1)
                negs = tf.multiply(self.cand_logits, self.yx_inv)
                neg = tf.reduce_max(negs, axis=-1)
                loss = tf.maximum(tf.add(tf.subtract(neg, pos), config.margin), 0.0)
                # DEBUG
                self.debug_ops.extend([loss, x_emb, c_proj, S_xc, S_xc_, self.x2c,
                                       self.x_mask, self.cand_logits, self.yx,
                                       pos, negs, neg, self.yx, self.yx_inv])

            self.loss = tf.reduce_mean(loss)

        # with tf.variable_scope("Output_Layer"):
        #     '''
        #         tf.matrix_band_part: Copy a tensor setting everything outside a central band
        #                              in each innermost matrix to zero.
        #         self.enc[i]:  (N, c_maxlen, d)
        #         start_logits: (N, c_maxlen)
        #         end_logits:   (N, c_maxlen)
        #         logits1:      (N, c_maxlen)
        #         logits2:      (N, c_maxlen)
        #         outer:        (N, c_maxlen, c_maxlen)
        #         self.c_mask:  (N, c_maxlen)
        #         yp1, yp2, losses, losses2: (N,)
        #     '''
        #
        #     # map vectors to scalars
        #     start_logits = tf.squeeze(conv(tf.concat([self.enc[1], self.enc[2]],axis = -1),1,
        #                                    bias = False, name = "start_pointer"),-1)
        #     end_logits = tf.squeeze(conv(tf.concat([self.enc[1], self.enc[3]],axis = -1),1,
        #                                  bias = False, name = "end_pointer"), -1)
        #     self.logits = [mask_logits(start_logits, mask = self.c_mask), mask_logits(end_logits, mask = self.c_mask)]
        #
        #     logits1, logits2 = [l for l in self.logits]
        #
        #     losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1)
        #     losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2)
        #     self.loss = tf.reduce_mean(losses + losses2)
        #
        #     # find max-score span
        #     outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
        #                       tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        #     # change from commit f0c79cc93dc1dfdad2bc8abb712a53d078814a56 by Min on 27 Apr 18
        #     outer = tf.matrix_band_part(outer, 0, config.ans_limit)
        #     self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        #     self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
        #
        #     # DEBUG
        #     self.debug_ops.extend([start_logits, end_logits, logits1, logits2,
        #                            outer, self.yp1, self.yp2, losses, losses2, self.loss])

        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                # change from commit f0c79cc93dc1dfdad2bc8abb712a53d078814a56 by Min on 27 Apr 18
                self.assign_vars = []
                # self.shadow_vars = []
                # self.global_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    if v:
                        self.assign_vars.append(tf.assign(var, v))
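The max_margin branch above ranks answer candidates with a hinge loss: the best-scored gold candidate has to beat the best-scored non-gold candidate by config.margin. A minimal NumPy sketch of that computation, with illustrative values only:

import numpy as np

cand_scores = np.array([[0.2, 0.8, 0.5, 0.1]])   # sigmoid candidate scores (self.cand_logits)
yx = np.array([[0.0, 1.0, 0.0, 0.0]])            # gold-candidate indicator
yx_inv = 1.0 - yx                                # non-gold indicator
margin = 0.5

pos = (cand_scores * yx).max(axis=-1)            # best gold score
neg = (cand_scores * yx_inv).max(axis=-1)        # best non-gold score
loss = np.maximum(neg - pos + margin, 0.0).mean()
print(round(float(loss), 2))   # 0.2: the best non-gold score is within the margin of the best gold score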
Example #10
    def forward(self):
        config = self.config
        N, PL, QL, CL, d, dc, nh, dw = (
            config.test_batch_size if self.loop_function else config.batch_size,
            self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden,
            config.char_dim, config.num_heads, config.glove_dim)

        with tf.variable_scope("Input_Embedding_Layer"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)

            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=2,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=2,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=2,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope("Decoder_Layer"):
            memory = tf.concat([self.enc[1], self.enc[2], self.enc[3]],
                               axis=-1)
            oups = tf.split(self.a, [1] * self.a_maxlen, 1)
            h = tf.tanh(
                _linear(tf.reduce_mean(memory, axis=1),
                        output_size=d,
                        bias=False,
                        scope="h_initial"))
            c = tf.tanh(
                _linear(tf.reduce_mean(memory, axis=1),
                        output_size=d,
                        bias=False,
                        scope="c_initial"))
            state = (c, h)
            outputs = []
            prev = None
            prev_probs = [0.0]
            symbols = []
            for i, inp in enumerate(oups):
                einp = tf.reshape(tf.nn.embedding_lookup(self.word_mat, inp),
                                  [N, dw])
                if i > 0:
                    tf.get_variable_scope().reuse_variables()

                if self.loop_function is not None and prev is not None:
                    with tf.variable_scope("loop_function", reuse=True):
                        einp, prev_probs, index, prev_symbol = self.loop_function(
                            prev, prev_probs, self.beam_size, i)
                        h = tf.gather(h, index)  # update prev state
                        state = tuple(tf.gather(s, index)
                                      for s in state)  # update prev state
                        for j, symbol in enumerate(symbols):
                            symbols[j] = tf.gather(
                                symbol, index)  # update prev symbols
                        for j, output in enumerate(outputs):
                            outputs[j] = tf.gather(
                                output, index)  # update prev outputs
                        symbols.append(prev_symbol)

                attn = tf.reshape(
                    multihead_attention(tf.expand_dims(h, 1),
                                        units=d,
                                        num_heads=nh,
                                        memory=memory,
                                        mask=self.c_mask,
                                        bias=False), [-1, nh * d])

                cinp = tf.concat([einp, attn], 1)
                h, state = self.cell(cinp, state)

                with tf.variable_scope("AttnOutputProjection"):
                    output = _linear([h] + [cinp],
                                     output_size=dw * 2,
                                     bias=False,
                                     scope="output")
                    output = tf.reshape(output, [-1, dw, 2])
                    output = tf.reduce_max(output, 2)  # maxout
                    outputs.append(output)

                if self.loop_function is not None:
                    prev = output

            if self.loop_function is not None:
                # process the last symbol
                einp, prev_probs, index, prev_symbol = self.loop_function(
                    prev, prev_probs, self.beam_size, i + 1)
                for j, symbol in enumerate(symbols):
                    symbols[j] = tf.gather(symbol,
                                           index)  # update prev symbols
                for j, output in enumerate(outputs):
                    outputs[j] = tf.gather(output,
                                           index)  # update prev outputs
                symbols.append(prev_symbol)

                # output the final best result of beam search
                for k, symbol in enumerate(symbols):
                    symbols[k] = tf.gather(symbol, 0)
                for k, output in enumerate(outputs):
                    outputs[k] = tf.expand_dims(tf.gather(output, 0), 0)

            self.gen_loss = self._compute_loss(outputs, oups, N)
            self.symbols = symbols

        with tf.variable_scope("Output_Layer"):
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)
            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = [l for l in self.logits]

            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        self.loss = self.gen_loss

        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    if v:
                        self.assign_vars.append(tf.assign(var, v))
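The "AttnOutputProjection" step in the decoder above is a maxout layer: a linear map produces 2*dw units, which are reshaped into dw pairs and max-pooled. A short NumPy sketch of just that step, with a random matrix standing in for the _linear weights:

import numpy as np

rng = np.random.default_rng(0)
batch, in_dim, dw = 2, 6, 4
x = rng.normal(size=(batch, in_dim))         # [h] + [cinp] concatenated, as a stand-in
W = rng.normal(size=(in_dim, dw * 2))        # stands in for the _linear "output" weights

pre = x @ W                                  # (batch, 2 * dw)
out = pre.reshape(batch, dw, 2).max(axis=2)  # maxout: keep the larger unit of each pair
print(out.shape)                             # (2, 4)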
Example #11
File: model.py  Project: wanghm92/QANet
    def forward(self):
        config = self.config
        N = config.batch_size if not self.demo else 1
        PL = self.c_maxlen
        QL = self.q_maxlen
        CL = config.char_limit  # 16
        d = config.hidden  # 96
        dc = config.char_dim  # 64
        nh = config.num_heads  # 1

        with tf.variable_scope("Input_Embedding_Layer"):
            '''
                self.ch : (N, c_maxlen, 16)
                self.qh : (N, q_maxlen, 16)
            '''
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])  # (N*c_maxlen, 16, 64)
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])  # (N*q_maxlen, 16, 64)
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # BiDAF style conv-highway encoder: conv over chars in each word in a batch of passages
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)  # (N*c_maxlen, 16-5+1, 96)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)  # (N*q_maxlen, 16-5+1, 96)

            ch_emb = tf.reduce_max(ch_emb, axis=1)  # (N*c_maxlen, 96)
            qh_emb = tf.reduce_max(qh_emb, axis=1)  # (N*q_maxlen, 96)

            ch_emb = tf.reshape(ch_emb,
                                [N, PL, ch_emb.shape[-1]])  # (N, c_maxlen, 96)
            qh_emb = tf.reshape(qh_emb,
                                [N, QL, ch_emb.shape[-1]])  # (N, q_maxlen, 96)
            '''
                self.c : (N, c_maxlen)
                self.q : (N, q_maxlen)
            '''
            c_emb = tf.nn.dropout(tf.nn.embedding_lookup(
                self.word_mat, self.c),
                                  1.0 - self.dropout)  # (N, c_maxlen, 300)
            q_emb = tf.nn.dropout(tf.nn.embedding_lookup(
                self.word_mat, self.q),
                                  1.0 - self.dropout)  # (N, q_maxlen, 300)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)  # (N, c_maxlen, 396)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)  # (N, q_maxlen, 396)

            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)  # (N, c_maxlen, 96)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)  # (N, q_maxlen, 96)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            '''
                -> positional encoding 
                -> layer_normalization 
                -> depth-wise separable convolution 
                -> self attention 
                -> feed forward network
                In the paper: The total number of encoder blocks is 1
            '''
            # (N, c_maxlen, 96)
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            # (N, q_maxlen, 96)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            '''
                tf.tile(input, multiples, name=None): creates a new tensor by replicating input multiples times. 
                    The output tensor's i'th dimension has input.dims(i) * multiples[i] elements, 
                    and the values of input are replicated multiples[i] times along the 'i'th dimension.
                Paper (on the Model Encoder Layer): the layer parameters are the same as in the
                       Embedding Encoder Layer, except that the number of convolution layers
                       within a block is 2 and the total number of blocks is 7
            '''
            '''
                c:        (N, c_maxlen, d)
                q:        (N, q_maxlen, d)
                ch_emb:   (N, c_maxlen, d)
                qh_emb:   (N, q_maxlen, d)
                C:        (N, c_maxlen, q_maxlen, d)
                Q:        (N, c_maxlen, q_maxlen, d)
                S:        (N, c_maxlen, q_maxlen)
                mask_q:   (N, 1, q_maxlen)
                mask_c:   (N, c_maxlen, 1)
                S_:       (N, c_maxlen, q_maxlen)
                S_T:      (N, q_maxlen, c_maxlen)
                self.c2q: (N, c_maxlen, d) = tf.matmul(S_, q)
                self.q2c: (N, c_maxlen, d) = tf.matmul(tf.matmul(S_, S_T), c)
            '''
            C = tf.tile(tf.expand_dims(c, 2), [1, 1, self.q_maxlen, 1])
            Q = tf.tile(tf.expand_dims(q, 1), [1, self.c_maxlen, 1, 1])
            S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)

            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))

            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)

            attention_outputs = [c, self.c2q, c * self.c2q]
            if config.q2c:
                attention_outputs.append(c * self.q2c)

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d,
                             name="input_projection")]  # d=hidden=96
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope("Output_Layer"):
            '''
                tf.matrix_band_part: Copy a tensor setting everything outside a central band 
                                     in each innermost matrix to zero.
                self.enc[i]:  (N, c_maxlen, d)
                start_logits: (N, c_maxlen)
                end_logits:   (N, c_maxlen)
                logits1:      (N, c_maxlen)
                logits2:      (N, c_maxlen)
                outer:        (N, c_maxlen, c_maxlen)
                yp1, yp2, losses, losses2: (N,)  
            '''
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)
            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = [l for l in self.logits]

            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

            # find max-score span
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, 15)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

            #DEBUG
            self.debug_ops.extend([
                self.enc[1], start_logits, end_logits, logits1, logits2, outer,
                self.yp1, self.yp2, losses, losses2, self.loss
            ])

        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                self.shadow_vars = []
                self.global_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    if v:
                        self.shadow_vars.append(v)
                        self.global_vars.append(var)
                self.assign_vars = []
                for g, v in zip(self.global_vars, self.shadow_vars):
                    self.assign_vars.append(tf.assign(g, v))
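
The Output_Layer above turns the start/end distributions into a span by taking their outer product, zeroing everything outside the band where end >= start and end - start <= 15 (tf.matrix_band_part(outer, 0, 15)), and then taking the argmax of the row and column maxima. A small self-contained NumPy sketch of the same trick, with toy numbers:

import numpy as np

def select_span(p_start, p_end, max_len=15):
    # joint score of every (start, end) pair
    outer = np.outer(p_start, p_end)
    L = outer.shape[0]
    rows, cols = np.indices((L, L))
    # mimic tf.matrix_band_part(outer, 0, max_len): keep end >= start and
    # end - start <= max_len, zero everything else
    outer = np.where((cols >= rows) & (cols - rows <= max_len), outer, 0.0)
    start = int(np.argmax(outer.max(axis=1)))
    end = int(np.argmax(outer.max(axis=0)))
    return start, end

p_start = np.array([0.1, 0.6, 0.2, 0.1])
p_end = np.array([0.1, 0.1, 0.7, 0.1])
print(select_span(p_start, p_end))  # (1, 2)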
示例#12
0
    def forward(self):
        config = self.config
        N, PL, QL, CL, d, dc, nh = config.batch_size if not self.demo else 1, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.num_heads

        with tf.variable_scope("Input_Embedding_Layer"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])

            # shape = (?, 16, 64)
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder
            # d(hidden_size) = 96
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)
            # shape = (?, 12, 96)

            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)
            # shape = (?, 96)

            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])
            # shape = (32, ?, 96)

            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope("Output_Layer"):
            # self.enc[1] = (32, ?, 96)
            conv1 = conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                         1,
                         bias=False,
                         name="start_pointer")
            # tf.shape(conv1) = (32, ?, 1)
            start_logits = tf.squeeze(conv1, -1)
            # tf.shape(start_logits) = (32, ?)
            conv2 = conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                         1,
                         bias=False,
                         name="end_pointer")
            end_logits = tf.squeeze(conv2, -1)

            # mask out the padded context positions before the softmax
            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = [l for l in self.logits]

            # shape = (32, ?) -> because the context length is variable
            # matmul([32, ?, 1] x [32, 1, ?])
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            # outer = (32, ?, ?)
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)

            reduced1 = tf.reduce_max(outer, axis=2)
            reduced2 = tf.reduce_max(outer, axis=1)
            # tf.shape(reduced) = (32, ?)

            # ###############################################
            # pad the variable-length axis (axis 1) up to MAX_PL
            paddings = [[0, 0], [0, self.MAX_PL - tf.shape(reduced1)[1]]]

            reduced1 = tf.pad(reduced1, paddings, "CONSTANT")
            reduced2 = tf.pad(reduced2, paddings, "CONSTANT")

            reduced1 = tf.slice(reduced1, [0, 0], [N, self.MAX_PL])
            reduced2 = tf.slice(reduced2, [0, 0], [N, self.MAX_PL])
            # tf.shape(reduced) = (32, ?)

            # no answer flag: (no_answer, answer_exist)
            # TODO add additional layer
            # TODO check the dimension match between reduced and the weights
            na_flag1 = tf.cast(
                tf.argmax(tf.matmul(reduced1, self.weights1), axis=1),
                tf.float32)
            na_flag2 = tf.cast(
                tf.argmax(tf.matmul(reduced2, self.weights2), axis=1),
                tf.float32)
            # Tensor("Output_Layer/ArgMax:0", shape=(32, ?), dtype=int64)

            self.yp1 = tf.argmax(reduced1, axis=1)
            self.yp2 = tf.argmax(reduced2, axis=1)

            print(tf.reduce_sum(reduced1, axis=1))
            print(tf.multiply(na_flag1, tf.reduce_sum(reduced1, axis=1)))
            print(
                tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1,
                                                           labels=self.y1))

            # no_answer
            losses = tf.where(
                self.no_answer > 0,
                tf.multiply(na_flag1, tf.reduce_sum(reduced1, axis=1)),
                tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1,
                                                           labels=self.y1))

            losses2 = tf.where(
                self.no_answer > 0,
                tf.multiply(na_flag2, tf.reduce_sum(reduced2, axis=1)),
                tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits2,
                                                           labels=self.y2))

            #################################################
            self.loss = tf.reduce_mean(losses + losses2)

        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    if v:
                        self.assign_vars.append(tf.assign(var, v))
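
The pad-and-slice step above (tf.pad followed by tf.slice) is just a way of forcing the variable-width reduced1/reduced2 tensors to a fixed width of MAX_PL columns. A toy NumPy equivalent, assuming a (batch, L) array with L <= MAX_PL:

import numpy as np

def pad_to_fixed_width(x, max_pl):
    # zero-pad axis 1 up to max_pl, then cut back to exactly max_pl columns
    pad = max(0, max_pl - x.shape[1])
    x = np.pad(x, [(0, 0), (0, pad)], mode="constant")
    return x[:, :max_pl]

print(pad_to_fixed_width(np.random.rand(2, 5), 8).shape)  # (2, 8)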
示例#13
0
    def forward(self, trainable):
        config = self.config
        N, PL, QL, CL, d, dc, nh = config.batch_size, self.c_maxlen, self.q_maxlen, \
                                   config.char_limit, config.hidden, config.char_dim, \
                                   config.num_heads

        with tf.variable_scope("Input_Embedding_Layer"):
            ch_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.ch),
                [N * PL, CL, dc])  # [words per sentence, chars per word, char embedding dim]
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])

            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # BiDAF-style conv-highway encoder: the following produces the convolved character features
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)  #[batch,feature_len,d]
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)

            ch_emb = tf.reduce_max(
                ch_emb, axis=1)  # take the max feature over the time dimension; k-max pooling could be tried here instead of max pooling
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            ch_emb = tf.reshape(ch_emb,
                                [N, PL, ch_emb.shape[-1]])  # reshape back to the sentence-length dimension
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            c_emb = tf.concat(
                [c_emb, ch_emb],
                axis=2)  # concatenate word and character features: [batch, sequence_len, combined dim]
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            c_emb = highway(
                c_emb,
                size=d,
                scope="highway",
                dropout=self.dropout,
                reuse=None)  # the highway network filters the features and projects them down to 75 dims: [batch, seq_len, 75]
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)
        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]
        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]

            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))
        with tf.variable_scope('question_rnn'):
            self.gru = tf.contrib.rnn.GRUCell(d)
            initstate = self.gru.zero_state(batch_size=N, dtype=tf.float32)
            output, state = tf.nn.dynamic_rnn(self.gru,
                                              q,
                                              initial_state=initstate)
            # self.qandc=tf.concat([self.q2c,self.c2q],axis=2)
            # self.qandc=dense(self.qandc,d)
            # output,state=tf.nn.dynamic_rnn(self.gru,self.qandc,initial_state=initstate)#(32,?,75)

            state = tf.expand_dims(state, axis=2)
            weight1 = tf.matmul(self.enc[1], state)
            weight2 = tf.matmul(self.enc[2], state)
            weight3 = tf.matmul(self.enc[3], state)

            weight_enc1 = tf.multiply(self.enc[1], weight1)
            weight_enc1 = tf.reduce_sum(weight_enc1, axis=1)

            weight_enc2 = tf.multiply(self.enc[2], weight2)
            weight_enc2 = tf.reduce_sum(weight_enc2, axis=1)

            weight_enc3 = tf.multiply(self.enc[3], weight3)
            weight_enc3 = tf.reduce_sum(weight_enc3, axis=1)

        with tf.variable_scope("Output_Layer"):
            print(weight_enc1, "ggggggggggggggggg")
            inputs_shape = weight_enc1.get_shape().as_list()
            W = tf.get_variable(
                "W",
                shape=[inputs_shape[-1], 3],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[3]), name="b")
            self.l2_loss += tf.nn.l2_loss(W)
            self.l2_loss += tf.nn.l2_loss(b)
            self.scores1 = tf.nn.xw_plus_b(weight_enc1, W, b, name="scores")
            self.scores2 = tf.nn.xw_plus_b(weight_enc2, W, b, name="scores")
            self.scores3 = tf.nn.xw_plus_b(weight_enc3, W, b, name="scores")
            self.scores = (self.scores1 + self.scores2 + self.scores3) / 3.0
            print(self.scores)
            self.predictions = tf.argmax(self.scores, 1, name="predictions")
            if trainable:
                with tf.name_scope("loss"):
                    print(self.scores, self.input_y, "llllllllllllllll")
                    losses = tf.nn.softmax_cross_entropy_with_logits(
                        logits=self.scores, labels=self.input_y)
                    self.loss = tf.reduce_mean(
                        losses) + self.l2_reg_lambda * self.l2_loss
                    # Accuracy
                with tf.name_scope("accuracy"):
                    correct_predictions = tf.equal(self.predictions,
                                                   tf.argmax(self.input_y, 1))
                    self.accuracy = tf.reduce_mean(tf.cast(
                        correct_predictions, "float"),
                                                   name="accuracy")
                # losses2 = tf.nn.softmax_cross_entropy_with_logits(
                #     logits=logits2, labels=self.y2)
                if config.decay is not None:
                    self.var_ema = tf.train.ExponentialMovingAverage(
                        config.decay)
                    ema_op = self.var_ema.apply(tf.trainable_variables())
                    with tf.control_dependencies([ema_op]):
                        self.loss = tf.identity(self.loss)

                        self.assign_vars = []
                        for var in tf.global_variables():
                            v = self.var_ema.average(var)
                            if v:
                                self.assign_vars.append(tf.assign(var, v))
                self.lr = tf.minimum(
                    config.init_lr, 0.001 / tf.log(999.) *
                    tf.log(tf.cast(self.global_step, tf.float32) + 1))
                self.opt = tf.train.AdamOptimizer(learning_rate=self.lr,
                                                  beta1=0.8,
                                                  beta2=0.999,
                                                  epsilon=1e-7)
                grads = self.opt.compute_gradients(self.loss)
                gradients, variables = zip(*grads)
                capped_grads, _ = tf.clip_by_global_norm(
                    gradients, config.grad_clip)
                self.train_op = self.opt.apply_gradients(
                    zip(capped_grads, variables), global_step=self.global_step)
                self.saver = tf.train.Saver(tf.global_variables(),
                                            max_to_keep=3)
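
The optimizer block above uses a logarithmic warmup for the learning rate: lr = min(init_lr, 0.001 / ln(999) * ln(step + 1)), so the rate climbs from zero and plateaus at init_lr. A quick plain-Python check of that schedule, assuming init_lr = 0.001:

import math

def warmup_lr(step, init_lr=0.001):
    return min(init_lr, 0.001 / math.log(999.0) * math.log(step + 1.0))

for step in (0, 10, 100, 998, 5000):
    print(step, round(warmup_lr(step), 6))
# the schedule reaches init_lr (0.001) at step 998 and stays flat afterwards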
示例#14
0
    def build_model(self):
        PL, QL, CL, d, dc, nh = self.c_maxlen, self.q_maxlen, self.char_limit, self.filters, self.char_dim, self.num_heads

        with tf.variable_scope("Input_Embedding_Layer"):
            ch_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.contc_input),
                [-1, CL, dc])
            qh_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.quesc_input),
                [-1, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)

            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            ch_emb = tf.reshape(ch_emb, [-1, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [-1, QL, ch_emb.shape[-1]])

            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.contw_input),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.quesw_input),
                1.0 - self.dropout)

            # if self.use_cove:
            #     c_emb_cove = self.cove_model(c_emb)
            #     q_emb_cove = self.cove_model(q_emb)
            #     c_emb = tf.concat([c_emb, c_emb_cove], axis=-1)
            #     q_emb = tf.concat([q_emb, q_emb_cove], axis=-1)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            if self.use_elmo:
                c_emb = tf.concat([c_emb, self.cont_elmo], axis=-1)
                q_emb = tf.concat([q_emb, self.ques_elmo], axis=-1)

            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.cont_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(q_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.q_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.ques_len,
                               scope="Encoder_Residual_Block",
                               reuse=True,
                               bias=False,
                               dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            c2q = tf.matmul(S_, q)
            q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, c2q, c * c2q, c * q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            attention_inputs = tf.concat(attention_outputs, axis=-1)
            enc = [conv(attention_inputs, d, name="input_projection")]
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    enc[i] = tf.nn.dropout(enc[i], 1.0 - self.dropout)
                enc.append(
                    residual_block(enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.cont_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope("Output_Layer"):
            start_logits = tf.concat([enc[1], enc[2]], axis=-1)
            end_logits = tf.concat([enc[1], enc[3]], axis=-1)
            if self.use_elmo:
                start_logits = tf.concat((start_logits, self.cont_elmo),
                                         axis=-1)
                end_logits = tf.concat((end_logits, self.cont_elmo), axis=-1)

            start_logits = tf.squeeze(
                conv(start_logits, 1, bias=False, name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(end_logits, 1, bias=False, name="end_pointer"), -1)
            unanswer_bias = tf.get_variable(
                "unanswer_bias", [1],
                regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
                initializer=tf.zeros_initializer())
            unanswer_bias = tf.reshape(
                tf.tile(unanswer_bias, [self.batch_size]), [-1, 1])
            self.logits1 = tf.concat(
                (unanswer_bias, mask_logits(start_logits, mask=self.c_mask)),
                axis=-1)
            self.logits2 = tf.concat(
                (unanswer_bias, mask_logits(end_logits, mask=self.c_mask)),
                axis=-1)
            start_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits1, labels=self.y_start)
            end_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits2, labels=self.y_end)
            self.loss = tf.reduce_mean(start_loss + end_loss)
            if self.l2_norm is not None:
                variables = tf.get_collection(
                    tf.GraphKeys.REGULARIZATION_LOSSES)
                l2_loss = tf.contrib.layers.apply_regularization(
                    regularizer, variables)
                self.loss += l2_loss

            # output
            outer = tf.matmul(
                tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
                tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, self.ans_limit)
            self.output1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) - 1
            self.output2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) - 1

            if self.decay is not None:
                self.var_ema = tf.train.ExponentialMovingAverage(self.decay)
                ema_op = self.var_ema.apply(tf.trainable_variables())
                with tf.control_dependencies([ema_op]):
                    self.loss = tf.identity(self.loss)
                    self.assign_vars = []
                    for var in tf.global_variables():
                        v = self.var_ema.average(var)
                        if v is not None:
                            self.assign_vars.append(tf.assign(var, v))
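
In the Output_Layer above, a learned unanswer_bias logit is prepended to both the start and end logits, so position 0 of each distribution stands for "no answer"; after the argmax over the banded outer product, the indices are shifted back by 1, and a prediction of -1 marks the question as unanswerable. A toy NumPy illustration of that index shift (values are made up; np.triu stands in for matrix_band_part with a generous answer limit):

import numpy as np

probs1 = np.array([0.70, 0.05, 0.15, 0.10])  # slot 0 = no-answer bias
probs2 = np.array([0.60, 0.05, 0.05, 0.30])
outer = np.triu(np.outer(probs1, probs2))    # keep end >= start
start = int(np.argmax(outer.max(axis=1))) - 1
end = int(np.argmax(outer.max(axis=0))) - 1
print(start, end)  # -1 -1: judged unanswerable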
    def pred(self):
        with tf.variable_scope("embedding_layer"):
            (self.questions, question_lengths), (
                self.contexts,
                context_lengths), self.answers = self.iterator.get_next()

            max_context_length = tf.reduce_max(context_lengths)
            max_question_length = tf.reduce_max(question_lengths)

            #max_context_length = self.train_max_context_length
            #max_question_length = self.train_max_question_length

            context_mask = tf.sequence_mask(context_lengths,
                                            maxlen=max_context_length)

            question_mask = tf.sequence_mask(question_lengths,
                                             maxlen=max_question_length)

            question_embeddings = tf.nn.embedding_lookup(
                self.embedding, self.questions)
            context_embeddings = tf.nn.embedding_lookup(
                self.embedding, self.contexts)
            print('question_embeddings',
                  question_embeddings.get_shape().as_list())
            print('context_embeddings',
                  context_embeddings.get_shape().as_list())

        with tf.variable_scope("embedding_layer"):
            c = residual_block(context_embeddings,
                               num_blocks=1,
                               num_conv_layers=1,
                               kernel_size=7,
                               mask=context_mask,
                               num_filters=self.lstm_hidden_size,
                               num_heads=1,
                               seq_len=max_context_length,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=1.0 - self.keep_prob)
            print('c', c.get_shape().as_list())
            q = residual_block(
                question_embeddings,
                num_blocks=1,
                num_conv_layers=1,
                kernel_size=7,
                mask=question_mask,
                num_filters=self.lstm_hidden_size,
                num_heads=1,
                seq_len=max_question_length,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=1.0 - self.keep_prob)

            print('q', q.get_shape().as_list())
            # context_output dimension is BS * max_context_length * d
            # where d = 2*lstm_hidden_size

        with tf.variable_scope("attention_layer"):

            S = optimized_trilinear_for_attention(
                [c, q],
                max_context_length,
                max_question_length,
                input_keep_prob=self.keep_prob)
            mask_q = tf.expand_dims(question_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(context_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("modeling_layer"):
            attention = tf.concat(attention_outputs, axis=-1)
            self.enc = [
                conv(attention, self.lstm_hidden_size, name="input_projection")
            ]
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i], self.keep_prob)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=1,
                                   num_conv_layers=1,
                                   kernel_size=5,
                                   mask=context_mask,
                                   num_filters=self.lstm_hidden_size,
                                   num_heads=1,
                                   seq_len=max_context_length,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=1.0 - self.keep_prob))
                print('self.enc[i]', self.enc[i].get_shape().as_list())

        with tf.variable_scope("output_layer_start"):
            pred_start = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            print('pred_start', pred_start.get_shape().as_list())
            self.pred_start = preprocess_softmax(pred_start, context_mask)
            print('self.pred_start', self.pred_start.get_shape().as_list())

        with tf.variable_scope("output_layer_end"):
            pred_end = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)
            print('pred_end', pred_end.get_shape().as_list())
            self.pred_end = preprocess_softmax(pred_end, context_mask)
            print('self.pred_end', self.pred_end.get_shape().as_list())

            self.preds = tf.transpose([
                tf.argmax(self.pred_start, axis=1),
                tf.argmax(self.pred_end, axis=1)
            ])
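
The embedding_layer above derives context_mask and question_mask from the per-example lengths with tf.sequence_mask, which marks the first `length` positions of each row as True. A minimal TF1 illustration with toy lengths:

import tensorflow as tf

lengths = tf.constant([2, 4, 1])
mask = tf.sequence_mask(lengths, maxlen=5)
with tf.Session() as sess:
    print(sess.run(mask))
# [[ True  True False False False]
#  [ True  True  True  True False]
#  [ True False False False False]]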
示例#16
0
            self.c_enc = BiRNNEncoder(hidden_size=d, num_layers=1,
                                      name='d_enc')(c_emb, self.c_len)

            print('self.q_enc shape: {}'.format(self.q_enc.shape))
            print('self.c_enc shape: {}'.format(self.c_enc.shape))


        """*************************************** Start ****************************************"""
        with tf.variable_scope('Output_Layer'):

            with tf.variable_scope('attention'):
                # [N, PL]
                res = tf.matmul(tf.expand_dims(self.q_enc, -1), self.c_enc,
                                adjoint_a=True, adjoint_b=True)

                attn = tf.reshape(res, [-1, self.c_maxlen])
                attn_dist = tf.nn.softmax(mask_logits(attn, self.c_mask))

            # Attention sum
            # y_hat = sum_probs_batch(self.cans, self.c, attn_dist)
            with tf.variable_scope('attention_sum'):
                # [N, 10, PL]
                y_hat = (tf.cast(self.cans, tf.float32) *
                         tf.tile(tf.expand_dims(attn_dist, axis=1),
                                 [1, config.num_cans, 1]))
                y_hat = tf.reduce_sum(y_hat, axis=-1)  # [N, 10]

            with tf.variable_scope('loss'):
                # negative log of the attention mass assigned to the answer positions
                self.loss = -tf.reduce_mean(
                    tf.log(tf.reduce_sum(tf.to_float(self.ans) * attn_dist, axis=-1)
                           + tf.constant(0.00001)))
            with tf.variable_scope('correct_prediction'):
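
The attention_sum block above scores each answer candidate by summing the attention mass over the passage positions where that candidate appears (self.cans is assumed to be a 0/1 indicator of shape [N, num_cans, PL]). A small NumPy sketch with toy shapes:

import numpy as np

attn_dist = np.array([[0.1, 0.4, 0.2, 0.3]])        # [N=1, PL=4]
cans = np.array([[[1, 0, 0, 1],                      # candidate 0 occurs at positions 0 and 3
                  [0, 1, 1, 0]]], dtype=np.float32)  # candidate 1 occurs at positions 1 and 2
y_hat = (cans * attn_dist[:, None, :]).sum(axis=-1)  # [N, num_cans]
print(y_hat)  # [[0.4 0.6]] -> candidate 1 wins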
示例#17
0
    def forward(self):
        config = self.config
        '''
        N: batch_size
        PL: maximum passage length
        QL: maximum question length
        CL: maximum number of characters per word
        d: number of output channels (hidden size)
        dc: character embedding dimension
        nh: number of self-attention heads
        '''
        N, PL, QL, CL, d, dc, nh = config.batch_size if not self.demo else 1, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.num_heads
        # Embedding layer: build the concatenation of word and character embeddings
        with tf.variable_scope("Input_Embedding_Layer"):
            # Character embedding:
            # 1. first map every character of each word to its char2vec embedding
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # 2. encode each word's character matrix into a single vector with a convolution
            # conv: ch_emb_shape = [N * PL, CL-5+1, d], qh_emb_shape = [N * QL, CL-5+1, d]
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)

            # max_time_pooling
            # ch_emb_shape = [N * PL, d], qh_emb_shape = [N * QL, d]
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)
            # ch_emb_shape = [N, PL, d], qh_emb_shape = [N, QL, d]
            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

            # Word embedding: looked up from GloVe
            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            # concatenate the word and character vectors
            # c_emb_size = [batch, n_c, c_emb+ch_emb]
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            # q_emb_size = [batch, n_q, c_emb + ch_emb]
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            # pass each through a (shared) highway network
            # c_emb_size = [batch, n_c, d]
            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            # q_emb_size = [batch, n_q, d]
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)

        # Stacked Embedding Encoder Block: a single encoder block with 4 convolution layers (kernel size 7), d=96 filters
        with tf.variable_scope("Embedding_Encoder_Layer"):
            # c_size = [batch, n_c, d]
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            # q_size = [batch, n_q, d]
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # share the Embedding Encoder Block weights between passage and question
                bias=False,
                dropout=self.dropout)

        # Context-to-Query attention:
        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
            # S_size = [batch, n_c, n_q], q_size = [batch, n_q, d], c_size = [batch, n_c, d]
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            # softmax along the n_q axis
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q), dim=-1)
            mask_c = tf.expand_dims(self.c_mask, 2)
            # softmax along the n_c axis
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            # c2q_size = [batch, n_c, d]
            self.c2q = tf.matmul(S_, q)
            # q2c_size = [batch, n_c, d]
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            # attention_outputs: 4 tensors, each [batch, n_c, d]
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        # Stacked Model Encoder Blocks: 7 encoder blocks per stack, each with 2 convolution layers, d=96 filters
        with tf.variable_scope("Model_Encoder_Layer"):
            # concatenate c, self.c2q, c * self.c2q, c * self.q2c along the channel dimension
            # input_shape = [batch, n_c, 4d]
            inputs = tf.concat(attention_outputs, axis=-1)
            # self.enc[i]_shape = [batch, n_c, d]
            self.enc = [conv(inputs, d, name="input_projection")]
            # 3 stacked Model Encoder Blocks
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(
                        self.enc[i],
                        num_blocks=7,
                        num_conv_layers=2,
                        kernel_size=5,
                        mask=self.c_mask,
                        num_filters=d,
                        num_heads=nh,
                        seq_len=self.c_len,
                        scope="Model_Encoder",
                        bias=False,
                        reuse=True if i > 0 else None,  # share the weights across the stacked Model Encoder Blocks
                        dropout=self.dropout))

        # Output layer:
        with tf.variable_scope("Output_Layer"):
            # concatenate the first and second Model Encoder outputs along the channel dimension
            # start_logits shape before the squeeze = [batch, n_c, 1]
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            # concatenate the first and third Model Encoder outputs along the channel dimension
            # end_logits shape before the squeeze = [batch, n_c, 1]
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)

            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = [l for l in self.logits]

            # outer_shape = [batch, n_c, n_c]
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            # keep entries where start <= end and end - start <= ans_limit; zero out the rest
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            # row index of the maximum: the start position
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            # column index of the maximum: the end position
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        # L2 regularization
        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            # ops passed to control_dependencies run before the ops created inside the with block
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    if v:
                        self.assign_vars.append(tf.assign(var, v))
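
The context-to-query attention used throughout these snippets reduces to a few matrix products once the similarity S is computed: a softmax over the question axis gives S_, a softmax over the context axis (transposed) gives S_T, and then c2q = S_ q and q2c = S_ S_T c. A compact NumPy walkthrough with toy sizes, where a plain dot product stands in for the trilinear similarity:

import numpy as np

def softmax(x, axis):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

n_c, n_q, d = 3, 2, 4
rng = np.random.default_rng(0)
c, q = rng.normal(size=(n_c, d)), rng.normal(size=(n_q, d))
S = c @ q.T                  # stand-in for the trilinear similarity, [n_c, n_q]
S_ = softmax(S, axis=1)      # normalize over question positions
S_T = softmax(S, axis=0).T   # normalize over context positions, then transpose
c2q = S_ @ q                 # [n_c, d]
q2c = S_ @ S_T @ c           # [n_c, d]
print(c2q.shape, q2c.shape)  # (3, 4) (3, 4)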
示例#18
0
文件: model.py 项目: txye/QANet
    def forward(self):
        config = self.config
        N, PL, QL, CL, d, dc, nh = config.batch_size if not self.demo else 1, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.num_heads

        with tf.variable_scope("Input_Embedding_Layer"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(
                self.char_mat, self.ch), [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(
                self.char_mat, self.qh), [N * QL, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder
            ch_emb = conv(ch_emb, d,
                bias = True, activation = tf.nn.relu, kernel_size = 5, name = "char_conv", reuse = None)
            qh_emb = conv(qh_emb, d,
                bias = True, activation = tf.nn.relu, kernel_size = 5, name = "char_conv", reuse = True)

            ch_emb = tf.reduce_max(ch_emb, axis = 1)
            qh_emb = tf.reduce_max(qh_emb, axis = 1)

            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

            c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout)
            q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            c_emb = highway(c_emb, size = d, scope = "highway", dropout = self.dropout, reuse = None)
            q_emb = highway(q_emb, size = d, scope = "highway", dropout = self.dropout, reuse = True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                num_blocks = 1,
                num_conv_layers = 4,
                kernel_size = 7,
                mask = self.c_mask,
                num_filters = d,
                num_heads = nh,
                seq_len = self.c_len,
                scope = "Encoder_Residual_Block",
                bias = False,
                dropout = self.dropout)
            q = residual_block(q_emb,
                num_blocks = 1,
                num_conv_layers = 4,
                kernel_size = 7,
                mask = self.q_mask,
                num_filters = d,
                num_heads = nh,
                seq_len = self.q_len,
                scope = "Encoder_Residual_Block",
                reuse = True, # Share the weights between passage and question
                bias = False,
                dropout = self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
            S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen, input_keep_prob = 1.0 - self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask = mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask = mask_c), dim = 1),(0,2,1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis = -1)
            self.enc = [conv(inputs, d, name = "input_projection")]
            for i in range(3):
                if i % 2 == 0: # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                        num_blocks = 7,
                        num_conv_layers = 2,
                        kernel_size = 5,
                        mask = self.c_mask,
                        num_filters = d,
                        num_heads = nh,
                        seq_len = self.c_len,
                        scope = "Model_Encoder",
                        bias = False,
                        reuse = True if i > 0 else None,
                        dropout = self.dropout)
                    )

        with tf.variable_scope("Output_Layer"):
            start_logits = tf.squeeze(conv(tf.concat([self.enc[1], self.enc[2]],axis = -1),1, bias = False, name = "start_pointer"),-1)
            end_logits = tf.squeeze(conv(tf.concat([self.enc[1], self.enc[3]],axis = -1),1, bias = False, name = "end_pointer"), -1)
            self.logits = [mask_logits(start_logits, mask = self.c_mask),
                           mask_logits(end_logits, mask = self.c_mask)]

            logits1, logits2 = [l for l in self.logits]

            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits1, labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits2, labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    if v is not None:
                        self.assign_vars.append(tf.assign(var,v))
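
The output layer of the example above turns the start/end distributions into an outer-product score matrix in which entry (i, j) is P(start=i)·P(end=j); tf.matrix_band_part(outer, 0, config.ans_limit) keeps only the entries with i <= j <= i + ans_limit, so the row/column argmax yields a legal span no longer than the answer limit. A small NumPy sketch of that decoding step (shapes and the ans_limit value are illustrative, not taken from the repo):

import numpy as np

def decode_span(start_probs, end_probs, ans_limit):
    """start_probs, end_probs: [batch, seq_len] softmax outputs."""
    outer = start_probs[:, :, None] * end_probs[:, None, :]   # [batch, L, L], entry (i, j) = P(start=i) * P(end=j)
    i, j = np.meshgrid(np.arange(outer.shape[1]),
                       np.arange(outer.shape[2]), indexing="ij")
    outer *= (j >= i) & (j - i <= ans_limit)                   # keep only the upper band, like matrix_band_part
    yp1 = outer.max(axis=2).argmax(axis=1)                     # best start index
    yp2 = outer.max(axis=1).argmax(axis=1)                     # best end index
    return yp1, yp2

start = np.array([[0.1, 0.7, 0.2]])
end = np.array([[0.2, 0.2, 0.6]])
print(decode_span(start, end, ans_limit=2))   # -> (array([1]), array([2]))
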
Example #19
0
    def forward(self):
        config = self.config
        N, PL, QL, CL, d, dc, nh, AL1, AL2, AL3 = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, \
            config.hidden, config.char_dim, config.num_heads, self.aletr1_maxlen, self.aletr2_maxlen, self.aletr3_maxlen

        with tf.variable_scope("Input_Embedding_Layer"):
            ch_emb = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.ch),
                [N * PL, CL, dc])  # [number of words per passage, chars per word, char embedding dim]
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            self.alternati_emb1 = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.alter1h),
                [N * AL1, CL, dc])  # (875, 25, 20)
            self.alternati_emb2 = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.alter2h),
                [N * AL2, CL, dc])  # (768, 16, 300)
            self.alternati_emb3 = tf.reshape(
                tf.nn.embedding_lookup(self.char_mat, self.alter3h),
                [N * AL3, CL, dc])  # (768, 16, 300)

            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)
            alternati_emb1 = tf.nn.dropout(self.alternati_emb1,
                                           1.0 - 0.5 * self.dropout)
            alternati_emb2 = tf.nn.dropout(self.alternati_emb2,
                                           1.0 - 0.5 * self.dropout)
            alternati_emb3 = tf.nn.dropout(self.alternati_emb3,
                                           1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder; the convolutions below extract the char-level features
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)  #[batch,feature_len,d]
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)
            alternati_emb1 = conv(alternati_emb1,
                                  d,
                                  bias=True,
                                  activation=tf.nn.relu,
                                  kernel_size=5,
                                  name="char_conv",
                                  reuse=True)
            alternati_emb2 = conv(alternati_emb2,
                                  d,
                                  bias=True,
                                  activation=tf.nn.relu,
                                  kernel_size=5,
                                  name="char_conv",
                                  reuse=True)
            alternati_emb3 = conv(alternati_emb3,
                                  d,
                                  bias=True,
                                  activation=tf.nn.relu,
                                  kernel_size=5,
                                  name="char_conv",
                                  reuse=True)

            ch_emb = tf.reduce_max(
                ch_emb, axis=1)  # max over the char dimension; k-max pooling could be tried here instead of plain max
            qh_emb = tf.reduce_max(qh_emb, axis=1)
            alternati_emb1 = tf.reduce_max(alternati_emb1, axis=1)
            alternati_emb2 = tf.reduce_max(alternati_emb2, axis=1)
            alternati_emb3 = tf.reduce_max(alternati_emb3, axis=1)

            ch_emb = tf.reshape(ch_emb,
                                [N, PL, ch_emb.shape[-1]])  # reshape back to the per-passage layout [N, PL, dim]
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])
            alternati_emb1 = tf.reshape(alternati_emb1,
                                        [N, AL1, qh_emb.shape[-1]])
            alternati_emb2 = tf.reshape(alternati_emb2,
                                        [N, AL2, qh_emb.shape[-1]])
            alternati_emb3 = tf.reshape(alternati_emb3,
                                        [N, AL3, qh_emb.shape[-1]])

            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)
            alter_embedding1 = tf.nn.embedding_lookup(self.word_mat,
                                                      self.alter1)  # alternative answer 1
            alter_embedding2 = tf.nn.embedding_lookup(self.word_mat,
                                                      self.alter2)  # alternative answer 2
            alter_embedding3 = tf.nn.embedding_lookup(self.word_mat,
                                                      self.alter3)  # alternative answer 3

            c_emb = tf.concat(
                [c_emb, ch_emb],
                axis=2)  # concatenate word and char features: [batch, sequence_len, combined dim]
            q_emb = tf.concat([q_emb, qh_emb], axis=2)
            alter_embedding1 = tf.concat([alter_embedding1, alternati_emb1],
                                         axis=2)
            alter_embedding2 = tf.concat([alter_embedding2, alternati_emb2],
                                         axis=2)
            alter_embedding3 = tf.concat([alter_embedding3, alternati_emb3],
                                         axis=2)

            c_emb = highway(
                c_emb,
                size=d,
                scope="highway",
                dropout=self.dropout,
                reuse=None)  # highway layer filters the features and projects them down to d: [batch, seq_len, d]
            self.alter_embedding1 = c_emb

            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)
            alter_embedding1 = highway(alter_embedding1,
                                       size=d,
                                       scope="highway",
                                       dropout=self.dropout,
                                       reuse=True)
            alter_embedding2 = highway(alter_embedding2,
                                       size=d,
                                       scope="highway",
                                       dropout=self.dropout,
                                       reuse=True)
            alter_embedding3 = highway(alter_embedding3,
                                       size=d,
                                       scope="highway",
                                       dropout=self.dropout,
                                       reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)
            alter1 = residual_block(
                alter_embedding1,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.alter1_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.alterh1_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)
            alter2 = residual_block(
                alter_embedding2,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.alter2_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.alter2_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)
            alter3 = residual_block(
                alter_embedding3,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.alter3_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.alter3_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)
        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]
        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]

            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))
        with tf.variable_scope('question_rnn'):
            self.gru = tf.contrib.rnn.GRUCell(d)
            initstate = self.gru.zero_state(batch_size=N, dtype=tf.float32)
            output, state = tf.nn.dynamic_rnn(self.gru,
                                              q,
                                              initial_state=initstate)
            # self.qandc=tf.concat([self.q2c,self.c2q],axis=2)
            # self.qandc=dense(self.qandc,d)
            # output,state=tf.nn.dynamic_rnn(self.gru,self.qandc,initial_state=initstate)#(32,?,75)
            output1, state1 = tf.nn.dynamic_rnn(self.gru,
                                                alter1,
                                                initial_state=state)
            output2, state2 = tf.nn.dynamic_rnn(self.gru,
                                                alter2,
                                                initial_state=state)
            output3, state3 = tf.nn.dynamic_rnn(self.gru,
                                                alter3,
                                                initial_state=state)

            state = tf.expand_dims(state, axis=2)
            weight1 = tf.matmul(self.enc[1], state)
            weight2 = tf.matmul(self.enc[2], state)
            weight3 = tf.matmul(self.enc[3], state)

            weight_enc1 = tf.multiply(self.enc[1], weight1)
            weight_enc1 = tf.reduce_sum(weight_enc1, axis=1)

            weight_enc2 = tf.multiply(self.enc[2], weight2)
            weight_enc2 = tf.reduce_sum(weight_enc2, axis=1)

            weight_enc3 = tf.multiply(self.enc[3], weight3)
            weight_enc3 = tf.reduce_sum(weight_enc3, axis=1)

        with tf.variable_scope("Output_Layer"):
            # start_logits = tf.squeeze(
            #     conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1, bias=False, name="start_pointer"), -1)
            # end_logits = tf.squeeze(
            #     conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1, bias=False, name="end_pointer"), -1)
            # self.logits = [mask_logits(start_logits, mask=self.c_mask),
            #                mask_logits(end_logits, mask=self.c_mask)]
            #
            # logits1, logits2 = [l for l in self.logits]
            #
            # outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
            #                   tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            # outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            # self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            # self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

            similary1 = tf.expand_dims(self.cos_sine(weight_enc1, state1),
                                       axis=1)
            similary2 = tf.expand_dims(self.cos_sine(weight_enc2, state2),
                                       axis=1)
            similary3 = tf.expand_dims(self.cos_sine(weight_enc3, state3),
                                       axis=1)
            self.logits1 = tf.nn.softmax(
                tf.concat([similary1, similary2, similary3], axis=1))
            print("alternative-answer probabilities:", self.logits1)  # debug output
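
Rather than predicting a span, this example scores three alternative answers: the question GRU's final state attends over each model-encoder output to pool a passage vector (weight_enc1..3), each alternative is encoded with the same GRU, and the cosine similarity between the pooled passage vector and each alternative's final state is softmaxed over the three candidates. A minimal NumPy sketch of that scoring step (the vectors below are random stand-ins for the tensors above, and cos_sine is assumed to be a plain cosine similarity):

import numpy as np

def cos_sine(a, b):
    # cosine similarity between row vectors, mirroring the model's cos_sine helper (assumed behaviour)
    return np.sum(a * b, axis=-1) / (np.linalg.norm(a, axis=-1) * np.linalg.norm(b, axis=-1) + 1e-8)

rng = np.random.default_rng(0)
pooled_passage = rng.normal(size=(2, 75))                   # weight_enc_k: attention-pooled passage, per batch item
alt_states = [rng.normal(size=(2, 75)) for _ in range(3)]   # GRU final states of the 3 alternatives

scores = np.stack([cos_sine(pooled_passage, s) for s in alt_states], axis=1)  # [batch, 3]
probs = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)            # softmax over candidates
pred = probs.argmax(axis=1)                                                    # index of the chosen alternative
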
Example #20
0
    def forward(self):
        config = self.config
        N, PL, QL, CL, d, dc, nh = config.batch_size if not self.demo else 1, self.c_maxlen, \
            self.q_maxlen, config.char_limit, config.hidden, config.tw_char_dim, config.num_heads

        with tf.variable_scope("Input_Embedding_Layer"):
            if config.type == "all":
                ch_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.ch),
                    [N * PL, CL, dc])
                qh_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.qh),
                    [N * QL, CL, dc])
                ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
                qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

                # Bidaf style conv-highway encoder
                ch_emb = conv(ch_emb,
                              d,
                              bias=True,
                              activation=tf.nn.relu,
                              kernel_size=5,
                              name="char_conv",
                              reuse=None)
                qh_emb = conv(qh_emb,
                              d,
                              bias=True,
                              activation=tf.nn.relu,
                              kernel_size=5,
                              name="char_conv",
                              reuse=True)

                ch_emb = tf.reduce_max(ch_emb, axis=1)
                qh_emb = tf.reduce_max(qh_emb, axis=1)

                ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
                qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

                c_emb = tf.nn.dropout(
                    tf.nn.embedding_lookup(self.word_mat, self.c),
                    1.0 - self.dropout)
                q_emb = tf.nn.dropout(
                    tf.nn.embedding_lookup(self.word_mat, self.q),
                    1.0 - self.dropout)

                c_emb = tf.concat([c_emb, ch_emb], axis=2)
                q_emb = tf.concat([q_emb, qh_emb], axis=2)

                c_emb = highway(c_emb,
                                size=d,
                                scope="highway",
                                dropout=self.dropout,
                                reuse=None)
                q_emb = highway(q_emb,
                                size=d,
                                scope="highway",
                                dropout=self.dropout,
                                reuse=True)

            elif config.type == 'char':
                c_emb = tf.nn.dropout(
                    tf.nn.embedding_lookup(self.char_mat, self.c),
                    1.0 - self.dropout)
                q_emb = tf.nn.dropout(
                    tf.nn.embedding_lookup(self.char_mat, self.q),
                    1.0 - self.dropout)

                c_emb = highway(c_emb,
                                size=d,
                                scope="highway",
                                dropout=self.dropout,
                                reuse=None)
                q_emb = highway(q_emb,
                                size=d,
                                scope="highway",
                                dropout=self.dropout,
                                reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
            S = optimized_trilinear_for_attention([c, q],
                                                  self.c_maxlen,
                                                  self.q_maxlen,
                                                  input_keep_prob=1.0 -
                                                  self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope("Output_Layer"):
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)
            # presumably: mask out the padding appended at the end of the passage
            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = [l for l in self.logits]

            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits1,
                                                                labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits2, labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))
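
All of these output layers call mask_logits on the pointer logits with self.c_mask before the softmax, so padded context positions get effectively zero probability (this is what the "mask the padding" comment in the example above points at). A short NumPy sketch of the assumed behaviour of mask_logits:

import numpy as np

def mask_logits(logits, mask, mask_value=-1e30):
    # assumed behaviour: push masked-out (padded) positions to a huge negative value
    mask = mask.astype(logits.dtype)
    return logits + mask_value * (1.0 - mask)

logits = np.array([2.0, 1.0, 3.0, 0.5])
mask = np.array([1, 1, 1, 0])                   # last position is padding
masked = mask_logits(logits, mask)
probs = np.exp(masked) / np.exp(masked).sum()   # the padded position gets ~0 probability
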
Example #21
0
    def build_model(self):
        PL, QL, CL, d, dc, nh = self.c_maxlen, self.q_maxlen, self.char_limit, self.filters, self.char_dim, self.num_heads

        with tf.variable_scope("Input_Embedding_Layer"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.contc_input), [-1, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.quesc_input), [-1, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder
            ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5, name="char_conv", reuse=None)
            qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5, name="char_conv", reuse=True)

            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            ch_emb = tf.reshape(ch_emb, [-1, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [-1, QL, ch_emb.shape[-1]])

            c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.contw_input), 1.0 - self.dropout)
            q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.quesw_input), 1.0 - self.dropout)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            if self.use_elmo:
                c_emb = tf.concat([c_emb, self.cont_elmo], axis=-1)
                q_emb = tf.concat([q_emb, self.ques_elmo], axis=-1)

            c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout, reuse=None)
            q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout, reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.cont_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(q_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.q_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.ques_len,
                               scope="Encoder_Residual_Block",
                               reuse=True,
                               bias=False,
                               dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen,
                                                  input_keep_prob=1.0 - self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            c2q = tf.matmul(S_, q)
            q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, c2q, c * c2q, c * q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            attention_inputs = tf.concat(attention_outputs, axis=-1)
            enc = [conv(attention_inputs, d, name="input_projection")]
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    enc[i] = tf.nn.dropout(enc[i], 1.0 - self.dropout)
                enc.append(residual_block(enc[i],
                                          num_blocks=7,
                                          num_conv_layers=2,
                                          kernel_size=5,
                                          mask=self.c_mask,
                                          num_filters=d,
                                          num_heads=nh,
                                          seq_len=self.cont_len,
                                          scope="Model_Encoder",
                                          bias=False,
                                          reuse=True if i > 0 else None,
                                          dropout=self.dropout))

        with tf.variable_scope("Output_Layer"):
            start_logits = tf.concat([enc[1], enc[2]], axis=-1)
            end_logits = tf.concat([enc[1], enc[3]], axis=-1)
            if self.use_elmo:
                start_logits = tf.concat((start_logits, self.cont_elmo), axis=-1)
                end_logits = tf.concat((end_logits, self.cont_elmo), axis=-1)
            start_logits = tf.squeeze(conv(start_logits, 1, bias=False, name="start_pointer"), -1)
            end_logits = tf.squeeze(conv(end_logits, 1, bias=False, name="end_pointer"), -1)
            # For a 2.0-style dataset with unanswerable questions (disabled):
            # unanswer_bias = tf.get_variable("unanswer_bias", [1],
            #                                 regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7),
            #                                 initializer=tf.zeros_initializer())
            # unanswer_bias = tf.reshape(tf.tile(unanswer_bias, [self.batch_size]), [-1, 1])
            # self.logits1 = tf.concat((unanswer_bias, mask_logits(start_logits, mask=self.c_mask)), axis=-1)
            # self.logits2 = tf.concat((unanswer_bias, mask_logits(end_logits, mask=self.c_mask)), axis=-1)
            self.logits1 = mask_logits(start_logits, mask=self.c_mask)
            self.logits2 = mask_logits(end_logits, mask=self.c_mask)
            start_loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits1, labels=self.y_start)
            end_loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits2, labels=self.y_end)
            self.loss = tf.reduce_mean(start_loss + end_loss)

            # output
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, self.ans_limit)
            self.output1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.output2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

        if self.use_topk:
            with tf.variable_scope("Topk_Layer"):
                top_size = 3
                outer = tf.reshape(outer, [self.batch_size, -1])
                outer_inds = tf.nn.top_k(outer, top_size).indices  # [N,top_size]
                self.yp1 = outer_inds // tf.shape(self.logits1)[-1]
                self.yp2 = outer_inds % tf.shape(self.logits2)[-1]

                def sen_mask(tensor):
                    def sen_mask_(a, b, filters):
                        try:
                            mata = tf.zeros([a, filters], tf.int32)
                        except:
                            mata = []
                        matb = tf.ones([b - a, filters], tf.int32)
                        matc = tf.zeros([tf.shape(self.logits1)[-1] - b, filters], tf.int32)
                        mat = tf.concat((mata, matb, matc), axis=0)
                        return mat

                    return tf.map_fn(lambda x: sen_mask_(x[0], x[1], self.filters), tensor)

                self.yp3 = self.yp2 + 1
                self.yp1 = tf.expand_dims(self.yp1, -1)
                self.yp2 = tf.expand_dims(self.yp2, -1)
                self.yp3 = tf.expand_dims(self.yp3, -1)
                self.y_mask = tf.concat([self.yp1, self.yp3], axis=-1)
                self.y_mask = tf.map_fn(lambda x: sen_mask(x), self.y_mask)

                # answer
                c = tf.tile(tf.expand_dims(c2q, 1), [1, top_size, 1, 1])
                c_topk = tf.multiply(tf.cast(self.y_mask, tf.float32), c)
                W1 = tf.get_variable("W1", initializer=tf.ones([1, 1, 1, self.filters]))
                W1 = tf.tile(W1, [self.batch_size, top_size, 1, 1])
                alpha1 = tf.nn.softmax(tf.matmul(W1, c_topk, transpose_b=True), axis=2)
                answer = tf.matmul(alpha1, c_topk)  # [32,top_size,1,128]

                # question
                W2 = tf.get_variable("W2", initializer=tf.ones([1, 1, self.filters]))
                W2 = tf.tile(W2, [self.batch_size, 1, 1])
                alpha2 = tf.nn.softmax(tf.matmul(W2, q, transpose_b=True), axis=1)
                ques = tf.matmul(alpha2, q)
                ques = tf.tile(tf.expand_dims(ques, 1), [1, top_size, 1, 1])  # [32,top_size,1,128]

                # question & answer
                W3 = tf.get_variable("W3", initializer=tf.ones([1, 1, self.filters, self.filters]))
                W3 = tf.tile(W3, [self.batch_size, top_size, 1, 1])
                y_topk_logits = tf.nn.sigmoid(tf.matmul(ques, tf.matmul(W3, answer, transpose_b=True))) # [32,top_size,1,1]
                y_topk_logits = tf.squeeze(y_topk_logits)  # [32,top_size]

                self.yp1 = tf.squeeze(self.yp1)
                self.yp2 = tf.squeeze(self.yp2)
                coeff1_topk = tf.one_hot(self.yp1, self.c_maxlen, axis=-1) # [32,top_size,400] one-hot
                coeff2_topk = tf.one_hot(self.yp2, self.c_maxlen, axis=-1)
                # e.g. start one-hot [0,1,0,0,0], end one-hot [0,0,0,1,0]: cumsum(start) - cumsum(end) + end = [0,1,1,1,0], the span mask
                coeff1_topk_cumsum = tf.cumsum(coeff1_topk, axis=-1)
                coeff2_topk_cumsum = tf.cumsum(coeff2_topk, axis=-1)
                self.y_d = coeff1_topk_cumsum - coeff2_topk_cumsum + coeff2_topk # [32, top_size, 400]

                def clip_for_sigmoid(output):
                    _epsilon = tf.convert_to_tensor(1e-7, dtype=output.dtype.base_dtype)
                    output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
                    output = tf.log(output / (1 - output))
                    return output

                if self.topk_loss == 'f1':
                    # f1 loss
                    y_start_ind = tf.cumsum(self.y_start, axis=-1)
                    y_end_ind = tf.cumsum(self.y_end, axis=-1)
                    y_gtd = y_start_ind - y_end_ind + self.y_end # [32, 400]
                    def cal_num_same(y_pred, y_truth): # [top_size, 400] [400,]
                        def cal_num_same_(y_pred_, y_truth): # [400,] [400,]
                            return tf.reduce_sum(tf.cast(tf.logical_and(tf.cast(y_pred_, tf.bool), tf.cast(y_truth, tf.bool)), tf.float32),axis=-1)
                        return [tf.map_fn(lambda x:cal_num_same_(x,y_truth),y_pred),tf.map_fn(lambda x:cal_num_same_(x,y_truth),y_pred)]
                    num_same = tf.map_fn(lambda x:cal_num_same(x[0], x[1]), [self.y_d, y_gtd])[0] # [32, top_size]
                    y_precision = num_same / (tf.cast(tf.reduce_sum(self.y_d, axis=-1),tf.float32) + 1e-8) # [32, top_size]
                    y_recall = num_same / tf.expand_dims(tf.cast(tf.reduce_sum(y_gtd, axis=-1),tf.float32) + 1e-8, axis=-1) # [32, top_size]
                    y_f1 = (2.0 * y_precision * y_recall) / (tf.cast(y_precision + y_recall,tf.float32) + 1e-8) # [32, top_size]
                    topk_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=clip_for_sigmoid(y_topk_logits), labels=y_f1))

                elif self.topk_loss == 'em':
                    # em loss
                    start_em = tf.equal(tf.cast(tf.expand_dims(tf.argmax(self.y_start, axis=-1), axis=1), tf.int32),
                                        tf.cast(self.yp1, tf.int32))  # [32, top_size]
                    end_em = tf.equal(tf.cast(tf.expand_dims(tf.argmax(self.y_end, axis=-1), axis=1), tf.int32),
                                      tf.cast(self.yp2, tf.int32))  # [32, top_size]
                    y_em = tf.cast(tf.logical_and(start_em, end_em), tf.float32) # [32, top_size]
                    topk_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=clip_for_sigmoid(y_topk_logits), labels=y_em))

                # final loss
                self.Lambda1 = tf.get_variable("Lambda1", initializer=tf.constant([0.9]), trainable=False)
                self.loss = tf.reduce_mean(self.Lambda1 * (start_loss + end_loss) + (1 - self.Lambda1) * topk_loss)

                # output
                outer_topk = tf.matmul(tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
                                  tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))
                outer_topk = tf.matrix_band_part(outer_topk, 0, self.ans_limit)
                self.output1 = tf.argmax(tf.reduce_max(outer_topk, axis=2), axis=1)
                self.output2 = tf.argmax(tf.reduce_max(outer_topk, axis=1), axis=1)

                # diversity loss
                if self.diversity_loss:
                    self.Lambda2 = tf.get_variable("Lambda2", initializer=tf.constant([0.1]),trainable=False)
                    diversity_loss = tf.reduce_mean(tf.reduce_prod(self.y_d, axis=1),axis=-1) # [32,top_size,400]->[32,400]->[32,]
                    self.loss = self.loss + tf.reduce_mean(self.Lambda2 * diversity_loss)


        if self.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
            self.loss += l2_loss

        if self.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(self.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)
                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))
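
The Topk_Layer in the last example flattens the [L, L] outer score matrix, takes the top_size largest entries with tf.nn.top_k, and recovers each candidate span's start and end by integer division and modulo with the sequence length. A NumPy sketch of that index recovery (batch size and length are illustrative):

import numpy as np

def topk_spans(outer, k):
    """outer: [batch, L, L] span scores; returns start/end indices of the k best spans."""
    batch, L, _ = outer.shape
    flat = outer.reshape(batch, -1)                # [batch, L*L]
    top_idx = np.argsort(-flat, axis=1)[:, :k]     # indices of the k largest scores
    yp1 = top_idx // L                             # row of the flattened index -> start position
    yp2 = top_idx % L                              # column of the flattened index -> end position
    return yp1, yp2

outer = np.zeros((1, 4, 4))
outer[0, 1, 2] = 0.5   # best span (1, 2)
outer[0, 0, 3] = 0.3   # second-best span (0, 3)
print(topk_spans(outer, k=2))   # -> (array([[1, 0]]), array([[2, 3]]))
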