Пример #1
0
    def _fuse(self):
        """Fuse context and question encodings, then run the model encoder.

        Reads ``self.c_embed_encoding`` and ``self.q_embed_encoding`` and
        stores the attention results in ``self.c2q``, ``self.q2c`` and
        ``self.attention_outputs``. The concatenated attention outputs are
        projected with ``conv`` and passed three times through
        ``residual_block``; every stage is kept in ``self.enc`` and each entry
        is finally reshaped to ``[N, -1, last_dim]``.
        """
        context = self.c_embed_encoding
        question = self.q_embed_encoding

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # Tile both encodings so every (context, question) position pair
            # can be scored by `trilinear`.
            ctx_expanded = tf.expand_dims(context, 2)
            ctx_tiled = tf.tile(ctx_expanded, [1, 1, self.max_q_len, 1])
            qry_expanded = tf.expand_dims(question, 1)
            qry_tiled = tf.tile(qry_expanded, [1, self.max_p_len, 1, 1])
            interaction = ctx_tiled * qry_tiled
            keep_prob = 1.0 - self.dropout
            similarity = trilinear([ctx_tiled, qry_tiled, interaction],
                                   input_keep_prob=keep_prob)

            # Softmax over the last axis; `mask_logits` receives the
            # question mask.
            question_mask = tf.expand_dims(self.q_mask, 1)
            row_attention = tf.nn.softmax(
                mask_logits(similarity, mask=question_mask))

            # Softmax over axis 1; `mask_logits` receives the context mask.
            # The result has its last two axes swapped.
            context_mask = tf.expand_dims(self.c_mask, 2)
            col_logits = mask_logits(similarity, mask=context_mask)
            col_attention = tf.transpose(tf.nn.softmax(col_logits, dim=1),
                                         (0, 2, 1))

            self.c2q = tf.matmul(row_attention, question)
            self.q2c = tf.matmul(tf.matmul(row_attention, col_attention),
                                 context)
            self.attention_outputs = [
                context,
                self.c2q,
                context * self.c2q,
                context * self.q2c,
            ]

        # According to the original inline notes, `_params()` yields in order:
        # batch size (1 in demo mode), max_p_len, max_q_len,
        # config.max_ch_len, config.hidden_size, config.char_embed_size,
        # config.head_size.
        N, _, _, _, hidden, dc, heads = self._params()
        if self.config.fix_pretrained_vector:
            # Not used further in the visible part of this method.
            dc = self.char_mat.get_shape()[-1]

        with tf.variable_scope("Model_Encoder_Layer"):
            fused = tf.concat(self.attention_outputs, axis=-1)
            self.enc = [conv(fused, hidden, name="input_projection")]
            stages = self.enc  # same list object; mutated in place below
            for step in range(3):
                if step % 2 == 0:
                    # Dropout on the input of stages 0 and 2.
                    keep_prob = 1.0 - self.dropout
                    stages[step] = tf.nn.dropout(stages[step], keep_prob)
                # The first pass passes reuse=None, later passes reuse=True.
                share_weights = True if step > 0 else None
                block_out = residual_block(stages[step],
                                           num_blocks=1,
                                           num_conv_layers=2,
                                           kernel_size=5,
                                           mask=self.c_mask,
                                           num_filters=hidden,
                                           num_heads=heads,
                                           seq_len=self.c_len,
                                           scope="Model_Encoder",
                                           bias=True,
                                           reuse=share_weights,
                                           dropout=self.dropout)
                stages.append(block_out)

            # Reshape every stage to [N, -1, last_dim], keeping the same list.
            self.enc[:] = [
                tf.reshape(stage, [N, -1, stage.get_shape()[-1]])
                for stage in self.enc
            ]
Пример #2
0
    def _fuse(self):
        """Compute context-query attention and stack the model encoder.

        Uses ``self.c_embed_encoding`` / ``self.q_embed_encoding`` to fill
        ``self.c2q``, ``self.q2c`` and ``self.attention_outputs``, then
        projects the concatenated attention outputs with ``conv`` and feeds
        them three times through ``residual_block``. The projection and the
        three block outputs are collected in ``self.enc``.
        """
        passage = self.c_embed_encoding
        query = self.q_embed_encoding

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            passage_grid = tf.tile(tf.expand_dims(passage, 2),
                                   [1, 1, self.max_q_len, 1])
            query_grid = tf.tile(tf.expand_dims(query, 1),
                                 [1, self.max_p_len, 1, 1])
            elementwise = passage_grid * query_grid
            keep = 1.0 - self.dropout
            scores = trilinear([passage_grid, query_grid, elementwise],
                               input_keep_prob=keep)

            # Attention normalised over the last axis (question mask given to
            # `mask_logits`).
            q_mask_3d = tf.expand_dims(self.q_mask, 1)
            attn_over_query = tf.nn.softmax(
                mask_logits(scores, mask=q_mask_3d))

            # Attention normalised over axis 1 (context mask given to
            # `mask_logits`), with the last two axes swapped afterwards.
            c_mask_3d = tf.expand_dims(self.c_mask, 2)
            attn_over_passage = tf.nn.softmax(
                mask_logits(scores, mask=c_mask_3d), dim=1)
            attn_over_passage_t = tf.transpose(attn_over_passage, (0, 2, 1))

            self.c2q = tf.matmul(attn_over_query, query)
            passage_to_passage = tf.matmul(attn_over_query,
                                           attn_over_passage_t)
            self.q2c = tf.matmul(passage_to_passage, passage)
            self.attention_outputs = [
                passage,
                self.c2q,
                passage * self.c2q,
                passage * self.q2c,
            ]

        # Only the 4th and 6th values returned by `_params()` are used here.
        _, _, _, width, _, n_heads = self._params()
        with tf.variable_scope("Model_Encoder_Layer"):
            merged = tf.concat(self.attention_outputs, axis=-1)
            self.enc = [conv(merged, width, name="input_projection")]
            outputs = self.enc  # alias of the same list, extended in place
            for idx in range(3):
                if not idx % 2:
                    # Dropout on the input of passes 0 and 2.
                    keep = 1.0 - self.dropout
                    outputs[idx] = tf.nn.dropout(outputs[idx], keep)
                # reuse is None on the first pass and True afterwards.
                reuse_flag = True if idx > 0 else None
                encoded = residual_block(outputs[idx],
                                         num_blocks=7,
                                         num_conv_layers=2,
                                         kernel_size=3,
                                         mask=self.c_mask,
                                         num_filters=width,
                                         num_heads=n_heads,
                                         seq_len=self.c_len,
                                         scope="Model_Encoder",
                                         bias=False,
                                         reuse=reuse_flag,
                                         dropout=self.dropout)
                outputs.append(encoded)
Пример #3
0
    def forward(self):
        """Build the forward graph, the loss and the optional EMA ops.

        The graph is assembled in five variable scopes: input embedding,
        embedding encoder, context-to-query attention, model encoder and
        output layer.

        Attributes written: ``self.c2q``, ``self.q2c``, ``self.enc``,
        ``self.logits``, ``self.yp1``, ``self.yp2`` and ``self.loss``. When
        ``config.decay`` is not None it additionally sets ``self.var_ema``,
        ``self.shadow_vars``, ``self.global_vars`` and ``self.assign_vars``.
        """
        config = self.config
        N = config.batch_size if not self.demo else 1
        PL = self.c_maxlen
        QL = self.q_maxlen
        CL = config.char_limit
        d = config.hidden
        dc = config.char_dim
        nh = config.num_heads

        with tf.variable_scope("Input_Embedding_Layer"):
            # Character embeddings, flattened so each word is one row.
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder. The second call passes
            # reuse=True with the same name "char_conv".
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)

            # Max over axis 1 of the convolved character features.
            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            # Use the question tensor's own channel size (the original read
            # it from ch_emb, which only worked because both come from the
            # same conv).
            qh_emb = tf.reshape(qh_emb, [N, QL, qh_emb.shape[-1]])

            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            # Word and character features are concatenated on the last axis.
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # Tile both encodings so `trilinear` scores every
            # (context position, question position) pair.
            C = tf.tile(tf.expand_dims(c, 2), [1, 1, self.q_maxlen, 1])
            Q = tf.tile(tf.expand_dims(q, 1), [1, self.c_maxlen, 1, 1])
            S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            # NOTE(review): `dim` is a deprecated alias of `axis` in
            # tf.nn.softmax; kept for compatibility with older TF 1.x.
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q]
            if config.q2c:
                attention_outputs.append(c * self.q2c)

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                # reuse is None on the first pass and True afterwards.
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope("Output_Layer"):
            # Start logits use encoder outputs 1 and 2, end logits 1 and 3.
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)
            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = self.logits

            # Outer product of start/end probabilities. The band part keeps
            # only pairs with 0 <= end - start <= 15.
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, 15)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        if config.l2_norm is not None:
            # `regularizer` is not defined in this method; it must come from
            # the enclosing module.
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                # Evaluating the loss now also triggers the EMA update.
                self.loss = tf.identity(self.loss)

                self.shadow_vars = []
                self.global_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # `average` returns None for variables without a shadow
                    # copy; compare against None explicitly instead of
                    # relying on the truthiness of a variable object.
                    if v is not None:
                        self.shadow_vars.append(v)
                        self.global_vars.append(var)
                # Ops that copy each averaged value back into its variable.
                self.assign_vars = []
                for g, v in zip(self.global_vars, self.shadow_vars):
                    self.assign_vars.append(tf.assign(g, v))
Пример #4
0
    def forward(self):
        """Create the placeholders, model graph, loss, optimizer and EMA ops.

        Attributes written: the input placeholders (``c_words``, ``c_chars``,
        ``q_words``, ``q_chars``, ``start``, ``end``), the derived
        ``c_mask`` / ``q_mask`` / ``c_len`` / ``q_len``, the outputs
        ``start_linear``, ``pred_start``, ``end_linear``, ``pred_end``,
        plus ``loss`` and ``optimize``. When ``config.ema_decay > 0`` it
        also sets ``assign_vars``.
        """
        self.c_words = tf.placeholder(tf.int32,
                                      [None, self.config.context_len],
                                      'context-words')
        self.c_chars = tf.placeholder(
            tf.int32,
            [None, self.config.context_len, self.config.max_char_len],
            'context-chars')
        # sign() of the word ids: 1 for positive ids, 0 for id 0
        # (presumably the padding id -- confirm against the data pipeline).
        self.c_mask = tf.sign(self.c_words)

        self.q_words = tf.placeholder(tf.int32,
                                      [None, self.config.question_len],
                                      'query-words')
        self.q_chars = tf.placeholder(
            tf.int32,
            [None, self.config.question_len, self.config.max_char_len],
            'query-chars')
        self.q_mask = tf.sign(self.q_words)

        # Per-example lengths are the sums of the masks over the last axis.
        self.c_len = tf.cast(tf.reduce_sum(self.c_mask, -1), tf.int32)
        self.q_len = tf.cast(tf.reduce_sum(self.q_mask, -1), tf.int32)

        self.start = tf.placeholder(tf.int32, [None], 'start-index')
        self.end = tf.placeholder(tf.int32, [None], 'end-index')

        with tf.variable_scope('input-embedding'):
            c_w = tf.nn.embedding_lookup(self.word_embed, self.c_words)
            q_w = tf.nn.embedding_lookup(self.word_embed, self.q_words)

            c_ch = layers.char_embed(self.c_chars,
                                     self.char_embed,
                                     dropout=self.dropout)
            q_ch = layers.char_embed(self.q_chars,
                                     self.char_embed,
                                     dropout=self.dropout,
                                     reuse=True)

            # Word and character features are concatenated on the last axis.
            c = tf.concat([c_w, c_ch], -1)
            q = tf.concat([q_w, q_ch], -1)

        with tf.variable_scope('rnn'):
            c_rnn = layers.birnn(c, self.c_len, self.config.cell_size,
                                 self.config.cell_type, self.dropout)
            q_rnn = layers.birnn(q,
                                 self.q_len,
                                 self.config.cell_size,
                                 self.config.cell_type,
                                 self.dropout,
                                 reuse=True)

        with tf.variable_scope('attention'):
            attention = layers.bi_attention(c_rnn, q_rnn,
                                            layers.trilinear(c_rnn, q_rnn),
                                            self.c_mask, self.q_mask)
            # Kernel-size-1 convolution projecting to 2 * cell_size channels.
            attention = tf.layers.conv1d(attention,
                                         self.config.cell_size * 2,
                                         1,
                                         padding='same')

        with tf.variable_scope('memory1'):
            memory1 = layers.birnn(attention, self.c_len,
                                   self.config.cell_size,
                                   self.config.cell_type, self.dropout)

        with tf.variable_scope('self-attention'):
            x = memory1
            self_attention = layers.bi_attention(x,
                                                 x,
                                                 layers.trilinear(x, x),
                                                 self.c_mask,
                                                 self.c_mask,
                                                 only_c2q=True)
            res = tf.layers.dense(self_attention,
                                  self.config.cell_size * 2,
                                  activation=tf.nn.relu)
            # NOTE(review): this dropout reads config.dropout and
            # config.training, while the rest of the method uses
            # self.dropout -- confirm this is intended.
            res = tf.layers.dropout(res,
                                    rate=self.config.dropout,
                                    training=self.config.training)

            # Residual connection back to the attention projection.
            res += attention

        with tf.variable_scope('memory2'):
            memory2 = layers.birnn(res, self.c_len, self.config.cell_size,
                                   self.config.cell_type, self.dropout)

        with tf.variable_scope('start-index'):
            self.start_linear = tf.squeeze(tf.layers.dense(memory2, 1), -1)
            self.pred_start = tf.nn.softmax(self.start_linear)

        with tf.variable_scope('end-index'):
            # The end predictor sees the start logits next to memory2.
            end_input = tf.concat(
                [tf.expand_dims(self.start_linear, -1), memory2], -1)
            memory3 = layers.birnn(end_input, self.c_len,
                                   self.config.cell_size,
                                   self.config.cell_type, self.dropout)

            self.end_linear = tf.squeeze(tf.layers.dense(memory3, 1), -1)
            self.pred_end = tf.nn.softmax(self.end_linear)

        with tf.variable_scope('loss'):
            loss1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.start_linear, labels=self.start)
            loss2 = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.end_linear, labels=self.end)
            loss = tf.reduce_mean(loss1 + loss2)
            # L2 penalty over every trainable variable whose name does not
            # contain 'bias', scaled by config.l2.
            lossL2 = tf.add_n([
                tf.nn.l2_loss(v)
                for v in tf.trainable_variables() if 'bias' not in v.name
            ]) * self.config.l2
            self.loss = loss + lossL2

        with tf.variable_scope('optimizer'):
            optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
            trainable = tf.trainable_variables()
            grads = tf.gradients(self.loss, trainable)
            # Clip by global norm before applying the update.
            grads, _ = tf.clip_by_global_norm(grads, self.config.grad_clip)
            grads_and_vars = zip(grads, trainable)
            self.optimize = optimizer.apply_gradients(
                grads_and_vars, global_step=self.global_step)

        if self.config.ema_decay > 0:
            with tf.variable_scope('ema'):
                ema = tf.train.ExponentialMovingAverage(
                    decay=self.config.ema_decay)
                ema_op = ema.apply(tf.trainable_variables())
                with tf.control_dependencies([ema_op]):
                    # Evaluating the loss now also triggers the EMA update.
                    self.loss = tf.identity(self.loss)
                    assign_vars = []
                    for var in tf.global_variables():
                        v = ema.average(var)
                        # `average` returns None for variables without a
                        # shadow copy; compare against None explicitly
                        # instead of relying on the truthiness of a
                        # variable object.
                        if v is not None:
                            assign_vars.append(tf.assign(var, v))
                # Ops that copy each averaged value back into its variable.
                self.assign_vars = assign_vars
Пример #5
0
    def forward(self):
        """Build the forward graph, the loss and the optional EMA ops.

        The graph is assembled in five variable scopes: input embedding,
        embedding encoder, context-to-query attention, model encoder and
        output layer. Shape notes in the comments are carried over from the
        original author and use the example sizes noted next to the config
        values below.

        Attributes written: ``self.c2q``, ``self.q2c``, ``self.enc``,
        ``self.logits``, ``self.yp1``, ``self.yp2`` and ``self.loss``; a
        set of tensors is appended to ``self.debug_ops``. When
        ``config.decay`` is not None it additionally sets ``self.var_ema``,
        ``self.shadow_vars``, ``self.global_vars`` and ``self.assign_vars``.
        """
        config = self.config
        N = config.batch_size if not self.demo else 1
        PL = self.c_maxlen
        QL = self.q_maxlen
        CL = config.char_limit  # 16
        d = config.hidden  # 96
        dc = config.char_dim  # 64
        nh = config.num_heads  # 1

        with tf.variable_scope("Input_Embedding_Layer"):
            # self.ch : (N, c_maxlen, 16)
            # self.qh : (N, q_maxlen, 16)
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])  # (N*c_maxlen, 16, 64)
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])  # (N*q_maxlen, 16, 64)
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # BiDAF style conv-highway encoder: conv over chars in each word
            # in a batch of passages. The second call passes reuse=True with
            # the same name "char_conv".
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)  # (N*c_maxlen, 16-5+1, 96)
            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)  # (N*q_maxlen, 16-5+1, 96)

            ch_emb = tf.reduce_max(ch_emb, axis=1)  # (N*c_maxlen, 96)
            qh_emb = tf.reduce_max(qh_emb, axis=1)  # (N*q_maxlen, 96)

            ch_emb = tf.reshape(ch_emb,
                                [N, PL, ch_emb.shape[-1]])  # (N, c_maxlen, 96)
            # Use the question tensor's own channel size (the original read
            # it from ch_emb, which only worked because both come from the
            # same conv).
            qh_emb = tf.reshape(qh_emb,
                                [N, QL, qh_emb.shape[-1]])  # (N, q_maxlen, 96)

            # self.c : (N, c_maxlen)
            # self.q : (N, q_maxlen)
            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)  # (N, c_maxlen, 300)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)  # (N, q_maxlen, 300)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)  # (N, c_maxlen, 396)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)  # (N, q_maxlen, 396)

            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)  # (N, c_maxlen, 96)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)  # (N, q_maxlen, 96)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            # Per the original notes, each block is:
            #   positional encoding -> layer normalization
            #   -> depth-wise separable convolution -> self attention
            #   -> feed forward network
            # In the paper: the total number of encoder blocks is 1.
            # (N, c_maxlen, 96)
            c = residual_block(c_emb,
                               num_blocks=1,
                               num_conv_layers=4,
                               kernel_size=7,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Encoder_Residual_Block",
                               bias=False,
                               dropout=self.dropout)
            # (N, q_maxlen, 96)
            q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # tf.tile(input, multiples) replicates `input` multiples[i]
            # times along dimension i.
            #
            # Shapes (from the original notes):
            #   c:        (N, c_maxlen, d)
            #   q:        (N, q_maxlen, d)
            #   C:        (N, c_maxlen, q_maxlen, d)
            #   Q:        (N, c_maxlen, q_maxlen, d)
            #   S:        (N, c_maxlen, q_maxlen)
            #   mask_q:   (N, 1, q_maxlen)
            #   mask_c:   (N, c_maxlen, 1)
            #   S_:       (N, c_maxlen, q_maxlen)
            #   S_T:      (N, q_maxlen, c_maxlen)
            #   self.c2q: (N, c_maxlen, d) = tf.matmul(S_, q)
            #   self.q2c: (N, c_maxlen, d) = tf.matmul(tf.matmul(S_, S_T), c)
            C = tf.tile(tf.expand_dims(c, 2), [1, 1, self.q_maxlen, 1])
            Q = tf.tile(tf.expand_dims(q, 1), [1, self.c_maxlen, 1, 1])
            S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)

            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))

            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)

            attention_outputs = [c, self.c2q, c * self.c2q]
            if config.q2c:
                attention_outputs.append(c * self.q2c)

        with tf.variable_scope("Model_Encoder_Layer"):
            # Paper: the layer parameters are the same as the Embedding
            # Encoder Layer except that the convolution layer number is 2
            # within a block and the total number of blocks is 7.
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d,
                             name="input_projection")]  # d=hidden=96
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)
                # reuse is None on the first pass and True afterwards.
                self.enc.append(
                    residual_block(self.enc[i],
                                   num_blocks=7,
                                   num_conv_layers=2,
                                   kernel_size=5,
                                   mask=self.c_mask,
                                   num_filters=d,
                                   num_heads=nh,
                                   seq_len=self.c_len,
                                   scope="Model_Encoder",
                                   bias=False,
                                   reuse=True if i > 0 else None,
                                   dropout=self.dropout))

        with tf.variable_scope("Output_Layer"):
            # tf.matrix_band_part copies a tensor, setting everything
            # outside a central band in each innermost matrix to zero.
            #
            # Shapes (from the original notes):
            #   self.enc[i]:  (N, c_maxlen, d)
            #   start_logits: (N, c_maxlen)
            #   end_logits:   (N, c_maxlen)
            #   logits1:      (N, c_maxlen)
            #   logits2:      (N, c_maxlen)
            #   outer:        (N, c_maxlen, c_maxlen)
            #   yp1, yp2, losses, losses2: (N,)
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)
            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = self.logits

            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

            # Find the max-score span. The band part keeps only pairs with
            # 0 <= end - start <= 15.
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, 15)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

            # DEBUG
            self.debug_ops.extend([
                self.enc[1], start_logits, end_logits, logits1, logits2, outer,
                self.yp1, self.yp2, losses, losses2, self.loss
            ])

        if config.l2_norm is not None:
            # `regularizer` is not defined in this method; it must come from
            # the enclosing module.
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                # Evaluating the loss now also triggers the EMA update.
                self.loss = tf.identity(self.loss)

                self.shadow_vars = []
                self.global_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # `average` returns None for variables without a shadow
                    # copy; compare against None explicitly instead of
                    # relying on the truthiness of a variable object.
                    if v is not None:
                        self.shadow_vars.append(v)
                        self.global_vars.append(var)
                # Ops that copy each averaged value back into its variable.
                self.assign_vars = []
                for g, v in zip(self.global_vars, self.shadow_vars):
                    self.assign_vars.append(tf.assign(g, v))
Пример #6
0
    def forward(self):
        self.c_words = tf.placeholder(tf.int32,
                                      [None, self.config.context_len],
                                      'context-words')
        self.c_chars = tf.placeholder(
            tf.int32,
            [None, self.config.context_len, self.config.max_char_len],
            'context-chars')
        self.c_mask = tf.sign(self.c_words)

        self.q_words = tf.placeholder(tf.int32,
                                      [None, self.config.question_len],
                                      'query-words')
        self.q_chars = tf.placeholder(
            tf.int32,
            [None, self.config.question_len, self.config.max_char_len],
            'query-chars')
        self.q_mask = tf.sign(self.q_words)

        self.c_len = tf.cast(tf.reduce_sum(self.c_mask, -1), tf.int32)
        self.q_len = tf.cast(tf.reduce_sum(self.q_mask, -1), tf.int32)

        self.start = tf.placeholder(tf.int32, [None], 'start-index')
        self.end = tf.placeholder(tf.int32, [None], 'end-index')

        with tf.variable_scope('input-embedding'):
            c_w = tf.nn.embedding_lookup(self.word_embed, self.c_words)
            q_w = tf.nn.embedding_lookup(self.word_embed, self.q_words)

            c_ch = layers.char_embed(self.c_chars,
                                     self.char_embed,
                                     dropout=self.dropout)
            q_ch = layers.char_embed(self.q_chars,
                                     self.char_embed,
                                     dropout=self.dropout,
                                     reuse=True)

            c = tf.concat([c_w, c_ch], -1)
            q = tf.concat([q_w, q_ch], -1)

        with tf.variable_scope('highway-1'):
            c = layers.highway(c, self.config.embed_size, dropout=self.dropout)
            q = layers.highway(q,
                               self.config.embed_size,
                               dropout=self.dropout,
                               reuse=True)

        with tf.variable_scope('highway-2'):
            c = layers.highway(c, self.config.embed_size, dropout=self.dropout)
            q = layers.highway(q,
                               self.config.embed_size,
                               dropout=self.dropout,
                               reuse=True)

        with tf.variable_scope('projection'):
            c = tf.layers.conv1d(c, self.config.filters, 1, padding='same')
            q = tf.layers.conv1d(q,
                                 self.config.filters,
                                 1,
                                 padding='same',
                                 reuse=True)

        with tf.variable_scope('input-encoder'):
            c = layers.encoder_block(c,
                                     num_blocks=1,
                                     num_convolutions=4,
                                     kernel=7,
                                     mask=self.c_mask,
                                     dropout=self.dropout)

            q = layers.encoder_block(q,
                                     num_blocks=1,
                                     num_convolutions=4,
                                     kernel=7,
                                     mask=self.q_mask,
                                     dropout=self.dropout,
                                     reuse=True)

        with tf.variable_scope('attention'):
            attention = layers.bi_attention(c, q, layers.trilinear(c, q),
                                            self.c_mask, self.q_mask)
            attention = tf.layers.conv1d(attention,
                                         self.config.filters,
                                         1,
                                         padding='same')

        modeling = [attention]
        for i in range(3):
            reuse = i > 0
            m = layers.encoder_block(modeling[i],
                                     num_blocks=7,
                                     num_convolutions=2,
                                     kernel=5,
                                     mask=self.c_mask,
                                     dropout=self.dropout,
                                     reuse=reuse)
            if i % 2 == 0:
                m = tf.nn.dropout(m, 1.0 - self.dropout)
            modeling.append(m)

        with tf.variable_scope('start-index') as scope:
            self.start_linear = tf.concat([modeling[-3], modeling[-2]], -1)
            self.start_linear = tf.squeeze(
                tf.layers.dense(self.start_linear, 1, use_bias=False), -1)
            self.pred_start = tf.nn.softmax(self.start_linear,
                                            name='pred-start')

        with tf.variable_scope('end-index') as scope:
            self.end_linear = tf.concat([modeling[-3], modeling[-1]], -1)
            self.end_linear = tf.squeeze(
                tf.layers.dense(self.end_linear, 1, use_bias=False), -1)
            self.pred_end = tf.nn.softmax(self.end_linear, name='pred-end')

        with tf.variable_scope('loss') as scope:
            loss1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.start_linear, labels=self.start)
            loss2 = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.end_linear, labels=self.end)
            loss = tf.reduce_mean(loss1 + loss2)
            lossL2 = tf.add_n([
                tf.nn.l2_loss(v)
                for v in tf.trainable_variables() if 'bias' not in v.name
            ]) * self.config.l2
            self.loss = loss + lossL2

        with tf.variable_scope('optimizer') as scope:
            optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
            grads = tf.gradients(self.loss, tf.trainable_variables())
            grads, _ = tf.clip_by_global_norm(grads, self.config.grad_clip)
            grads_and_vars = zip(grads, tf.trainable_variables())
            self.optimize = optimizer.apply_gradients(
                grads_and_vars, global_step=self.global_step)

        if self.config.ema_decay > 0:
            with tf.variable_scope('ema') as scope:
                ema = tf.train.ExponentialMovingAverage(
                    decay=self.config.ema_decay)
                ema_op = ema.apply(tf.trainable_variables())
                with tf.control_dependencies([ema_op]):
                    self.loss = tf.identity(self.loss)
                    assign_vars = []
                    for var in tf.global_variables():
                        v = ema.average(var)
                        if v:
                            assign_vars.append(tf.assign(var, v))
                self.assign_vars = assign_vars