Example #1
def text_attention(inputs, att_dim, sequence_lengths, scope=None):
    # inputs: (batch, num_words, word_dim) word-level RNN states.
    assert len(
        inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None

    D_w = get_shape(inputs)[-1]  # word state dimension
    N_w = get_shape(inputs)[-2]  # number of words per sequence

    with tf.variable_scope(scope or 'text_attention'):
        # Project every word state into the attention space.
        W = tf.get_variable('W', shape=[D_w, att_dim])
        b = tf.get_variable('b', shape=[att_dim])
        input_proj = tf.nn.tanh(
            tf.matmul(tf.reshape(inputs, [-1, D_w]), W) + b)

        # Score each word against a learned context vector, then mask the
        # padded positions before the softmax.
        word_att_W = tf.get_variable(name='word_att_W', shape=[att_dim, 1])
        alpha = tf.matmul(input_proj, word_att_W)
        alpha = tf.reshape(alpha, shape=[-1, N_w])
        alpha = mask_score(alpha, sequence_lengths,
                           tf.constant(-1e15, dtype=tf.float32))
        alpha = tf.nn.softmax(alpha)

        # Attention-weighted sum of the word states: (batch, word_dim).
        outputs = tf.reduce_sum(inputs * tf.expand_dims(alpha, 2), axis=1)
        return outputs, alpha
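
The function above leans on two repository-level helpers that this page does not show, get_shape and mask_score. Below is a minimal sketch of what they presumably do under TensorFlow 1.x, written only so the example can be read (and run) in isolation; the real implementations may differ. The usage at the end uses hypothetical placeholder shapes.

import tensorflow as tf


def get_shape(tensor):
    # Static dimensions where known, dynamic tf.shape() entries otherwise.
    static = tensor.get_shape().as_list()
    dynamic = tf.shape(tensor)
    return [dynamic[i] if dim is None else dim for i, dim in enumerate(static)]


def mask_score(scores, sequence_lengths, score_mask_value):
    # Push scores at padded positions to a large negative value so the
    # subsequent softmax assigns them (near-)zero weight.
    mask = tf.sequence_mask(sequence_lengths, maxlen=tf.shape(scores)[1])
    return tf.where(mask, scores, score_mask_value * tf.ones_like(scores))


# Hypothetical usage: word states for a batch of padded sentences.
word_states = tf.placeholder(tf.float32, [None, 30, 200])
word_lengths = tf.placeholder(tf.int32, [None])
sentence_vectors, word_alphas = text_attention(
    word_states, att_dim=100, sequence_lengths=word_lengths)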
Example #2
    def _init_sent_encoder(self):
        with tf.variable_scope('sentence') as scope:
            sentence_rnn_inputs = tf.reshape(
                self.word_outputs,
                [-1, self.max_num_sents, 2 * self.hidden_dim])

            # sentence encoder
            cell_fw = rnn.GRUCell(self.hidden_dim)
            cell_bw = rnn.GRUCell(self.hidden_dim)

            init_state_fw = tf.tile(
                tf.get_variable('init_state_fw',
                                shape=[1, self.hidden_dim],
                                initializer=tf.constant_initializer(1.0)),
                multiples=[get_shape(sentence_rnn_inputs)[0], 1])
            init_state_bw = tf.tile(
                tf.get_variable('init_state_bw',
                                shape=[1, self.hidden_dim],
                                initializer=tf.constant_initializer(1.0)),
                multiples=[get_shape(sentence_rnn_inputs)[0], 1])

            sentence_rnn_outputs, _ = bidirectional_rnn(
                cell_fw=cell_fw,
                cell_bw=cell_bw,
                inputs=sentence_rnn_inputs,
                input_lengths=self.document_lengths,
                initial_state_fw=init_state_fw,
                initial_state_bw=init_state_bw,
                scope=scope)

            self.sentence_outputs, self.sent_att_weights, self.img_att_weights = visual_aspect_attention(
                text_input=sentence_rnn_outputs,
                visual_input=self.images,
                att_dim=self.att_dim,
                sequence_lengths=self.document_lengths)

            self.sentence_outputs = tf.nn.dropout(
                self.sentence_outputs, keep_prob=self.dropout_keep_prob)
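
Examples #2 and #3 both call a bidirectional_rnn wrapper defined elsewhere in the repository. A plausible minimal version, assuming TensorFlow 1.x and that the wrapper simply concatenates the forward and backward GRU outputs along the feature axis (which matches the 2 * self.hidden_dim reshape above), could look like this:

import tensorflow as tf


def bidirectional_rnn(cell_fw, cell_bw, inputs, input_lengths,
                      initial_state_fw=None, initial_state_bw=None,
                      scope=None):
    # Run both directions and concatenate per-step outputs, giving
    # 2 * hidden_dim features per time step; final states are concatenated too.
    (fw_out, bw_out), (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=cell_fw,
        cell_bw=cell_bw,
        inputs=inputs,
        sequence_length=input_lengths,
        initial_state_fw=initial_state_fw,
        initial_state_bw=initial_state_bw,
        dtype=tf.float32,
        scope=scope)
    outputs = tf.concat([fw_out, bw_out], axis=2)
    final_state = tf.concat([fw_state, bw_state], axis=1)
    return outputs, final_state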
Example #3
    def _init_word_encoder(self):
        with tf.variable_scope('word') as scope:
            word_rnn_inputs = tf.reshape(
                self.embedded_inputs, [-1, self.max_num_words, self.emb_size])
            sentence_lengths = tf.reshape(self.sentence_lengths, [-1])

            # word encoder
            cell_fw = rnn.GRUCell(self.hidden_dim)
            cell_bw = rnn.GRUCell(self.hidden_dim)

            init_state_fw = tf.tile(
                tf.get_variable('init_state_fw',
                                shape=[1, self.hidden_dim],
                                initializer=tf.constant_initializer(1.0)),
                multiples=[get_shape(word_rnn_inputs)[0], 1])
            init_state_bw = tf.tile(
                tf.get_variable('init_state_bw',
                                shape=[1, self.hidden_dim],
                                initializer=tf.constant_initializer(1.0)),
                multiples=[get_shape(word_rnn_inputs)[0], 1])

            word_rnn_outputs, _ = bidirectional_rnn(
                cell_fw=cell_fw,
                cell_bw=cell_bw,
                inputs=word_rnn_inputs,
                input_lengths=sentence_lengths,
                initial_state_fw=init_state_fw,
                initial_state_bw=init_state_bw,
                scope=scope)

            self.word_outputs, self.word_att_weights = text_attention(
                inputs=word_rnn_outputs,
                att_dim=self.att_dim,
                sequence_lengths=sentence_lengths)

            self.word_outputs = tf.nn.dropout(self.word_outputs,
                                              keep_prob=self.dropout_keep_prob)
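
To see how Examples #2 and #3 fit together, it helps to trace the reshapes with concrete sizes: the word encoder treats every sentence as its own sequence, and the sentence encoder folds the pooled sentence vectors back under their documents. The sizes below are hypothetical and purely illustrative.

import tensorflow as tf

batch_size, max_num_sents, max_num_words = 4, 6, 30   # hypothetical sizes
emb_size, hidden_dim = 200, 50

embedded_inputs = tf.placeholder(
    tf.float32, [batch_size, max_num_sents, max_num_words, emb_size])

# _init_word_encoder: one row per sentence -> (24, 30, 200)
word_rnn_inputs = tf.reshape(embedded_inputs, [-1, max_num_words, emb_size])

# text_attention pools each sentence's word states into a single vector;
# a zero tensor stands in for the real encoder output here -> (24, 100)
word_outputs = tf.zeros([batch_size * max_num_sents, 2 * hidden_dim])

# _init_sent_encoder: fold sentences back under their documents -> (4, 6, 100)
sentence_rnn_inputs = tf.reshape(
    word_outputs, [-1, max_num_sents, 2 * hidden_dim])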
Example #4
def visual_aspect_attention(
        text_input,  # (b, n_s, d_text)
        visual_input,  # (b, n_i, d)
        att_dim,  # d
        sequence_lengths,
        scope='visual_aspect_attention'):
    assert len(text_input.get_shape()) == 3 and text_input.get_shape(
    )[-1].value is not None
    assert len(visual_input.get_shape()) == 3 and visual_input.get_shape(
    )[-1].value is not None

    D_t = get_shape(text_input)[-1]
    N_s = get_shape(text_input)[-2]
    D_i = get_shape(visual_input)[-1]
    N_i = get_shape(visual_input)[-2]

    with tf.variable_scope(scope):
        # Sentence-level attention
        W_s = tf.get_variable('W_s', shape=[D_t, att_dim])
        b_s = tf.get_variable('b_s', shape=[att_dim])
        text_input = tf.reshape(text_input, [-1, D_t])
        q = tf.nn.tanh(tf.matmul(text_input, W_s) + b_s)
        q = tf.reshape(q, [-1, 1, N_s, att_dim])

        W_i = tf.get_variable('W_i', shape=[D_i, att_dim])
        b_i = tf.get_variable('b_i', shape=[att_dim])
        visual_input = tf.reshape(visual_input, [-1, D_i])
        p = tf.nn.tanh(tf.matmul(visual_input, W_i) + b_i)
        p = tf.reshape(p, [-1, N_i, 1, att_dim])

        # Fuse each image projection with every sentence projection via
        # broadcasting: (b, N_i, N_s, att_dim).
        context = tf.multiply(q, p) + q
        context = tf.reshape(context, [-1, att_dim])

        sent_att_W = tf.get_variable(name='sent_att_W', shape=[att_dim, 1])

        beta = tf.matmul(context, sent_att_W)
        beta = tf.reshape(beta, shape=[-1, N_s])

        # Each (document, image) pair keeps the document's sentence count,
        # so the mask can be applied per image.
        sequence_lengths = tf.tile(tf.expand_dims(sequence_lengths, axis=1),
                                   [1, N_i])
        sequence_lengths = tf.reshape(sequence_lengths, [-1])
        beta = mask_score(beta, sequence_lengths,
                          tf.constant(-1e15, dtype=tf.float32))
        beta = tf.nn.softmax(beta)

        beta = tf.reshape(beta, [-1, N_i, N_s, 1])
        text_input = tf.reshape(text_input, [-1, 1, N_s, D_t])
        weighted_docs = tf.reduce_sum(text_input * beta, axis=2)  # (b, n_i, d)

        # Document-level attention
        W_d = tf.get_variable(name='W_d', shape=[D_t, att_dim])
        b_d = tf.get_variable(name='b_d', shape=[1])
        weighted_docs = tf.reshape(weighted_docs, [-1, D_t])
        doc_proj = tf.nn.tanh(tf.matmul(weighted_docs, W_d) + b_d)

        doc_att_W = tf.get_variable(name='doc_att_W', shape=[att_dim, 1])

        gamma = tf.matmul(doc_proj, doc_att_W)
        gamma = tf.reshape(gamma, shape=[-1, N_i])
        gamma = tf.nn.softmax(gamma)

        weighted_docs = tf.reshape(weighted_docs, [-1, N_i, D_t])
        final_outputs = tf.reduce_sum(weighted_docs * tf.expand_dims(gamma, 2),
                                      axis=1)  # (b, d)

        return final_outputs, beta, gamma
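
As a usage sketch (the shapes are hypothetical), the function takes the sentence-level RNN states and one feature vector per image, and returns a single document vector together with both sets of attention weights:

import tensorflow as tf

sent_states = tf.placeholder(tf.float32, [None, 6, 100])    # (b, n_s, d_text)
image_feats = tf.placeholder(tf.float32, [None, 3, 4096])   # (b, n_i, d_img)
doc_lengths = tf.placeholder(tf.int32, [None])              # sentences per doc

doc_vecs, sent_betas, img_gammas = visual_aspect_attention(
    text_input=sent_states,
    visual_input=image_feats,
    att_dim=100,
    sequence_lengths=doc_lengths)
# doc_vecs:   (b, 100)      final document representation
# sent_betas: (b, 3, 6, 1)  sentence weights, one distribution per image
# img_gammas: (b, 3)        image weights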