예제 #1
0
    def _call(self, inputs):
        """Aggregate neighbour features by their mean and merge with the node's own.

        Args:
            inputs: a pair ``(self_vecs, neigh_vecs)``. ``neigh_vecs`` is
                averaged over axis 1 (presumably the neighbour axis, giving
                one vector per node -- TODO confirm against caller).

        Returns:
            A tuple ``(self.act(output), self.output_dim)``.
        """
        node_feats, neighbour_feats = inputs

        # Dropout is applied only in training mode; ``self.dropout`` is the
        # drop rate, so the keep probability is its complement.
        if self.mode == "train":
            neighbour_feats = tf.nn.dropout(neighbour_feats, 1 - self.dropout)
            node_feats = tf.nn.dropout(node_feats, 1 - self.dropout)

        # A plain mean is used here; the original author notes that
        # reduce_mean performs better than the masked mean_pool variant.
        neighbour_mean = tf.reduce_mean(neighbour_feats, axis=1)

        # [nodes] x [out_dim]
        projected_neigh = tf.matmul(neighbour_mean, self.vars['neigh_weights'])

        if self.if_use_high_way:
            with tf.variable_scope("fw_hidden_highway"):
                # NOTE(review): the highway output is not used below; the call
                # is kept because it still adds variables/ops to the graph.
                multi_highway_layer(projected_neigh, self.neigh_input_dim, 1)

        projected_self = tf.matmul(node_feats, self.vars["self_weights"])

        # Merge the two projections: concatenate along the feature axis when
        # ``self.concat`` is set, otherwise sum them element-wise.
        pieces = [projected_self, projected_neigh]
        if self.concat:
            merged = tf.concat(pieces, axis=1)
        else:
            merged = tf.add_n(pieces)

        # bias
        if self.bias:
            merged += self.vars['bias']

        return self.act(merged), self.output_dim
예제 #2
0
	def build_network(self):
		"""Build the input-highway, bilateral-matching and prediction sub-graphs.

		Reads ``self.config`` (keys ``"options"``, ``"l2_reg"``, ``"mode"`` and
		the optional ``"weight_decay"``), the sentence embeddings
		``self.s1_emb`` / ``self.s2_emb`` and the token length/mask tensors.

		Sets:
			self.output_features: the matching representation returned by
				``match_utils.bilateral_match_func``.
			self.estimation: un-normalised class scores.
			self.pred_probs: softmax over ``self.estimation``.
			self.logits: int32 argmax of ``self.pred_probs`` over the last
				axis (i.e. predicted class ids, despite the attribute name).
		"""
		self.options = self.config["options"]
		self.options["batch_size"] = self.batch_size
		self.highway_layer_num = self.options["highway_layer_num"]
		self.with_highway = self.options["with_highway"]
		self.wd = self.config.get("weight_decay", None)
		self.l2_reg = float(self.config["l2_reg"])

		in_question_repres = tf.nn.dropout(self.s1_emb, self.dropout_keep_prob)
		in_passage_repres = tf.nn.dropout(self.s2_emb, self.dropout_keep_prob)

		input_dim = self.emb_size

		# ======Highway layer======
		if self.with_highway:
			with tf.variable_scope(self.scope+"-input_highway"):
				in_question_repres = match_utils.multi_highway_layer(in_question_repres, input_dim, self.highway_layer_num)
				# Switch the scope to reuse mode so the passage goes through
				# the same highway variables as the question.
				tf.get_variable_scope().reuse_variables()
				in_passage_repres = match_utils.multi_highway_layer(in_passage_repres, input_dim, self.highway_layer_num)

		# ========Bilateral Matching=====
		with tf.variable_scope(self.scope+"-bilateral_matching"):
			# NOTE(review): a keep probability is passed as ``dropout_rate``;
			# confirm which convention bilateral_match_func expects.
			(match_representation, match_dim) = match_utils.bilateral_match_func(
						in_question_repres, in_passage_repres,
						self.sent1_token_len, self.sent2_token_len,
						self.sent1_token_mask, self.sent2_token_mask, input_dim, self.config["mode"],
						options=self.options, dropout_rate=self.dropout_keep_prob)
			self.output_features = match_representation

		#========Prediction Layer=========
		with tf.variable_scope(self.scope+"-prediction"):
			# match_dim = 4 * self.options.aggregation_lstm_dim
			# Floor division keeps the hidden width an integer on Python 3;
			# ``match_dim / 2`` would be a float there and is not a valid
			# variable shape.
			hidden_dim = match_dim // 2
			w_0 = tf.get_variable("w_0", [match_dim, hidden_dim], dtype=tf.float32)
			b_0 = tf.get_variable("b_0", [hidden_dim], dtype=tf.float32)
			w_1 = tf.get_variable("w_1", [hidden_dim, self.num_classes], dtype=tf.float32)
			b_1 = tf.get_variable("b_1", [self.num_classes], dtype=tf.float32)

			# Two-layer classifier: tanh hidden layer with dropout, then a
			# linear projection to class scores.
			logits = tf.matmul(match_representation, w_0) + b_0
			logits = tf.tanh(logits)
			logits = tf.nn.dropout(logits, (self.dropout_keep_prob))
			self.estimation = tf.matmul(logits, w_1) + b_1

			self.pred_probs = tf.contrib.layers.softmax(self.estimation)
			self.logits = tf.cast(tf.argmax(self.pred_probs, -1), tf.int32)

			match_utils.add_reg_without_bias(self.scope)
예제 #3
0
 def add_highway_layer(self,
                       highway_layer_num,
                       tied_aggre=False,
                       reuse_question=None,
                       reuse_choice=None):
     """Pass the question and choice representations through highway layers.

     Each representation is transformed in its own variable scope and only
     if its dimension attribute is positive.

     Args:
         highway_layer_num: number of layers handed to
             ``multi_highway_layer``.
         tied_aggre: when true the scope prefix is the fixed
             ``'matching_highway'``; otherwise ``self.matching_id`` is
             appended to it.
         reuse_question: ``reuse`` flag for the question variable scope.
         reuse_choice: ``reuse`` flag for the choice variable scope.
     """
     # Add highway layer on top of matching layer
     scope_prefix = ('matching_highway' if tied_aggre else
                     'matching_highway_{}'.format(self.matching_id))

     if self.question_repre_dim > 0:
         question_scope = "{}_ques".format(scope_prefix)
         with tf.variable_scope(question_scope, reuse=reuse_question):
             self.question_repre = multi_highway_layer(
                 self.question_repre,
                 self.question_repre_dim,
                 highway_layer_num)

     if self.choice_repre_dim > 0:
         choice_scope = "{}_choice".format(scope_prefix)
         with tf.variable_scope(choice_scope, reuse=reuse_choice):
             self.choice_repre = multi_highway_layer(
                 self.choice_repre,
                 self.choice_repre_dim,
                 highway_layer_num)
예제 #4
0
 def add_aggregation_highway(self,
                             highway_layer_num,
                             tied_aggre=False,
                             reuse=None):
     """Apply highway layers to ``self.aggregation_representation`` in place.

     The representation is reshaped to ``[1, batch, aggregation_dim]``,
     passed through ``multi_highway_layer`` and reshaped back to
     ``[batch, aggregation_dim]`` (presumably because the helper expects a
     rank-3 input -- TODO confirm).

     Args:
         highway_layer_num: number of layers handed to
             ``multi_highway_layer``.
         tied_aggre: accepted but currently unused; the scope name is always
             ``'aggre_highway'``.
         reuse: ``reuse`` flag for the variable scope.
     """
     # Add aggregation highway layer (after aggregation LSTM)
     with tf.variable_scope('aggre_highway', reuse=reuse):
         feature_dim = self.aggregation_dim
         # Dynamic size of the leading axis, needed to restore the shape.
         num_rows = tf.shape(self.aggregation_representation)[0]

         as_sequence = tf.reshape(self.aggregation_representation,
                                  [1, num_rows, feature_dim])
         transformed = multi_highway_layer(as_sequence, feature_dim,
                                           highway_layer_num)
         self.aggregation_representation = tf.reshape(
             transformed, [num_rows, feature_dim])
0
    def encode(self, is_training=True):
        """Build the passage encoder graph and the decoder's initial state.

        The passage input is the concatenation of every enabled feature
        (word, char-LSTM, POS, NER embeddings), optionally compressed by a
        tanh projection, then fed through ``options.context_layer_num``
        stacked bidirectional LSTM layers whose outputs are concatenated.

        Preconditions (not checked here): at least one input feature must be
        enabled, otherwise ``tf.concat`` receives an empty list and
        ``passage_len`` is undefined; and ``options.context_layer_num`` must
        be at least 1, otherwise the final LSTM states stay ``None``.

        Args:
            is_training: when true, dropout is applied to the input
                representation and to the outputs of the char and context
                LSTM cells.

        Returns:
            A tuple ``(passage_dim, all_passage_representation, init_state)``
            where ``passage_dim`` is ``2 * context_lstm_dim`` summed over all
            context layers, ``all_passage_representation`` is the masked
            concatenation of all layer outputs, and ``init_state`` is an
            ``LSTMStateTuple`` derived from the last layer's final states.
        """
        options = self.options

        # ======word representation layer======
        in_passage_repres = []
        input_dim = 0
        if options.with_word and self.word_vocab is not None:
            # Fixed word vectors are kept on the CPU; trainable ones on GPU 0.
            word_vec_trainable = True
            cur_device = '/gpu:0'
            if options.fix_word_vec:
                word_vec_trainable = False
                cur_device = '/cpu:0'
            with tf.variable_scope("embedding"), tf.device(cur_device):
                self.word_embedding = tf.get_variable(
                    "word_embedding",
                    trainable=word_vec_trainable,
                    initializer=tf.constant(self.word_vocab.word_vecs),
                    dtype=tf.float32)

            in_passage_word_repres = tf.nn.embedding_lookup(
                self.word_embedding, self.in_passage_words)
            # [batch_size, passage_len, word_dim]
            in_passage_repres.append(in_passage_word_repres)

            input_shape = tf.shape(self.in_passage_words)
            batch_size = input_shape[0]
            passage_len = input_shape[1]
            input_dim += self.word_vocab.word_dim

        if options.with_char and self.char_vocab is not None:
            input_shape = tf.shape(self.in_passage_chars)
            batch_size = input_shape[0]
            passage_len = input_shape[1]
            p_char_len = input_shape[2]
            char_dim = self.char_vocab.word_dim
            self.char_embedding = tf.get_variable(
                "char_embedding",
                initializer=tf.constant(self.char_vocab.word_vecs),
                dtype=tf.float32)

            in_passage_char_repres = tf.nn.embedding_lookup(
                self.char_embedding, self.in_passage_chars
            )  # [batch_size, passage_len, p_char_len, char_dim]
            # Flatten tokens so every token is one sequence of characters.
            in_passage_char_repres = tf.reshape(
                in_passage_char_repres, shape=[-1, p_char_len, char_dim])
            passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1])
            with tf.variable_scope('char_lstm'):
                # lstm cell
                char_lstm_cell = tf.contrib.rnn.BasicLSTMCell(
                    options.char_lstm_dim)
                # dropout
                if is_training:
                    char_lstm_cell = tf.contrib.rnn.DropoutWrapper(
                        char_lstm_cell,
                        output_keep_prob=(1 - options.dropout_rate))
                char_lstm_cell = tf.contrib.rnn.MultiRNNCell([char_lstm_cell])

                # passage representation
                passage_char_outputs = tf.nn.dynamic_rnn(
                    char_lstm_cell,
                    in_passage_char_repres,
                    sequence_length=passage_char_lengths,
                    dtype=tf.float32)[0]
                # [batch_size*passage_len, p_char_len, char_lstm_dim]
                # Pick the output at index (length - 1) of every token.
                passage_char_outputs = collect_final_step_lstm(
                    passage_char_outputs, passage_char_lengths - 1)
                passage_char_outputs = tf.reshape(
                    passage_char_outputs,
                    [batch_size, passage_len, options.char_lstm_dim])

            in_passage_repres.append(passage_char_outputs)
            input_dim += options.char_lstm_dim

        if options.with_POS and self.POS_vocab is not None:
            self.POS_embedding = tf.get_variable("POS_embedding",
                                                 initializer=tf.constant(
                                                     self.POS_vocab.word_vecs),
                                                 dtype=tf.float32)

            in_passage_POS_repres = tf.nn.embedding_lookup(
                self.POS_embedding,
                self.in_passage_POSs)  # [batch_size, passage_len, POS_dim]
            in_passage_repres.append(in_passage_POS_repres)

            input_shape = tf.shape(self.in_passage_POSs)
            batch_size = input_shape[0]
            passage_len = input_shape[1]
            input_dim += self.POS_vocab.word_dim

        if options.with_NER and self.NER_vocab is not None:
            self.NER_embedding = tf.get_variable("NER_embedding",
                                                 initializer=tf.constant(
                                                     self.NER_vocab.word_vecs),
                                                 dtype=tf.float32)

            in_passage_NER_repres = tf.nn.embedding_lookup(
                self.NER_embedding,
                self.in_passage_NERs)  # [batch_size, passage_len, NER_dim]
            in_passage_repres.append(in_passage_NER_repres)

            input_shape = tf.shape(self.in_passage_NERs)
            batch_size = input_shape[0]
            passage_len = input_shape[1]
            input_dim += self.NER_vocab.word_dim

        in_passage_repres = tf.concat(in_passage_repres,
                                      2)  # [batch_size, passage_len, dim]

        if options.compress_input:  # compress input word vector into smaller vectors
            w_compress = tf.get_variable(
                "w_compress_input", [input_dim, options.compress_input_dim],
                dtype=tf.float32)
            b_compress = tf.get_variable("b_compress_input",
                                         [options.compress_input_dim],
                                         dtype=tf.float32)

            # Flatten to 2-D for the matmul, then restore the 3-D layout.
            in_passage_repres = tf.reshape(in_passage_repres, [-1, input_dim])
            in_passage_repres = tf.matmul(in_passage_repres,
                                          w_compress) + b_compress
            in_passage_repres = tf.tanh(in_passage_repres)
            in_passage_repres = tf.reshape(
                in_passage_repres,
                [batch_size, passage_len, options.compress_input_dim])
            input_dim = options.compress_input_dim

        if is_training:
            in_passage_repres = tf.nn.dropout(in_passage_repres,
                                              (1 - options.dropout_rate))

        passage_mask = tf.sequence_mask(
            self.passage_lengths, passage_len,
            dtype=tf.float32)  # [batch_size, passage_len]

        # sequential context matching
        passage_forward = None
        passage_backward = None
        all_passage_representation = []
        passage_dim = 0
        with_lstm = True
        if with_lstm:
            with tf.variable_scope('biLSTM'):
                cur_in_passage_repres = in_passage_repres
                # ``range`` instead of the Python-2-only ``xrange``; the layer
                # count is small, so this is equivalent on both versions.
                for i in range(options.context_layer_num):
                    with tf.variable_scope('layer-{}'.format(i)):
                        with tf.variable_scope('context_represent'):
                            # parameters
                            context_lstm_cell_fw = tf.contrib.rnn.LSTMCell(
                                options.context_lstm_dim)
                            context_lstm_cell_bw = tf.contrib.rnn.LSTMCell(
                                options.context_lstm_dim)
                            if is_training:
                                context_lstm_cell_fw = tf.contrib.rnn.DropoutWrapper(
                                    context_lstm_cell_fw,
                                    output_keep_prob=(1 -
                                                      options.dropout_rate))
                                context_lstm_cell_bw = tf.contrib.rnn.DropoutWrapper(
                                    context_lstm_cell_bw,
                                    output_keep_prob=(1 -
                                                      options.dropout_rate))

                            # passage representation
                            ((passage_context_representation_fw,
                              passage_context_representation_bw),
                             (passage_forward, passage_backward
                              )) = tf.nn.bidirectional_dynamic_rnn(
                                  context_lstm_cell_fw,
                                  context_lstm_cell_bw,
                                  cur_in_passage_repres,
                                  dtype=tf.float32,
                                  sequence_length=self.passage_lengths
                              )  # [batch_size, passage_len, context_lstm_dim]
                            # [batch_size, passage_len, 2*context_lstm_dim]
                            # Each layer's output feeds the next layer and is
                            # also collected for the final concatenation.
                            cur_in_passage_repres = tf.concat([
                                passage_context_representation_fw,
                                passage_context_representation_bw
                            ], 2)
                            passage_dim += 2 * options.context_lstm_dim
                            all_passage_representation.append(
                                cur_in_passage_repres)

        all_passage_representation = tf.concat(
            all_passage_representation,
            2)  # [batch_size, passage_len, passage_dim]

        # ======Highway layer======
        if options.with_match_highway:
            with tf.variable_scope("context_highway"):
                all_passage_representation = match_utils.multi_highway_layer(
                    all_passage_representation, passage_dim,
                    options.highway_layer_num)

        # Zero out positions beyond each passage's length.
        all_passage_representation = all_passage_representation * tf.expand_dims(
            passage_mask, axis=-1)

        # initial state for the LSTM decoder
        with tf.variable_scope('initial_state_for_decoder'):
            # Define weights and biases to reduce the cell and reduce the state
            w_reduce_c = tf.get_variable(
                'w_reduce_c',
                [2 * options.context_lstm_dim, options.gen_hidden_size],
                dtype=tf.float32)
            w_reduce_h = tf.get_variable(
                'w_reduce_h',
                [2 * options.context_lstm_dim, options.gen_hidden_size],
                dtype=tf.float32)
            bias_reduce_c = tf.get_variable('bias_reduce_c',
                                            [options.gen_hidden_size],
                                            dtype=tf.float32)
            bias_reduce_h = tf.get_variable('bias_reduce_h',
                                            [options.gen_hidden_size],
                                            dtype=tf.float32)

            # Final forward/backward states of the LAST context layer.
            old_c = tf.concat(values=[passage_forward.c, passage_backward.c],
                              axis=1)
            old_h = tf.concat(values=[passage_forward.h, passage_backward.h],
                              axis=1)
            new_c = tf.nn.tanh(tf.matmul(old_c, w_reduce_c) + bias_reduce_c)
            new_h = tf.nn.tanh(tf.matmul(old_h, w_reduce_h) + bias_reduce_h)

            init_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h)
        # Disabled alternative kept from the original source: start the
        # decoder from an all-zero state of shape
        # [batch_size, options.gen_hidden_size] instead of the projected
        # encoder states.
        return (passage_dim, all_passage_representation, init_state)
예제 #6
0
    def __init__(self,
                 num_classes,
                 word_vocab=None,
                 char_vocab=None,
                 POS_vocab=None,
                 NER_vocab=None,
                 dropout_rate=0.5,
                 learning_rate=0.001,
                 optimize_type='adam',
                 lambda_l2=1e-5,
                 with_word=True,
                 with_char=True,
                 with_POS=True,
                 with_NER=True,
                 char_lstm_dim=20,
                 context_lstm_dim=100,
                 aggregation_lstm_dim=200,
                 is_training=True,
                 filter_layer_threshold=0.2,
                 MP_dim=50,
                 context_layer_num=1,
                 aggregation_layer_num=1,
                 fix_word_vec=False,
                 with_filter_layer=True,
                 with_highway=False,
                 word_level_MP_dim=-1,
                 sep_endpoint=False,
                 end_model_combine=False,
                 with_match_highway=False,
                 with_aggregation_highway=False,
                 highway_layer_num=1,
                 match_to_passage=True,
                 match_to_question=False,
                 match_to_choice=False,
                 with_no_match=False,
                 with_full_match=True,
                 with_maxpool_match=True,
                 with_attentive_match=True,
                 with_max_attentive_match=True,
                 use_options=False,
                 num_options=-1,
                 verbose=False,
                 matching_option=0,
                 concat_context=False,
                 tied_aggre=False,
                 rl_training_method='contrastive',
                 rl_matches=None,
                 cond_training=False,
                 reasonet_training=False,
                 reasonet_steps=5,
                 reasonet_hidden_dim=128,
                 reasonet_lambda=10,
                 reasonet_terminate_mode='original',
                 reasonet_keep_first=False,
                 efficient=False,
                 reasonet_logit_combine='sum',
                 tied_match=False):
        ''' Matching Options:
        0:a1=q->p, a2=c->p, [concat(a1->a2,a2->a1)]
        1:a1=q->p, a2=c->p, [a1->a2,a2->a1]
        2:[q->p,c->p]
        3:a1=p->q, a2=p->c, [a1->a2,a2->a1]
        4:[q->p,p->q,p->c]
        5:a1=q->p, a2=p->q, a3=p->c,[a3->a1,a3->a2]
        6:[p->q,p->c]
        7: Gated matching
            concat_context: Concat question & choice and feed into context LSTM
            tied_aggre: aggregation layer weights are tied.
            training_method: contrastive reward or policy gradient or soft voting
        Efficiency options:
        cond_training: use a tensorflow boolean to control whether to dropout
        efficient: the feed_dict will contain each passage only once, with the choice in the format of [num_gates*batch_size, dim]

        RL training method:
        soft_voting: Simple voting training without RL
        contrastive: Basic contrastive reward
        contrastive_imp: Use (r/b-1) instead of (r-b) as in ReasoNet.

        Reasonet module options:
        r_steps: reasonet reading steps
        r_hidden_dim: When calculating distance, the two repre are linear mapped to this dimension.
        lambda: multiplier for the terminate gate
        terminate_mode: original for using 0-1 terminate gate, softmax for using a softmax over all possible steps
        keep_first: feed reasonet step 0 (the initial state) into the prediction module
        logit_combine: When deciding whether to stop reading on a question, use voting from all questions (sum) 
                       or max activation of all questions(max_pooling) 
        tied_match: Matching layer weights are tied.

        '''
        reasonet_calculated_steps = reasonet_steps + 1 if reasonet_keep_first else reasonet_steps

        # ======word representation layer======

        in_question_repres = []
        in_passage_repres = []
        in_choice_repres = []
        self.question_lengths = tf.placeholder(tf.int32, [None])
        self.passage_lengths = tf.placeholder(tf.int32, [None])
        self.choice_lengths = tf.placeholder(tf.int32, [None])
        self.truth = tf.placeholder(tf.int32, [None])  # [batch_size]
        if cond_training:
            self.is_training = tf.placeholder(tf.bool, [])
        else:
            self.is_training = is_training
        self.concat_idx_mat = None
        self.split_idx_mat_q = None
        self.split_idx_mat_c = None
        if matching_option == 7:
            self.concat_idx_mat = tf.placeholder(tf.int32, [None, None, 2],
                                                 name='concat_idx_mat')
            if concat_context:
                self.split_idx_mat_q = tf.placeholder(tf.int32,
                                                      [None, None, 2])
                self.split_idx_mat_c = tf.placeholder(tf.int32,
                                                      [None, None, 2])
        input_dim = 0
        if with_word and word_vocab is not None:
            self.in_question_words = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, question_len]
            self.in_passage_words = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, passage_len]
            self.in_choice_words = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, passage_len]
            #             self.word_embedding = tf.get_variable("word_embedding", shape=[word_vocab.size()+1, word_vocab.word_dim], initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32)
            word_vec_trainable = True
            cur_device = '/gpu:0'
            if fix_word_vec:
                word_vec_trainable = False
                cur_device = '/cpu:0'
            print('!!!shape=', word_vocab.word_vecs.shape)
            with tf.device(cur_device):
                self.word_embedding = tf.get_variable(
                    "word_embedding",
                    trainable=word_vec_trainable,
                    initializer=tf.constant(word_vocab.word_vecs),
                    dtype=tf.float32)

            in_question_word_repres = tf.nn.embedding_lookup(
                self.word_embedding,
                self.in_question_words)  # [batch_size, question_len, word_dim]
            in_passage_word_repres = tf.nn.embedding_lookup(
                self.word_embedding,
                self.in_passage_words)  # [batch_size, passage_len, word_dim]
            in_choice_word_repres = tf.nn.embedding_lookup(
                self.word_embedding,
                self.in_choice_words)  # [batch_size, passage_len, word_dim]
            in_question_repres.append(in_question_word_repres)
            in_passage_repres.append(in_passage_word_repres)
            in_choice_repres.append(in_choice_word_repres)

            input_shape = tf.shape(self.in_question_words)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_words)
            passage_len = input_shape[1]
            input_shape = tf.shape(self.in_choice_words)
            choice_len = input_shape[1]
            input_dim += word_vocab.word_dim

        if with_POS and POS_vocab is not None:
            self.in_question_POSs = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, question_len]
            self.in_passage_POSs = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, passage_len]
            #             self.POS_embedding = tf.get_variable("POS_embedding", shape=[POS_vocab.size()+1, POS_vocab.word_dim], initializer=tf.constant(POS_vocab.word_vecs), dtype=tf.float32)
            self.POS_embedding = tf.get_variable("POS_embedding",
                                                 initializer=tf.constant(
                                                     POS_vocab.word_vecs),
                                                 dtype=tf.float32)

            in_question_POS_repres = tf.nn.embedding_lookup(
                self.POS_embedding,
                self.in_question_POSs)  # [batch_size, question_len, POS_dim]
            in_passage_POS_repres = tf.nn.embedding_lookup(
                self.POS_embedding,
                self.in_passage_POSs)  # [batch_size, passage_len, POS_dim]
            in_question_repres.append(in_question_POS_repres)
            in_passage_repres.append(in_passage_POS_repres)

            input_shape = tf.shape(self.in_question_POSs)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_POSs)
            passage_len = input_shape[1]
            input_dim += POS_vocab.word_dim

        if with_NER and NER_vocab is not None:
            self.in_question_NERs = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, question_len]
            self.in_passage_NERs = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, passage_len]
            #             self.NER_embedding = tf.get_variable("NER_embedding", shape=[NER_vocab.size()+1, NER_vocab.word_dim], initializer=tf.constant(NER_vocab.word_vecs), dtype=tf.float32)
            self.NER_embedding = tf.get_variable("NER_embedding",
                                                 initializer=tf.constant(
                                                     NER_vocab.word_vecs),
                                                 dtype=tf.float32)

            in_question_NER_repres = tf.nn.embedding_lookup(
                self.NER_embedding,
                self.in_question_NERs)  # [batch_size, question_len, NER_dim]
            in_passage_NER_repres = tf.nn.embedding_lookup(
                self.NER_embedding,
                self.in_passage_NERs)  # [batch_size, passage_len, NER_dim]
            in_question_repres.append(in_question_NER_repres)
            in_passage_repres.append(in_passage_NER_repres)

            input_shape = tf.shape(self.in_question_NERs)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_NERs)
            passage_len = input_shape[1]
            input_dim += NER_vocab.word_dim

        if with_char and char_vocab is not None:
            self.question_char_lengths = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, question_len]
            self.passage_char_lengths = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, passage_len]
            self.choice_char_lengths = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, passage_len]
            self.in_question_chars = tf.placeholder(
                tf.int32,
                [None, None, None])  # [batch_size, question_len, q_char_len]
            self.in_passage_chars = tf.placeholder(
                tf.int32,
                [None, None, None])  # [batch_size, passage_len, p_char_len]
            self.in_choice_chars = tf.placeholder(
                tf.int32,
                [None, None, None])  # [batch_size, passage_len, p_char_len]
            input_shape = tf.shape(self.in_question_chars)
            question_len = input_shape[1]
            q_char_len = input_shape[2]
            input_shape = tf.shape(self.in_passage_chars)
            passage_len = input_shape[1]
            p_char_len = input_shape[2]
            input_shape = tf.shape(self.in_choice_chars)
            batch_size = input_shape[0]
            choice_len = input_shape[1]
            c_char_len = input_shape[2]

            char_dim = char_vocab.word_dim

            #             self.char_embedding = tf.get_variable("char_embedding", shape=[char_vocab.size()+1, char_vocab.word_dim], initializer=tf.constant(char_vocab.word_vecs), dtype=tf.float32)
            self.char_embedding = tf.get_variable("char_embedding",
                                                  initializer=tf.constant(
                                                      char_vocab.word_vecs),
                                                  dtype=tf.float32)

            in_question_char_repres = tf.nn.embedding_lookup(
                self.char_embedding, self.in_question_chars
            )  # [batch_size, question_len, q_char_len, char_dim]
            in_question_char_repres = tf.reshape(
                in_question_char_repres, shape=[-1, q_char_len, char_dim])
            question_char_lengths = tf.reshape(self.question_char_lengths,
                                               [-1])
            in_passage_char_repres = tf.nn.embedding_lookup(
                self.char_embedding, self.in_passage_chars
            )  # [batch_size, passage_len, p_char_len, char_dim]
            in_passage_char_repres = tf.reshape(
                in_passage_char_repres, shape=[-1, p_char_len, char_dim])
            passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1])
            in_choice_char_repres = tf.nn.embedding_lookup(
                self.char_embedding, self.in_choice_chars
            )  # [batch_size, passage_len, p_char_len, char_dim]
            in_choice_char_repres = tf.reshape(
                in_choice_char_repres, shape=[-1, c_char_len, char_dim])
            choice_char_lengths = tf.reshape(self.choice_char_lengths, [-1])

            with tf.variable_scope('char_lstm'):
                # lstm cell
                char_lstm_cell = tf.contrib.rnn.BasicLSTMCell(char_lstm_dim)
                # dropout
                if cond_training:
                    char_lstm_cell = SwitchableDropoutWrapper(
                        char_lstm_cell,
                        self.is_training,
                        input_keep_prob=(1 - dropout_rate))
                elif is_training:
                    char_lstm_cell = tf.contrib.rnn.DropoutWrapper(
                        char_lstm_cell, output_keep_prob=(1 - dropout_rate))

                # if is_training: char_lstm_cell = tf.contrib.rnn.DropoutWrapper(char_lstm_cell, output_keep_prob=(1 - dropout_rate))
                char_lstm_cell = tf.contrib.rnn.MultiRNNCell([char_lstm_cell])

                # question_representation
                question_char_outputs = my_rnn.dynamic_rnn(
                    char_lstm_cell,
                    in_question_char_repres,
                    sequence_length=question_char_lengths,
                    dtype=tf.float32
                )[0]  # [batch_size*question_len, q_char_len, char_lstm_dim]
                question_char_outputs = question_char_outputs[:, -1, :]
                question_char_outputs = tf.reshape(
                    question_char_outputs, [-1, question_len, char_lstm_dim])

                tf.get_variable_scope().reuse_variables()
                # passage representation
                passage_char_outputs = my_rnn.dynamic_rnn(
                    char_lstm_cell,
                    in_passage_char_repres,
                    sequence_length=passage_char_lengths,
                    dtype=tf.float32
                )[0]  # [batch_size*question_len, q_char_len, char_lstm_dim]
                passage_char_outputs = passage_char_outputs[:, -1, :]
                passage_char_outputs = tf.reshape(
                    passage_char_outputs, [-1, passage_len, char_lstm_dim])

                tf.get_variable_scope().reuse_variables()
                # choice representation
                choice_char_outputs = my_rnn.dynamic_rnn(
                    char_lstm_cell,
                    in_choice_char_repres,
                    sequence_length=choice_char_lengths,
                    dtype=tf.float32
                )[0]  # [batch_size*choice_len, c_char_len, char_lstm_dim]
                choice_char_outputs = choice_char_outputs[:, -1, :]
                choice_char_outputs = tf.reshape(
                    choice_char_outputs, [-1, choice_len, char_lstm_dim])

            in_question_repres.append(question_char_outputs)
            in_passage_repres.append(passage_char_outputs)
            in_choice_repres.append(choice_char_outputs)

            input_dim += char_lstm_dim

        in_question_repres = tf.concat(in_question_repres,
                                       2)  # [batch_size, question_len, dim]
        in_passage_repres = tf.concat(in_passage_repres,
                                      2)  # [batch_size, passage_len, dim]
        in_choice_repres = tf.concat(in_choice_repres,
                                     2)  # [batch_size, choice_len, dim]

        if cond_training:
            in_question_repres = match_utils.apply_dropout(
                in_question_repres, self.is_training, dropout_rate)
            in_passage_repres = match_utils.apply_dropout(
                in_passage_repres, self.is_training, dropout_rate)
            in_choice_repres = match_utils.apply_dropout(
                in_choice_repres, self.is_training, dropout_rate)
        elif is_training:
            in_question_repres = tf.nn.dropout(in_question_repres,
                                               (1 - dropout_rate))
            in_passage_repres = tf.nn.dropout(in_passage_repres,
                                              (1 - dropout_rate))
            in_choice_repres = tf.nn.dropout(in_choice_repres,
                                             (1 - dropout_rate))
        else:
            in_question_repres = tf.multiply(in_question_repres,
                                             (1 - dropout_rate))
            in_passage_repres = tf.multiply(in_passage_repres,
                                            (1 - dropout_rate))
            in_choice_repres = tf.multiply(in_choice_repres,
                                           (1 - dropout_rate))

        # if is_training:
        #     in_question_repres = tf.nn.dropout(in_question_repres, (1 - dropout_rate))
        #     in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - dropout_rate))
        #     in_choice_repres = tf.nn.dropout(in_choice_repres, (1 - dropout_rate))
        # else:
        #     in_question_repres = tf.multiply(in_question_repres, (1 - dropout_rate))
        #     in_passage_repres = tf.multiply(in_passage_repres, (1 - dropout_rate))
        #     in_choice_repres = tf.multiply(in_choice_repres, (1 - dropout_rate))

        mask = tf.sequence_mask(self.passage_lengths,
                                passage_len,
                                dtype=tf.float32)  # [batch_size, passage_len]
        question_mask = tf.sequence_mask(
            self.question_lengths, question_len,
            dtype=tf.float32)  # [batch_size, question_len]
        choice_mask = tf.sequence_mask(
            self.choice_lengths, choice_len,
            dtype=tf.float32)  # [batch_size, choice_len]

        # ======Highway layer======
        if with_highway:
            with tf.variable_scope("input_highway"):
                in_question_repres = match_utils.multi_highway_layer(
                    in_question_repres, input_dim, highway_layer_num)
                tf.get_variable_scope().reuse_variables()
                in_passage_repres = match_utils.multi_highway_layer(
                    in_passage_repres, input_dim, highway_layer_num)
                tf.get_variable_scope().reuse_variables()
                in_choice_repres = match_utils.multi_highway_layer(
                    in_choice_repres, input_dim, highway_layer_num)
        # ========Bilateral Matching=====
        # if verbose:
        if matching_option == 7:
            ret_list = gated_trilateral_match(
                in_question_repres,
                in_passage_repres,
                in_choice_repres,
                self.question_lengths,
                self.passage_lengths,
                self.choice_lengths,
                question_mask,
                mask,
                choice_mask,
                self.concat_idx_mat,
                self.split_idx_mat_q,
                self.split_idx_mat_c,
                MP_dim,
                input_dim,
                context_layer_num,
                context_lstm_dim,
                self.is_training,
                dropout_rate,
                with_match_highway,
                aggregation_layer_num,
                aggregation_lstm_dim,
                highway_layer_num,
                with_aggregation_highway,
                with_full_match,
                with_maxpool_match,
                with_attentive_match,
                with_max_attentive_match,
                concat_context=concat_context,
                tied_aggre=tied_aggre,
                rl_matches=rl_matches,
                cond_training=cond_training,
                efficient=efficient,
                tied_match=tied_match,
                construct_memory=reasonet_training,
                debug=verbose)
            all_match_templates, match_dim, gate_input = ret_list[0:3]
            if verbose:
                self.matching_vectors = ret_list[-1]
                self.matching_vectors.append(gate_input)
            if reasonet_training:
                memory = ret_list[3]
                # tiled_memory_mask=ret_list[4]
        else:
            ret_list = match_utils.trilateral_match(
                in_question_repres,
                in_passage_repres,
                in_choice_repres,
                self.question_lengths,
                self.passage_lengths,
                self.choice_lengths,
                question_mask,
                mask,
                choice_mask,
                MP_dim,
                input_dim,
                context_layer_num,
                context_lstm_dim,
                self.is_training,
                dropout_rate,
                with_match_highway,
                aggregation_layer_num,
                aggregation_lstm_dim,
                highway_layer_num,
                with_aggregation_highway,
                with_full_match,
                with_maxpool_match,
                with_attentive_match,
                with_max_attentive_match,
                match_to_passage,
                match_to_question,
                match_to_choice,
                with_no_match,
                debug=verbose,
                matching_option=matching_option)
            match_representation, match_dim = ret_list[0:2]
            if verbose:
                self.matching_vectors = ret_list[-1]

        print('check: match_dim=', match_dim)
        # ========Prediction Layer=========
        with tf.variable_scope('prediction_layer'):
            w_0 = tf.get_variable("w_0", [match_dim, match_dim / 2],
                                  dtype=tf.float32)
            b_0 = tf.get_variable("b_0", [match_dim / 2], dtype=tf.float32)

            if use_options:
                w_1 = tf.get_variable("w_1", [match_dim / 2, 1],
                                      dtype=tf.float32)
                b_1 = tf.get_variable("b_1", [1], dtype=tf.float32)
            else:
                w_1 = tf.get_variable("w_1", [match_dim / 2, num_classes],
                                      dtype=tf.float32)
                b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

        if matching_option == 7:

            with tf.variable_scope('rl_decision_gate'):
                if use_options and (not efficient):
                    gate_input = gate_input[::num_options, :]
                w_gate = tf.get_variable('w_gate', [
                    2 * context_layer_num * context_lstm_dim,
                    len(rl_matches)
                ],
                                         dtype=tf.float32)
                b_gate = tf.get_variable('b_gate', [len(rl_matches)],
                                         dtype=tf.float32)
                gate_logits = tf.matmul(gate_input, w_gate) + b_gate

                gate_prob = tf.nn.softmax(
                    gate_logits)  # [batch_size/4, num_match]

                gate_log_prob = tf.nn.log_softmax(
                    gate_logits)  # [batch_size/4, num_match]

            if not reasonet_training:
                sliced_gate_probs = tf.split(gate_prob,
                                             len(rl_matches),
                                             axis=1)
                sliced_gate_log_probs = tf.split(gate_log_prob,
                                                 len(rl_matches),
                                                 axis=1)
                # if use_options:
                #     tile_times=tf.constant([1,num_options])
                # else:
                #     tile_times=tf.constant([1,num_classes])
                self.gate_prob = gate_prob
                self.gate_log_prob = gate_log_prob
                weighted_probs = []
                weighted_log_probs = []
                all_probs = []
                layout = 'question_first' if efficient else 'choice_first'
                for mid, matcher in enumerate(all_match_templates):
                    matcher.add_softmax_pred(w_0,
                                             b_0,
                                             w_1,
                                             b_1,
                                             self.is_training,
                                             dropout_rate,
                                             use_options,
                                             num_options,
                                             layout=layout)
                    all_probs.append(matcher.prob)
                    weighted_probs.append(
                        tf.multiply(matcher.prob, sliced_gate_probs[mid]))
                    weighted_log_probs.append(
                        tf.add(matcher.log_prob, sliced_gate_log_probs[mid]))

                if verbose:
                    self.all_probs = tf.stack(all_probs, axis=0)
                weighted_log_probs = tf.stack(weighted_log_probs, axis=0)
                self.weighted_log_probs = weighted_log_probs
                self.prob = tf.add_n(weighted_probs)
                weighted_probs = tf.stack(weighted_probs, axis=0)
            else:
                self.gate_prob = gate_prob
                self.gate_log_prob = gate_log_prob  # assert efficient
                with tf.variable_scope('reasonet'):
                    reasonet_module = ReasoNetModule(
                        reasonet_steps,
                        num_options,
                        match_dim,
                        memory.aggregation_dim,
                        reasonet_hidden_dim,
                        reasonet_lambda,
                        memory_max_len=passage_len,
                        terminate_mode=reasonet_terminate_mode,
                        keep_first=reasonet_keep_first,
                        logit_combine=reasonet_logit_combine)
                    all_log_probs, all_states = reasonet_module.multiread_matching(
                        all_match_templates, memory)
                    # [num_steps , num_matchers, batch_size/4], [num_steps * num_matchers * batch_size, state_dim]
                    if verbose:
                        self.matching_vectors.append(all_states)
                        for matcher in all_match_templates:
                            self.matching_vectors.append(
                                matcher.aggregation_representation)

                    # if verbose:
                    #     self.matching_vectors+=reasonet_module.test_vectors

                    self.rn_log_probs = all_log_probs
                    num_matcher = len(rl_matches)
                    total_num_gates = num_matcher * reasonet_calculated_steps
                    # all_log_probs=tf.reshape(all_log_probs,[reasonet_calculated_steps, num_matcher,-1]) # [num_steps, num_matcher, batch_size/4]
                    print('gate_log_prob:', gate_log_prob.get_shape())
                    print('all_log_probs:', all_log_probs.get_shape())
                    final_log_probs = tf.reshape(
                        tf.transpose(gate_log_prob) + all_log_probs,
                        [total_num_gates, -1])  #[num_gates, batch_size/4]
                    self.final_log_probs = final_log_probs
                    layout = 'question_first' if efficient else 'choice_first'
                    gate_log_predictions = match_utils.softmax_pred(
                        all_states,
                        w_0,
                        b_0,
                        w_1,
                        b_1,
                        self.is_training,
                        dropout_rate,
                        use_options,
                        num_options,
                        cond_training,
                        layout=layout,
                        num_gates=total_num_gates
                    )  # [num_gates * batch_size/4, num_options]
                    # gate_log_predictions=tf.reshape(gate_log_predictions, [total_num_gates, -1, num_options]) # [num_gates, batch_size/4, num_options]
                    if verbose:
                        for matcher in all_match_templates:
                            matcher.add_softmax_pred(w_0,
                                                     b_0,
                                                     w_1,
                                                     b_1,
                                                     self.is_training,
                                                     dropout_rate,
                                                     use_options,
                                                     num_options,
                                                     layout=layout)
                            self.matching_vectors.append(matcher.log_prob)

                    if verbose:
                        self.all_probs = gate_log_predictions

                    weighted_log_probs = tf.expand_dims(
                        final_log_probs, axis=2
                    ) + gate_log_predictions  # [num_gates, batch_size/4, num_options]
                    self.weighted_log_probs = weighted_log_probs
                    weighted_probs = tf.exp(
                        weighted_log_probs
                    )  # [num_gates, batch_size/4, num_options]
                    self.prob = tf.reduce_sum(
                        weighted_probs, axis=0)  # [batch_size, num_options]
                    print('finished probs')

            if use_options:
                if efficient:
                    gold_matrix = tf.transpose(
                        tf.reshape(
                            self.truth,
                            [num_options, -1]))  # [batch_size, num_options]
                else:
                    gold_matrix = tf.reshape(
                        self.truth,
                        [-1, num_options])  # [batch_size, num_options]
                gold_matrix = tf.cast(gold_matrix, tf.float32)
                self.gold_matrix = gold_matrix
                correct = tf.equal(tf.argmax(self.prob, 1),
                                   tf.argmax(gold_matrix, 1))
            else:
                gold_matrix = tf.one_hot(self.truth,
                                         num_classes,
                                         dtype=tf.float32)
                #         gold_matrix = tf.one_hot(self.truth, num_classes)

                correct = tf.nn.in_top_k(self.prob, self.truth, 1)
            self.correct = correct
            self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
            self.predictions = tf.arg_max(self.prob, 1)

            if rl_training_method == 'soft_voting':
                self.log_prob = tf.reduce_logsumexp(
                    weighted_log_probs, axis=0)  # [batch_size, num_options]
                self.loss = tf.negative(
                    tf.reduce_mean(
                        tf.reduce_sum(tf.multiply(gold_matrix, self.log_prob),
                                      axis=1)))
            elif rl_training_method == 'contrastive' or rl_training_method == 'contrastive_imp':

                reward_matrix = gold_matrix  # [batch_size, num_options]
                baseline = tf.reduce_sum(tf.multiply(weighted_probs,
                                                     reward_matrix),
                                         axis=[0, 2],
                                         keep_dims=True)  # [batch_size]
                if rl_training_method == 'contrastive':
                    normalized_reward = reward_matrix - baseline  # [batch_size, num_options]
                else:
                    normalized_reward = tf.divide(
                        reward_matrix,
                        baseline) - 1  # [batch_size, num_options]
                log_coeffs = tf.multiply(weighted_probs, normalized_reward)
                log_coeffs = tf.stop_gradient(log_coeffs)
                self.log_coeffs = log_coeffs
                self.weighted_log_probs = weighted_log_probs
                self.loss = tf.negative(
                    tf.reduce_mean(
                        tf.reduce_sum(tf.multiply(weighted_log_probs,
                                                  log_coeffs),
                                      axis=[0, 2])))

        else:

            logits = tf.matmul(match_representation, w_0) + b_0
            logits = tf.tanh(logits)

            if cond_training:
                logits = match_utils.apply_dropout(logits, self.is_training,
                                                   dropout_rate)
            elif is_training:
                logits = tf.nn.dropout(logits, (1 - dropout_rate))
            else:
                logits = tf.multiply(logits, (1 - dropout_rate))
            logits = tf.matmul(logits, w_1) + b_1

            self.final_logits = logits
            if use_options:
                if efficient:
                    logits = tf.transpose(tf.reshape(logits,
                                                     [num_options, -1]))
                    gold_matrix = tf.transpose(
                        tf.reshape(self.truth, [num_options, -1]))
                else:
                    logits = tf.reshape(logits, [-1, num_options])
                    gold_matrix = tf.reshape(self.truth, [-1, num_options])

                self.prob = tf.nn.softmax(logits)

                #         cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, tf.cast(self.truth, tf.int64), name='cross_entropy_per_example')
                #         self.loss = tf.reduce_mean(cross_entropy, name='cross_entropy')

                # gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
                #         gold_matrix = tf.one_hot(self.truth, num_classes)
                self.loss = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(
                        logits=logits, labels=gold_matrix))

                # correct = tf.nn.in_top_k(logits, self.truth, 1)
                # self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
                correct = tf.equal(tf.argmax(logits, 1),
                                   tf.argmax(gold_matrix, 1))
                self.gold_matrix = gold_matrix
                self.correct = correct

            else:
                self.prob = tf.nn.softmax(logits)

                #         cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, tf.cast(self.truth, tf.int64), name='cross_entropy_per_example')
                #         self.loss = tf.reduce_mean(cross_entropy, name='cross_entropy')

                gold_matrix = tf.one_hot(self.truth,
                                         num_classes,
                                         dtype=tf.float32)
                #         gold_matrix = tf.one_hot(self.truth, num_classes)
                self.loss = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(
                        logits=logits, labels=gold_matrix))

                correct = tf.nn.in_top_k(logits, self.truth, 1)
                self.correct = correct
            self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
            self.predictions = tf.arg_max(self.prob, 1)

        if optimize_type == 'adadelta':
            clipper = 50
            optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate)
            tvars = tf.trainable_variables()
            l2_loss = tf.add_n(
                [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            self.loss = self.loss + lambda_l2 * l2_loss
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                              clipper)
            self.train_op = optimizer.apply_gradients(list(zip(grads, tvars)))
        elif optimize_type == 'sgd':
            self.global_step = tf.Variable(
                0, name='global_step',
                trainable=False)  # Create a variable to track the global step.
            min_lr = 0.000001
            self._lr_rate = tf.maximum(
                min_lr,
                tf.train.exponential_decay(learning_rate, self.global_step,
                                           30000, 0.98))
            self.train_op = tf.train.GradientDescentOptimizer(
                learning_rate=self._lr_rate).minimize(self.loss)
        elif optimize_type == 'ema':
            tvars = tf.trainable_variables()
            train_op = tf.train.AdamOptimizer(
                learning_rate=learning_rate).minimize(self.loss)
            # Create an ExponentialMovingAverage object
            ema = tf.train.ExponentialMovingAverage(decay=0.9999)
            # Create the shadow variables, and add ops to maintain moving
            # averages of all trainable variables.
            maintain_averages_op = ema.apply(tvars)
            # Create an op that will update the moving averages after each training
            # step.  This is what we will use in place of the usual training op.
            with tf.control_dependencies([train_op]):
                self.train_op = tf.group(maintain_averages_op)
        elif optimize_type == 'adam':
            clipper = 50
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            tvars = tf.trainable_variables()
            l2_loss = tf.add_n(
                [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            self.loss = self.loss + lambda_l2 * l2_loss
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                              clipper)
            self.train_op = optimizer.apply_gradients(list(zip(grads, tvars)))

        extra_train_ops = []
        train_ops = [self.train_op] + extra_train_ops
        self.train_op = tf.group(*train_ops)

        with tf.name_scope('summary'):
            self.loss_summary = tf.summary.scalar('loss', self.loss)
            self.acc_summary = tf.summary.scalar('accuracy', self.eval_correct)
예제 #7
0
    def __init__(self,
                 num_classes,
                 word_vocab=None,
                 char_vocab=None,
                 POS_vocab=None,
                 NER_vocab=None,
                 dropout_rate=0.5,
                 learning_rate=0.001,
                 optimize_type='adam',
                 lambda_l2=1e-5,
                 with_word=True,
                 with_char=True,
                 with_POS=True,
                 with_NER=True,
                 char_lstm_dim=20,
                 context_lstm_dim=100,
                 aggregation_lstm_dim=200,
                 is_training=True,
                 filter_layer_threshold=0.2,
                 MP_dim=50,
                 context_layer_num=1,
                 aggregation_layer_num=1,
                 fix_word_vec=False,
                 with_filter_layer=True,
                 with_highway=False,
                 with_lex_features=False,
                 lex_dim=100,
                 word_level_MP_dim=-1,
                 sep_endpoint=False,
                 end_model_combine=False,
                 with_match_highway=False,
                 with_aggregation_highway=False,
                 highway_layer_num=1,
                 with_lex_decomposition=False,
                 lex_decompsition_dim=-1,
                 with_left_match=True,
                 with_right_match=True,
                 with_full_match=True,
                 with_maxpool_match=True,
                 with_attentive_match=True,
                 with_max_attentive_match=True,
                 with_dep=True):

        # ======word representation layer======
        in_question_repres = []  # premise
        in_question_dep_cons = []  # premise dependency connections
        in_passage_repres = []  # hypothesis
        in_passage_dep_cons = []  # hypothesis dependency connections
        self.question_lengths = tf.placeholder(tf.int32, [None])
        self.passage_lengths = tf.placeholder(tf.int32, [None])
        self.truth = tf.placeholder(tf.int32, [None])  # [batch_size]
        input_dim = 0
        # word embedding
        if with_word and word_vocab is not None:
            self.in_question_words = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, question_len]
            self.in_passage_words = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, passage_len]
            #             self.word_embedding = tf.get_variable("word_embedding", shape=[word_vocab.size()+1, word_vocab.word_dim], initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32)
            word_vec_trainable = True
            cur_device = '/gpu:0'
            if fix_word_vec:
                word_vec_trainable = False
                cur_device = '/cpu:0'
            with tf.device(cur_device):
                self.word_embedding = tf.get_variable(
                    "word_embedding",
                    trainable=word_vec_trainable,
                    initializer=tf.constant(word_vocab.word_vecs),
                    dtype=tf.float32)
            #
            in_question_word_repres = tf.nn.embedding_lookup(
                self.word_embedding,
                self.in_question_words)  # [batch_size, question_len, word_dim]
            in_passage_word_repres = tf.nn.embedding_lookup(
                self.word_embedding,
                self.in_passage_words)  # [batch_size, passage_len, word_dim]
            #print (in_question_word_repres)
            in_question_repres.append(in_question_word_repres)
            in_passage_repres.append(in_passage_word_repres)

            input_shape = tf.shape(self.in_question_words)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_words)
            passage_len = input_shape[1]
            input_dim += word_vocab.word_dim

        if with_dep:
            self.in_question_dependency = tf.placeholder(
                tf.float32, [None, None, word_vocab.parser.typesize
                             ])  # [batch_size, question_len, dep_dim]
            self.in_passage_dependency = tf.placeholder(
                tf.float32, [None, None, word_vocab.parser.typesize
                             ])  # [batch_size, passage_len, dep_dim]
            self.in_question_dep_con = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, question_len]
            self.in_passage_dep_con = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, passage_len]
            # The dependency representation is the input placeholder itself;
            # no embedding is applied to it.
            in_question_dep_repres = self.in_question_dependency
            in_passage_dep_repres = self.in_passage_dependency

            in_question_repres.append(in_question_dep_repres)
            in_passage_repres.append(in_passage_dep_repres)

            input_dim += word_vocab.parser.typesize  # dependency_dim
            # TODO: embed the dependency features here later.

            # Dependency connections are taken directly from the placeholders.
            # TODO: decide whether any processing is needed here; otherwise
            # pass self.in_question_dep_con straight to the matching function.
            in_question_dep_cons = self.in_question_dep_con
            in_passage_dep_cons = self.in_passage_dep_con

        #if with_image:
        #    self.

        if with_POS and POS_vocab is not None:
            self.in_question_POSs = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, question_len]
            self.in_passage_POSs = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, passage_len]
            #self.POS_embedding = tf.get_variable("POS_embedding", shape=[POS_vocab.size()+1, POS_vocab.word_dim], initializer=tf.constant(POS_vocab.word_vecs), dtype=tf.float32)
            self.POS_embedding = tf.get_variable("POS_embedding",
                                                 initializer=tf.constant(
                                                     POS_vocab.word_vecs),
                                                 dtype=tf.float32)

            in_question_POS_repres = tf.nn.embedding_lookup(
                self.POS_embedding,
                self.in_question_POSs)  # [batch_size, question_len, POS_dim]
            in_passage_POS_repres = tf.nn.embedding_lookup(
                self.POS_embedding,
                self.in_passage_POSs)  # [batch_size, passage_len, POS_dim]
            in_question_repres.append(in_question_POS_repres)
            in_passage_repres.append(in_passage_POS_repres)

            input_shape = tf.shape(self.in_question_POSs)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_POSs)
            passage_len = input_shape[1]
            input_dim += POS_vocab.word_dim

        if with_NER and NER_vocab is not None:
            self.in_question_NERs = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, question_len]
            self.in_passage_NERs = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, passage_len]
            #self.NER_embedding = tf.get_variable("NER_embedding", shape=[NER_vocab.size()+1, NER_vocab.word_dim], initializer=tf.constant(NER_vocab.word_vecs), dtype=tf.float32)
            self.NER_embedding = tf.get_variable("NER_embedding",
                                                 initializer=tf.constant(
                                                     NER_vocab.word_vecs),
                                                 dtype=tf.float32)

            in_question_NER_repres = tf.nn.embedding_lookup(
                self.NER_embedding,
                self.in_question_NERs)  # [batch_size, question_len, NER_dim]
            in_passage_NER_repres = tf.nn.embedding_lookup(
                self.NER_embedding,
                self.in_passage_NERs)  # [batch_size, passage_len, NER_dim]
            in_question_repres.append(in_question_NER_repres)
            in_passage_repres.append(in_passage_NER_repres)

            input_shape = tf.shape(self.in_question_NERs)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_NERs)
            passage_len = input_shape[1]
            input_dim += NER_vocab.word_dim

        if with_char and char_vocab is not None:
            self.question_char_lengths = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, question_len]
            self.passage_char_lengths = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, passage_len]
            self.in_question_chars = tf.placeholder(
                tf.int32,
                [None, None, None])  # [batch_size, question_len, q_char_len]
            self.in_passage_chars = tf.placeholder(
                tf.int32,
                [None, None, None])  # [batch_size, passage_len, p_char_len]
            input_shape = tf.shape(self.in_question_chars)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            q_char_len = input_shape[2]
            input_shape = tf.shape(self.in_passage_chars)
            passage_len = input_shape[1]
            p_char_len = input_shape[2]
            char_dim = char_vocab.word_dim
            #             self.char_embedding = tf.get_variable("char_embedding", shape=[char_vocab.size()+1, char_vocab.word_dim], initializer=tf.constant(char_vocab.word_vecs), dtype=tf.float32)
            self.char_embedding = tf.get_variable("char_embedding",
                                                  initializer=tf.constant(
                                                      char_vocab.word_vecs),
                                                  dtype=tf.float32)

            in_question_char_repres = tf.nn.embedding_lookup(
                self.char_embedding, self.in_question_chars
            )  # [batch_size, question_len, q_char_len, char_dim]
            in_question_char_repres = tf.reshape(
                in_question_char_repres, shape=[-1, q_char_len, char_dim])
            question_char_lengths = tf.reshape(self.question_char_lengths,
                                               [-1])
            in_passage_char_repres = tf.nn.embedding_lookup(
                self.char_embedding, self.in_passage_chars
            )  # [batch_size, passage_len, p_char_len, char_dim]
            in_passage_char_repres = tf.reshape(
                in_passage_char_repres, shape=[-1, p_char_len, char_dim])
            passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1])
            with tf.variable_scope('char_lstm'):
                # lstm cell
                char_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(char_lstm_dim)
                # dropout
                if is_training:
                    char_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                        char_lstm_cell, output_keep_prob=(1 - dropout_rate))
                char_lstm_cell = tf.nn.rnn_cell.MultiRNNCell([char_lstm_cell])

                # question_representation
                question_char_outputs = my_rnn.dynamic_rnn(
                    char_lstm_cell,
                    in_question_char_repres,
                    sequence_length=question_char_lengths,
                    dtype=tf.float32
                )[0]  # [batch_size*question_len, q_char_len, char_lstm_dim]
                question_char_outputs = question_char_outputs[:, -1, :]
                question_char_outputs = tf.reshape(
                    question_char_outputs,
                    [batch_size, question_len, char_lstm_dim])

                tf.get_variable_scope().reuse_variables()
                # passage representation
                passage_char_outputs = my_rnn.dynamic_rnn(
                    char_lstm_cell,
                    in_passage_char_repres,
                    sequence_length=passage_char_lengths,
                    dtype=tf.float32
                )[0]  # [batch_size*passage_len, p_char_len, char_lstm_dim]
                passage_char_outputs = passage_char_outputs[:, -1, :]
                passage_char_outputs = tf.reshape(
                    passage_char_outputs,
                    [batch_size, passage_len, char_lstm_dim])

            in_question_repres.append(question_char_outputs)
            in_passage_repres.append(passage_char_outputs)

            input_dim += char_lstm_dim
        #print('\n\n\n')
        #print (in_question_repres)
        #print('\n\n\n')
        in_question_repres = tf.concat(
            2, in_question_repres)  # [batch_size, question_len, dim]
        in_passage_repres = tf.concat(
            2, in_passage_repres)  # [batch_size, passage_len, dim]

        if is_training:
            in_question_repres = tf.nn.dropout(in_question_repres,
                                               (1 - dropout_rate))
            in_passage_repres = tf.nn.dropout(in_passage_repres,
                                              (1 - dropout_rate))
        else:
            in_question_repres = tf.mul(in_question_repres, (1 - dropout_rate))
            in_passage_repres = tf.mul(in_passage_repres, (1 - dropout_rate))

        mask = tf.sequence_mask(self.passage_lengths,
                                passage_len,
                                dtype=tf.float32)  # [batch_size, passage_len]
        question_mask = tf.sequence_mask(
            self.question_lengths, question_len,
            dtype=tf.float32)  # [batch_size, question_len]

        # ======Highway layer======
        if with_highway:
            with tf.variable_scope("input_highway"):
                in_question_repres = match_utils.multi_highway_layer(
                    in_question_repres, input_dim, highway_layer_num)
                tf.get_variable_scope().reuse_variables()
                in_passage_repres = match_utils.multi_highway_layer(
                    in_passage_repres, input_dim, highway_layer_num)

        # ========Bilateral Matching=====
        (match_representation, match_dim) = match_utils.bilateral_match_func2(
            in_question_repres,
            in_passage_repres,
            in_question_dep_cons,
            in_passage_dep_cons,
            self.question_lengths,
            self.passage_lengths,
            question_mask,
            mask,
            MP_dim,
            input_dim,
            with_filter_layer,
            context_layer_num,
            context_lstm_dim,
            is_training,
            dropout_rate,
            with_match_highway,
            aggregation_layer_num,
            aggregation_lstm_dim,
            highway_layer_num,
            with_aggregation_highway,
            with_lex_decomposition,
            lex_decompsition_dim,
            with_full_match,
            with_maxpool_match,
            with_attentive_match,
            with_max_attentive_match,
            with_left_match,
            with_right_match,
            with_dep=with_dep)

        #========Prediction Layer=========
        w_0 = tf.get_variable("w_0", [match_dim, match_dim / 2],
                              dtype=tf.float32)
        b_0 = tf.get_variable("b_0", [match_dim / 2], dtype=tf.float32)
        w_1 = tf.get_variable("w_1", [match_dim / 2, num_classes],
                              dtype=tf.float32)
        b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

        logits = tf.matmul(match_representation, w_0) + b_0
        logits = tf.tanh(logits)
        if is_training:
            logits = tf.nn.dropout(logits, (1 - dropout_rate))
        else:
            logits = tf.mul(logits, (1 - dropout_rate))
        logits = tf.matmul(logits, w_1) + b_1

        self.prob = tf.nn.softmax(logits)

        #         cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, tf.cast(self.truth, tf.int64), name='cross_entropy_per_example')
        #         self.loss = tf.reduce_mean(cross_entropy, name='cross_entropy')

        gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
        #         gold_matrix = tf.one_hot(self.truth, num_classes)
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits, gold_matrix))

        correct = tf.nn.in_top_k(logits, self.truth, 1)
        self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
        self.predictions = tf.arg_max(self.prob, 1)

        if optimize_type == 'adadelta':
            clipper = 50
            optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate)
            tvars = tf.trainable_variables()
            l2_loss = tf.add_n(
                [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            self.loss = self.loss + lambda_l2 * l2_loss
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                              clipper)
            self.train_op = optimizer.apply_gradients(zip(grads, tvars))
        elif optimize_type == 'sgd':
            self.global_step = tf.Variable(
                0, name='global_step',
                trainable=False)  # Create a variable to track the global step.
            min_lr = 0.000001
            self._lr_rate = tf.maximum(
                min_lr,
                tf.train.exponential_decay(learning_rate, self.global_step,
                                           30000, 0.98))
            self.train_op = tf.train.GradientDescentOptimizer(
                learning_rate=self._lr_rate).minimize(self.loss)
        elif optimize_type == 'ema':
            tvars = tf.trainable_variables()
            train_op = tf.train.AdamOptimizer(
                learning_rate=learning_rate).minimize(self.loss)
            # Create an ExponentialMovingAverage object
            ema = tf.train.ExponentialMovingAverage(decay=0.9999)
            # Create the shadow variables and add ops to maintain moving
            # averages of the trainable variables.
            maintain_averages_op = ema.apply(tvars)
            # Create an op that will update the moving averages after each training
            # step.  This is what we will use in place of the usual training op.
            with tf.control_dependencies([train_op]):
                self.train_op = tf.group(maintain_averages_op)
        elif optimize_type == 'adam':
            clipper = 50
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            tvars = tf.trainable_variables()
            l2_loss = tf.add_n(
                [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            self.loss = self.loss + lambda_l2 * l2_loss
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                              clipper)
            self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        extra_train_ops = []
        train_ops = [self.train_op] + extra_train_ops
        self.train_op = tf.group(*train_ops)
예제 #8
0
    def encode(self, is_training=True):
        options = self.options

        # ======word representation layer======
        in_question_repres = []
        in_passage_repres = []
        input_dim = 0
        if options.with_word and self.word_vocab is not None:
            word_vec_trainable = True
            cur_device = '/gpu:0'
            if options.fix_word_vec:
                word_vec_trainable = False
                cur_device = '/cpu:0'
            with tf.variable_scope("embedding"), tf.device(cur_device):
                self.word_embedding = tf.get_variable(
                    "word_embedding",
                    trainable=word_vec_trainable,
                    initializer=tf.constant(self.word_vocab.word_vecs),
                    dtype=tf.float32)

            in_question_word_repres = tf.nn.embedding_lookup(
                self.word_embedding,
                self.in_question_words)  # [batch_size, question_len, word_dim]
            in_passage_word_repres = tf.nn.embedding_lookup(
                self.word_embedding,
                self.in_passage_words)  # [batch_size, passage_len, word_dim]
            in_question_repres.append(in_question_word_repres)
            in_passage_repres.append(in_passage_word_repres)

            input_shape = tf.shape(self.in_question_words)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_words)
            passage_len = input_shape[1]
            input_dim += self.word_vocab.word_dim

        if options.with_char and self.char_vocab is not None:
            input_shape = tf.shape(self.in_question_chars)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            q_char_len = input_shape[2]
            input_shape = tf.shape(self.in_passage_chars)
            passage_len = input_shape[1]
            p_char_len = input_shape[2]
            char_dim = self.char_vocab.word_dim
            self.char_embedding = tf.get_variable(
                "char_embedding",
                initializer=tf.constant(self.char_vocab.word_vecs),
                dtype=tf.float32)

            in_question_char_repres = tf.nn.embedding_lookup(
                self.char_embedding, self.in_question_chars
            )  # [batch_size, question_len, q_char_len, char_dim]
            in_question_char_repres = tf.reshape(
                in_question_char_repres, shape=[-1, q_char_len, char_dim])
            question_char_lengths = tf.reshape(self.question_char_lengths,
                                               [-1])
            in_passage_char_repres = tf.nn.embedding_lookup(
                self.char_embedding, self.in_passage_chars
            )  # [batch_size, passage_len, p_char_len, char_dim]
            in_passage_char_repres = tf.reshape(
                in_passage_char_repres, shape=[-1, p_char_len, char_dim])
            passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1])
            with tf.variable_scope('char_lstm'):
                # lstm cell
                char_lstm_cell = tf.contrib.rnn.BasicLSTMCell(
                    options.char_lstm_dim)
                # dropout
                if is_training:
                    char_lstm_cell = tf.contrib.rnn.DropoutWrapper(
                        char_lstm_cell,
                        output_keep_prob=(1 - options.dropout_rate))
                char_lstm_cell = tf.contrib.rnn.MultiRNNCell([char_lstm_cell])

                # question_representation
                question_char_outputs = tf.nn.dynamic_rnn(
                    char_lstm_cell,
                    in_question_char_repres,
                    sequence_length=question_char_lengths,
                    dtype=tf.float32
                )[0]  # [batch_size*question_len, q_char_len, char_lstm_dim]
                question_char_outputs = question_char_outputs[:, -1, :]
                question_char_outputs = tf.reshape(
                    question_char_outputs,
                    [batch_size, question_len, options.char_lstm_dim])

                tf.get_variable_scope().reuse_variables()
                # passage representation
                passage_char_outputs = tf.nn.dynamic_rnn(
                    char_lstm_cell,
                    in_passage_char_repres,
                    sequence_length=passage_char_lengths,
                    dtype=tf.float32
                )[0]  # [batch_size*passage_len, p_char_len, char_lstm_dim]
                passage_char_outputs = passage_char_outputs[:, -1, :]
                passage_char_outputs = tf.reshape(
                    passage_char_outputs,
                    [batch_size, passage_len, options.char_lstm_dim])

            in_question_repres.append(question_char_outputs)
            in_passage_repres.append(passage_char_outputs)
            input_dim += options.char_lstm_dim

        if options.with_POS and self.POS_vocab is not None:
            self.POS_embedding = tf.get_variable("POS_embedding",
                                                 initializer=tf.constant(
                                                     self.POS_vocab.word_vecs),
                                                 dtype=tf.float32)

            in_question_POS_repres = tf.nn.embedding_lookup(
                self.POS_embedding,
                self.in_question_POSs)  # [batch_size, question_len, POS_dim]
            in_passage_POS_repres = tf.nn.embedding_lookup(
                self.POS_embedding,
                self.in_passage_POSs)  # [batch_size, passage_len, POS_dim]
            in_question_repres.append(in_question_POS_repres)
            in_passage_repres.append(in_passage_POS_repres)

            input_shape = tf.shape(self.in_question_POSs)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_POSs)
            passage_len = input_shape[1]
            input_dim += self.POS_vocab.word_dim

        if options.with_NER and self.NER_vocab is not None:
            self.NER_embedding = tf.get_variable("NER_embedding",
                                                 initializer=tf.constant(
                                                     self.NER_vocab.word_vecs),
                                                 dtype=tf.float32)

            in_question_NER_repres = tf.nn.embedding_lookup(
                self.NER_embedding,
                self.in_question_NERs)  # [batch_size, question_len, NER_dim]
            in_passage_NER_repres = tf.nn.embedding_lookup(
                self.NER_embedding,
                self.in_passage_NERs)  # [batch_size, passage_len, NER_dim]
            in_question_repres.append(in_question_NER_repres)
            in_passage_repres.append(in_passage_NER_repres)

            input_shape = tf.shape(self.in_question_NERs)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_NERs)
            passage_len = input_shape[1]
            input_dim += self.NER_vocab.word_dim

        in_question_repres = tf.concat(in_question_repres,
                                       2)  # [batch_size, question_len, dim]
        in_passage_repres = tf.concat(in_passage_repres,
                                      2)  # [batch_size, passage_len, dim]

        if options.compress_input:  # compress input word vector into smaller vectors
            w_compress = tf.get_variable(
                "w_compress_input", [input_dim, options.compress_input_dim],
                dtype=tf.float32)
            b_compress = tf.get_variable("b_compress_input",
                                         [options.compress_input_dim],
                                         dtype=tf.float32)

            in_question_repres = tf.reshape(in_question_repres,
                                            [-1, input_dim])
            in_question_repres = tf.matmul(in_question_repres,
                                           w_compress) + b_compress
            in_question_repres = tf.tanh(in_question_repres)
            in_question_repres = tf.reshape(
                in_question_repres,
                [batch_size, question_len, options.compress_input_dim])

            in_passage_repres = tf.reshape(in_passage_repres, [-1, input_dim])
            in_passage_repres = tf.matmul(in_passage_repres,
                                          w_compress) + b_compress
            in_passage_repres = tf.tanh(in_passage_repres)
            in_passage_repres = tf.reshape(
                in_passage_repres,
                [batch_size, passage_len, options.compress_input_dim])
            input_dim = options.compress_input_dim

        if is_training:
            in_question_repres = tf.nn.dropout(in_question_repres,
                                               (1 - options.dropout_rate))
            in_passage_repres = tf.nn.dropout(in_passage_repres,
                                              (1 - options.dropout_rate))
        else:
            in_question_repres = tf.multiply(in_question_repres,
                                             (1 - options.dropout_rate))
            in_passage_repres = tf.multiply(in_passage_repres,
                                            (1 - options.dropout_rate))

        passage_mask = tf.sequence_mask(
            self.passage_lengths, passage_len,
            dtype=tf.float32)  # [batch_size, passage_len]
        question_mask = tf.sequence_mask(
            self.question_lengths, question_len,
            dtype=tf.float32)  # [batch_size, question_len]

        # ======Highway layer======
        if options.with_highway:
            with tf.variable_scope("input_highway"):
                in_question_repres = match_utils.multi_highway_layer(
                    in_question_repres, input_dim, options.highway_layer_num)
                tf.get_variable_scope().reuse_variables()
                in_passage_repres = match_utils.multi_highway_layer(
                    in_passage_repres, input_dim, options.highway_layer_num)

        # ======Filter layer======
        cosine_matrix = match_utils.cal_relevancy_matrix(
            in_question_repres, in_passage_repres)
        cosine_matrix = match_utils.mask_relevancy_matrix(
            cosine_matrix, question_mask, passage_mask)
        #         relevancy_matrix = tf.select(tf.greater(cosine_matrix,
        #                                     tf.scalar_mul(filter_layer_threshold, tf.ones_like(cosine_matrix, dtype=tf.float32))),
        #                                     cosine_matrix, tf.zeros_like(cosine_matrix, dtype=tf.float32)) # [batch_size, passage_len, question_len]
        raw_in_passage_repres = in_passage_repres
        if options.with_filter_layer:
            relevancy_matrix = cosine_matrix  # [batch_size, passage_len, question_len]
            relevancy_degrees = tf.reduce_max(
                relevancy_matrix, axis=2)  # [batch_size, passage_len]
            relevancy_degrees = tf.expand_dims(
                relevancy_degrees, axis=-1)  # [batch_size, passage_len, 'x']
            in_passage_repres = tf.multiply(in_passage_repres,
                                            relevancy_degrees)

        # =======Context Representation Layer & Multi-Perspective matching layer=====
        all_question_aware_representatins = []
        question_aware_dim = 0
        if options.with_word_match:
            with tf.variable_scope('word_level_matching'):
                (word_match_vectors,
                 word_match_dim) = match_utils.match_passage_with_question(
                     raw_in_passage_repres,
                     None,
                     passage_mask,
                     in_question_repres,
                     None,
                     question_mask,
                     input_dim,
                     with_full_matching=False,
                     with_attentive_matching=options.with_attentive_matching,
                     with_max_attentive_matching=options.
                     with_max_attentive_matching,
                     with_maxpooling_matching=options.with_maxpooling_matching,
                     with_local_attentive_matching=options.
                     with_local_attentive_matching,
                     win_size=options.win_size,
                     with_forward_match=True,
                     with_backward_match=False,
                     match_options=options)
                all_question_aware_representatins.extend(word_match_vectors)
                question_aware_dim += word_match_dim
        # lex decomposition
        if options.with_lex_decomposition:
            lex_decomposition = match_utils.cal_linear_decomposition_representation(
                raw_in_passage_repres, self.passage_lengths, cosine_matrix,
                is_training, options.lex_decompsition_dim,
                options.dropout_rate)
            all_question_aware_representatins.append(lex_decomposition)
            if options.lex_decompsition_dim == -1:
                question_aware_dim += 2 * input_dim
            else:
                question_aware_dim += 2 * options.lex_decompsition_dim

        if options.with_question_passage_word_feature:
            all_question_aware_representatins.append(raw_in_passage_repres)

            att_question_representation = match_utils.calculate_cosine_weighted_question_representation(
                in_question_repres, cosine_matrix)
            all_question_aware_representatins.append(
                att_question_representation)
            question_aware_dim += 2 * input_dim

        # sequential context matching
        question_forward = None
        question_backward = None
        passage_forward = None
        passage_backward = None
        if options.with_sequential_match:
            with tf.variable_scope('context_MP_matching'):
                cur_in_question_repres = in_question_repres
                cur_in_passage_repres = in_passage_repres
                for i in xrange(options.context_layer_num):
                    with tf.variable_scope('layer-{}'.format(i)):
                        with tf.variable_scope('context_represent'):
                            # parameters
                            context_lstm_cell_fw = tf.contrib.rnn.LSTMCell(
                                options.context_lstm_dim)
                            context_lstm_cell_bw = tf.contrib.rnn.LSTMCell(
                                options.context_lstm_dim)
                            if is_training:
                                context_lstm_cell_fw = tf.contrib.rnn.DropoutWrapper(
                                    context_lstm_cell_fw,
                                    output_keep_prob=(1 -
                                                      options.dropout_rate))
                                context_lstm_cell_bw = tf.contrib.rnn.DropoutWrapper(
                                    context_lstm_cell_bw,
                                    output_keep_prob=(1 -
                                                      options.dropout_rate))

                            # question representation
                            ((question_context_representation_fw,
                              question_context_representation_bw),
                             (question_forward, question_backward
                              )) = tf.nn.bidirectional_dynamic_rnn(
                                  context_lstm_cell_fw,
                                  context_lstm_cell_bw,
                                  cur_in_question_repres,
                                  dtype=tf.float32,
                                  sequence_length=self.question_lengths
                              )  # [batch_size, question_len, context_lstm_dim]
                            cur_in_question_repres = tf.concat([
                                question_context_representation_fw,
                                question_context_representation_bw
                            ], 2)

                            # passage representation
                            tf.get_variable_scope().reuse_variables()
                            ((passage_context_representation_fw,
                              passage_context_representation_bw),
                             (passage_forward, passage_backward
                              )) = tf.nn.bidirectional_dynamic_rnn(
                                  context_lstm_cell_fw,
                                  context_lstm_cell_bw,
                                  cur_in_passage_repres,
                                  dtype=tf.float32,
                                  sequence_length=self.passage_lengths
                              )  # [batch_size, passage_len, context_lstm_dim]
                            cur_in_passage_repres = tf.concat([
                                passage_context_representation_fw,
                                passage_context_representation_bw
                            ], 2)

                        # Multi-perspective matching
                        with tf.variable_scope('MP_matching'):
                            (matching_vectors, matching_dim
                             ) = match_utils.match_passage_with_question(
                                 passage_context_representation_fw,
                                 passage_context_representation_bw,
                                 passage_mask,
                                 question_context_representation_fw,
                                 question_context_representation_bw,
                                 question_mask,
                                 options.context_lstm_dim,
                                 with_full_matching=options.with_full_matching,
                                 with_attentive_matching=options.
                                 with_attentive_matching,
                                 with_max_attentive_matching=options.
                                 with_max_attentive_matching,
                                 with_maxpooling_matching=options.
                                 with_maxpooling_matching,
                                 with_local_attentive_matching=options.
                                 with_local_attentive_matching,
                                 win_size=options.win_size,
                                 with_forward_match=options.with_forward_match,
                                 with_backward_match=options.
                                 with_backward_match,
                                 match_options=options)
                            all_question_aware_representatins.extend(
                                matching_vectors)
                            question_aware_dim += matching_dim

        all_question_aware_representatins = tf.concat(
            all_question_aware_representatins,
            2)  # [batch_size, passage_len, dim]

        if is_training:
            all_question_aware_representatins = tf.nn.dropout(
                all_question_aware_representatins, (1 - options.dropout_rate))
        else:
            all_question_aware_representatins = tf.multiply(
                all_question_aware_representatins, (1 - options.dropout_rate))

        # ======Highway layer======
        if options.with_match_highway:
            with tf.variable_scope("matching_highway"):
                all_question_aware_representatins = match_utils.multi_highway_layer(
                    all_question_aware_representatins, question_aware_dim,
                    options.highway_layer_num)

        #========Aggregation Layer======
        if not options.with_aggregation:
            aggregation_representation = all_question_aware_representatins
            aggregation_dim = question_aware_dim
        else:
            aggregation_representation = []
            aggregation_dim = 0
            aggregation_input = all_question_aware_representatins
            with tf.variable_scope('aggregation_layer'):
                for i in xrange(options.aggregation_layer_num):
                    with tf.variable_scope('layer-{}'.format(i)):
                        aggregation_lstm_cell_fw = tf.contrib.rnn.BasicLSTMCell(
                            options.aggregation_lstm_dim)
                        aggregation_lstm_cell_bw = tf.contrib.rnn.BasicLSTMCell(
                            options.aggregation_lstm_dim)
                        if is_training:
                            aggregation_lstm_cell_fw = tf.contrib.rnn.DropoutWrapper(
                                aggregation_lstm_cell_fw,
                                output_keep_prob=(1 - options.dropout_rate))
                            aggregation_lstm_cell_bw = tf.contrib.rnn.DropoutWrapper(
                                aggregation_lstm_cell_bw,
                                output_keep_prob=(1 - options.dropout_rate))
                        aggregation_lstm_cell_fw = tf.contrib.rnn.MultiRNNCell(
                            [aggregation_lstm_cell_fw])
                        aggregation_lstm_cell_bw = tf.contrib.rnn.MultiRNNCell(
                            [aggregation_lstm_cell_bw])

                        cur_aggregation_representation, _ = rnn.bidirectional_dynamic_rnn(
                            aggregation_lstm_cell_fw,
                            aggregation_lstm_cell_bw,
                            aggregation_input,
                            dtype=tf.float32,
                            sequence_length=self.passage_lengths)
                        cur_aggregation_representation = tf.concat(
                            cur_aggregation_representation, 2
                        )  # [batch_size, passage_len, 2*aggregation_lstm_dim]
                        aggregation_representation.append(
                            cur_aggregation_representation)
                        aggregation_dim += 2 * options.aggregation_lstm_dim
                        aggregation_input = cur_aggregation_representation

            aggregation_representation = tf.concat(aggregation_representation,
                                                   2)
            aggregation_representation = tf.concat([
                aggregation_representation, all_question_aware_representatins
            ], 2)
            aggregation_dim += question_aware_dim

        # ======Highway layer======
        if options.with_aggregation_highway:
            with tf.variable_scope("aggregation_highway"):
                aggregation_representation = match_utils.multi_highway_layer(
                    aggregation_representation, aggregation_dim,
                    options.highway_layer_num)

        #========output Layer=========
        encode_size = aggregation_dim + input_dim
        encode_hiddens = tf.concat(
            [aggregation_representation, in_passage_repres],
            2)  # [batch_size, passage_len, enc_size]
        encode_hiddens = encode_hiddens * tf.expand_dims(passage_mask, axis=-1)

        # initial state for the LSTM decoder
        #'''
        with tf.variable_scope('initial_state_for_decoder'):
            # Define weights and biases to reduce the cell and reduce the state
            w_reduce_c = tf.get_variable(
                'w_reduce_c',
                [4 * options.context_lstm_dim, options.gen_hidden_size],
                dtype=tf.float32)
            w_reduce_h = tf.get_variable(
                'w_reduce_h',
                [4 * options.context_lstm_dim, options.gen_hidden_size],
                dtype=tf.float32)
            bias_reduce_c = tf.get_variable('bias_reduce_c',
                                            [options.gen_hidden_size],
                                            dtype=tf.float32)
            bias_reduce_h = tf.get_variable('bias_reduce_h',
                                            [options.gen_hidden_size],
                                            dtype=tf.float32)

            old_c = tf.concat(values=[
                question_forward.c, question_backward.c, passage_forward.c,
                passage_backward.c
            ],
                              axis=1)
            old_h = tf.concat(values=[
                question_forward.h, question_backward.h, passage_forward.h,
                passage_backward.h
            ],
                              axis=1)
            new_c = tf.nn.tanh(tf.matmul(old_c, w_reduce_c) + bias_reduce_c)
            new_h = tf.nn.tanh(tf.matmul(old_h, w_reduce_h) + bias_reduce_h)

            init_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h)
        '''
        new_c = tf.zeros([batch_size, options.gen_hidden_size])
        new_h = tf.zeros([batch_size, options.gen_hidden_size])
        init_state = LSTMStateTuple(new_c, new_h)
        '''
        return (encode_size, encode_hiddens, init_state)
예제 #9
0
    def __init__(self,
                 word_vocab=None,
                 edge_label_vocab=None,
                 char_vocab=None,
                 is_training=True,
                 options=None):
        """Create the placeholders and graph-encoding ops for a batch of graphs.

        Every node is embedded from its word (optionally concatenated with the
        final output of a character LSTM), optionally compressed through a
        tanh projection, optionally dropped out and passed through highway
        layers. For every node the masked sum of its in-neighbours' node and
        edge-label embeddings is projected to ``options.neighbor_vector_dim``.
        A ``tf.while_loop`` then visits node positions ``0 ..
        passage_nodes_size_max - 1`` in order; at each step it sums the hidden
        and cell states of that node's in-neighbours (looked up through
        ``passage_in_neighbor_hidden_indices`` in the states produced so far)
        and applies an LSTM-style gated update.

        Args:
            word_vocab: object exposing ``word_vecs`` and ``word_dim``. It is
                dereferenced unconditionally, so the ``None`` default is not
                usable.
            edge_label_vocab: object exposing ``word_vecs`` and ``word_dim``
                for edge labels; also dereferenced unconditionally.
            char_vocab: object exposing ``word_vecs`` and ``word_dim``; only
                read when ``options.with_char`` is truthy.
            is_training: when truthy, dropout with keep probability
                ``1 - options.dropout_rate`` is applied to the node inputs.
            options: configuration object, must not be ``None``. Fields read
                here: ``with_char``, ``fix_word_vec``, ``char_lstm_dim``,
                ``compress_input``, ``compress_input_dim``, ``dropout_rate``,
                ``with_highway``, ``highway_layer_num``,
                ``neighbor_vector_dim``.

        Attributes set on ``self``:
            passage_nodes_size, passage_nodes, passage_nodes_chars_size and
            passage_nodes_chars (only with ``options.with_char``),
            passage_in_neighbor_indices, passage_in_neighbor_hidden_indices,
            passage_in_neighbor_edges, passage_in_neighbor_mask: placeholders.
            passage_nodes_mask: float mask derived from ``passage_nodes_size``.
            word_embedding, edge_embedding, char_embedding: embedding
                variables (``char_embedding`` only with ``options.with_char``).
            node_representations: the per-node input representation.
            graph_hiddens, graph_cells: the loop outputs, of shape
                ``[batch_size, 1 + passage_nodes_size_max, dag_hidden_dim]``;
                slot 0 along axis 1 is the all-zero initial state.
            batch_size: dynamic batch size tensor.
        """
        assert options is not None, "options must be provided"

        # ---- placeholders ----
        # [batch_size]
        self.passage_nodes_size = tf.placeholder(tf.int32, [None])
        # [batch_size, passage_nodes_size_max]
        self.passage_nodes = tf.placeholder(tf.int32, [None, None])
        if options.with_char:
            self.passage_nodes_chars_size = tf.placeholder(
                tf.int32, [None, None])
            self.passage_nodes_chars = tf.placeholder(tf.int32,
                                                      [None, None, None])

        # [batch_size, passage_nodes_size_max, passage_neighbors_size_max]
        self.passage_in_neighbor_indices = tf.placeholder(
            tf.int32, [None, None, None])
        self.passage_in_neighbor_hidden_indices = tf.placeholder(
            tf.int32, [None, None, None])
        self.passage_in_neighbor_edges = tf.placeholder(
            tf.int32, [None, None, None])
        self.passage_in_neighbor_mask = tf.placeholder(tf.float32,
                                                       [None, None, None])

        # ---- dynamic shapes ----
        input_shape = tf.shape(self.passage_in_neighbor_indices)
        batch_size = input_shape[0]
        passage_nodes_size_max = input_shape[1]
        # Not referenced again in this method; the slice is retained so the
        # sequence of ops added to the graph stays the same as before.
        passage_in_neighbors_size_max = input_shape[2]
        if options.with_char:
            passage_nodes_chars_size_max = tf.shape(
                self.passage_nodes_chars)[2]

        # ---- masks ----
        # [batch_size, passage_nodes_size_max]
        self.passage_nodes_mask = tf.sequence_mask(self.passage_nodes_size,
                                                   passage_nodes_size_max,
                                                   dtype=tf.float32)

        # ---- embeddings ----
        # Frozen word vectors are placed on the CPU, trainable ones on GPU 0.
        word_vec_trainable = True
        cur_device = '/gpu:0'
        if options.fix_word_vec:
            word_vec_trainable = False
            cur_device = '/cpu:0'
        with tf.device(cur_device):
            self.word_embedding = tf.get_variable(
                "word_embedding",
                trainable=word_vec_trainable,
                initializer=tf.constant(word_vocab.word_vecs),
                dtype=tf.float32)

        self.edge_embedding = tf.get_variable(
            "edge_embedding",
            initializer=tf.constant(edge_label_vocab.word_vecs),
            dtype=tf.float32)

        word_dim = word_vocab.word_dim
        edge_dim = edge_label_vocab.word_dim

        if options.with_char:
            self.char_embedding = tf.get_variable(
                "char_embedding",
                initializer=tf.constant(char_vocab.word_vecs),
                dtype=tf.float32)
            char_dim = char_vocab.word_dim

        # word representation for nodes, where each node only includes one word
        # [batch_size, passage_nodes_size_max, word_dim]
        passage_node_representation = tf.nn.embedding_lookup(
            self.word_embedding, self.passage_nodes)
        input_dim = word_dim

        if options.with_char:
            # [batch_size, passage_nodes_size_max, passage_nodes_chars_size_max, char_dim]
            node_chars = tf.nn.embedding_lookup(self.char_embedding,
                                                self.passage_nodes_chars)
            # Fold batch and node axes together so each node is one sequence.
            node_chars = tf.reshape(
                node_chars,
                shape=[
                    batch_size * passage_nodes_size_max,
                    passage_nodes_chars_size_max, char_dim
                ])
            flat_char_lengths = tf.reshape(
                self.passage_nodes_chars_size,
                [batch_size * passage_nodes_size_max])
            with tf.variable_scope('node_char_lstm'):
                node_char_lstm_cell = tf.contrib.rnn.LSTMCell(
                    options.char_lstm_dim)
                node_char_lstm_cell = tf.contrib.rnn.MultiRNNCell(
                    [node_char_lstm_cell])
                # [batch_size*node_num, char_num, char_lstm_dim]
                node_char_outputs = tf.nn.dynamic_rnn(
                    node_char_lstm_cell,
                    node_chars,
                    sequence_length=flat_char_lengths,
                    dtype=tf.float32)[0]
                # Pick the output at the last character (index length - 1).
                node_char_outputs = collect_final_step_lstm(
                    node_char_outputs, flat_char_lengths - 1)
                # [batch_size, node_num, char_lstm_dim]
                node_char_outputs = tf.reshape(node_char_outputs, [
                    batch_size, passage_nodes_size_max, options.char_lstm_dim
                ])

            input_dim = word_dim + options.char_lstm_dim
            passage_node_representation = tf.concat(
                [passage_node_representation, node_char_outputs], 2)

        # apply the mask
        passage_node_representation = passage_node_representation * tf.expand_dims(
            self.passage_nodes_mask, axis=-1)

        if options.compress_input:  # compress input word vector into smaller vectors
            w_compress = tf.get_variable(
                "w_compress_input", [input_dim, options.compress_input_dim],
                dtype=tf.float32)
            b_compress = tf.get_variable("b_compress_input",
                                         [options.compress_input_dim],
                                         dtype=tf.float32)

            # Flatten to 2-D for the matmul, then restore the node axis.
            passage_node_representation = tf.reshape(
                passage_node_representation, [-1, input_dim])
            passage_node_representation = tf.matmul(
                passage_node_representation, w_compress) + b_compress
            passage_node_representation = tf.tanh(passage_node_representation)
            passage_node_representation = tf.reshape(
                passage_node_representation,
                [batch_size, passage_nodes_size_max, options.compress_input_dim])
            input_dim = options.compress_input_dim

        if is_training:
            passage_node_representation = tf.nn.dropout(
                passage_node_representation, (1 - options.dropout_rate))

        # ======Highway layer======
        if options.with_highway:
            with tf.variable_scope("input_highway"):
                passage_node_representation = match_utils.multi_highway_layer(
                    passage_node_representation, input_dim,
                    options.highway_layer_num)

        # =========== in neighbor
        # [batch_size, passage_len, passage_neighbors_size_max, edge_dim]
        passage_in_neighbor_edge_representations = tf.nn.embedding_lookup(
            self.edge_embedding, self.passage_in_neighbor_edges)
        # [batch_size, passage_len, passage_neighbors_size_max, node_dim]
        passage_in_neighbor_node_representations = collect_neighbor_node_representations(
            passage_node_representation, self.passage_in_neighbor_indices)

        passage_in_neighbor_representations = tf.concat([
            passage_in_neighbor_node_representations,
            passage_in_neighbor_edge_representations
        ], 3)
        # Zero out padded neighbour slots before summing over neighbours.
        passage_in_neighbor_representations = tf.multiply(
            passage_in_neighbor_representations,
            tf.expand_dims(self.passage_in_neighbor_mask, axis=-1))
        # [batch_size, passage_len, node_dim + edge_dim]
        passage_in_neighbor_representations = tf.reduce_sum(
            passage_in_neighbor_representations, axis=2)

        # =====transform neighbor_representations
        dag_hidden_dim = options.neighbor_vector_dim
        w_trans = tf.get_variable("w_trans",
                                  [input_dim + edge_dim, dag_hidden_dim],
                                  dtype=tf.float32)
        b_trans = tf.get_variable("b_trans", [dag_hidden_dim],
                                  dtype=tf.float32)

        passage_in_neighbor_representations = tf.reshape(
            passage_in_neighbor_representations, [-1, input_dim + edge_dim])
        passage_in_neighbor_representations = tf.matmul(
            passage_in_neighbor_representations, w_trans) + b_trans
        passage_in_neighbor_representations = tf.tanh(
            passage_in_neighbor_representations)

        passage_in_neighbor_representations = tf.reshape(
            passage_in_neighbor_representations,
            [batch_size, passage_nodes_size_max, dag_hidden_dim])
        passage_in_neighbor_representations = tf.multiply(
            passage_in_neighbor_representations,
            tf.expand_dims(self.passage_nodes_mask, axis=-1))

        with tf.variable_scope('gated_operations'):

            def _gate_variables(gate_name):
                """Create (input weights, recurrent weights, bias) for a gate.

                The variables are named ``w_in_<gate>``, ``u_in_<gate>`` and
                ``b_in_<gate>`` and are created in that order.
                """
                square = [dag_hidden_dim, dag_hidden_dim]
                return (
                    tf.get_variable("w_in_" + gate_name, square,
                                    dtype=tf.float32),
                    tf.get_variable("u_in_" + gate_name, square,
                                    dtype=tf.float32),
                    tf.get_variable("b_in_" + gate_name, [dag_hidden_dim],
                                    dtype=tf.float32),
                )

            w_in_ingate, u_in_ingate, b_ingate = _gate_variables("ingate")
            w_in_forgetgate, u_in_forgetgate, b_forgetgate = _gate_variables(
                "forgetgate")
            w_in_outgate, u_in_outgate, b_outgate = _gate_variables("outgate")
            w_in_cell, u_in_cell, b_cell = _gate_variables("cell")

            # assume each node has a neighbor vector, and it is None at the beginning
            # Slot 0 of the state tensors is this all-zero state; each loop
            # step appends one more slot along axis 1.
            # NOTE(review): presumably passage_in_neighbor_hidden_indices is
            # offset by one so that index 0 addresses this zero state -
            # confirm against the data pipeline.
            passage_node_hidden = tf.zeros([batch_size, 1, dag_hidden_dim])
            passage_node_cell = tf.zeros([batch_size, 1, dag_hidden_dim])

            idx_var = tf.constant(0)  #tf.Variable(0,trainable=False)

            # body function
            def _recurrence(passage_node_hidden, passage_node_cell, idx_var):
                """Compute the state of node ``idx_var`` and append it."""
                # [batch_size, neighbor_size]
                prev_mask = tf.gather(self.passage_in_neighbor_mask,
                                      idx_var,
                                      axis=1)
                # [batch_size]
                node_mask = tf.gather(self.passage_nodes_mask, idx_var, axis=1)
                # [batch_size, neighbor_size]
                prev_idx = tf.gather(self.passage_in_neighbor_hidden_indices,
                                     idx_var,
                                     axis=1)
                # [batch_size, input_dim]
                prev_input = tf.gather(passage_in_neighbor_representations,
                                       idx_var,
                                       axis=1)

                def _sum_neighbor_states(states):
                    """Masked sum over in-neighbours of already-built states."""
                    # [batch_size, neighbor_size, dag_hidden_dim]
                    gathered = collect_neighbor_node_representations_2D(
                        states, prev_idx)
                    gathered = tf.multiply(gathered,
                                           tf.expand_dims(prev_mask, axis=-1))
                    # [batch_size, dag_hidden_dim]
                    summed = tf.reduce_sum(gathered, axis=1)
                    return tf.multiply(summed,
                                       tf.expand_dims(node_mask, axis=-1))

                prev_hidden = _sum_neighbor_states(passage_node_hidden)
                prev_cell = _sum_neighbor_states(passage_node_cell)

                def _pre_activation(w, u, b):
                    """Affine map of the step input and summed hidden state."""
                    return (tf.matmul(prev_input, w) +
                            tf.matmul(prev_hidden, u) + b)

                ## ig
                passage_edge_ingate = tf.sigmoid(
                    _pre_activation(w_in_ingate, u_in_ingate, b_ingate))
                ## fg
                passage_edge_forgetgate = tf.sigmoid(
                    _pre_activation(w_in_forgetgate, u_in_forgetgate,
                                    b_forgetgate))
                ## og
                passage_edge_outgate = tf.sigmoid(
                    _pre_activation(w_in_outgate, u_in_outgate, b_outgate))
                ## input
                passage_edge_input = tf.tanh(
                    _pre_activation(w_in_cell, u_in_cell, b_cell))

                # calculating new cell and hidden
                passage_edge_cell = (passage_edge_forgetgate * prev_cell +
                                     passage_edge_ingate * passage_edge_input)
                passage_edge_hidden = passage_edge_outgate * tf.tanh(
                    passage_edge_cell)
                # node mask: padded node positions get an all-zero state
                passage_edge_cell = tf.multiply(
                    passage_edge_cell, tf.expand_dims(node_mask, axis=-1))
                passage_edge_hidden = tf.multiply(
                    passage_edge_hidden, tf.expand_dims(node_mask, axis=-1))
                # [batch_size, 1, dag_hidden_dim]
                passage_edge_cell = tf.expand_dims(passage_edge_cell, axis=1)
                passage_edge_hidden = tf.expand_dims(passage_edge_hidden,
                                                     axis=1)
                # concatenating the new state onto the states built so far
                passage_node_hidden = tf.concat(
                    [passage_node_hidden, passage_edge_hidden], axis=1)
                passage_node_cell = tf.concat(
                    [passage_node_cell, passage_edge_cell], axis=1)

                idx_var = tf.add(idx_var, 1)
                return passage_node_hidden, passage_node_cell, idx_var

            def _has_more_nodes(unused_hidden, unused_cell, idx_var):
                """Loop while there are node positions left to process."""
                return tf.less(idx_var, passage_nodes_size_max)

            loop_vars = [passage_node_hidden, passage_node_cell, idx_var]
            # The state tensors grow along axis 1 on every step, so that axis
            # (and the batch axis) must be left unspecified in the invariants.
            passage_node_hidden, passage_node_cell, idx_var = tf.while_loop(
                _has_more_nodes,
                _recurrence,
                loop_vars,
                parallel_iterations=1,
                shape_invariants=[
                    tf.TensorShape([None, None, dag_hidden_dim]),
                    tf.TensorShape([None, None, dag_hidden_dim]),
                    idx_var.get_shape(),
                ])

            # decide how to use graph_representations
            self.node_representations = passage_node_representation
            self.graph_hiddens = passage_node_hidden
            self.graph_cells = passage_node_cell

            self.batch_size = batch_size
    def create_siameseLSTM_model_graph(self,
                                       num_classes,
                                       word_vocab=None,
                                       char_vocab=None,
                                       is_training=True,
                                       global_step=None):
        """Build the siamese BiLSTM classification graph for a sentence pair.

        The question (``self.in_question_words``) and the passage
        (``self.in_passage_words``) are embedded with one shared
        ``word_embedding`` variable, optionally dropped out and passed through
        shared highway layers, then encoded by ``options.context_layer_num``
        BiLSTM layers whose weights are shared between the two inputs. The
        forward/backward outputs are reduced according to
        ``options.lstm_out_type``, combined by
        ``match_utils.siameseLSTM_match_func`` and classified by a two-layer
        feed-forward network.

        Args:
            num_classes: number of output classes.
            word_vocab: object exposing ``word_vecs`` (only its ``.shape`` is
                read here) and ``word_dim``. Required despite the ``None``
                default.
            char_vocab: not used by this method.
            is_training: when truthy, dropout is applied and the training ops
                (regularisation, optimizer, ``self.train_op``) are built.
            global_step: passed to ``apply_gradients`` and, when
                ``options.with_moving_average`` is set, to the moving average.

        Attributes read from ``self``: ``options``, ``in_question_words``,
        ``in_passage_words``, ``question_lengths``, ``passage_lengths``,
        ``truth``.

        Attributes set on ``self``: ``embedding`` (placeholder used to
        initialise the embedding variable), ``word_embedding``, ``prob``,
        ``predictions``, ``loss``, ``eval_correct`` and, when training,
        ``train_op``.

        Raises:
            ValueError: if ``word_vocab`` is ``None``, or, when training, if
                ``options.optimize_type`` is neither ``'adadelta'`` nor
                ``'adam'``.
        """
        if word_vocab is None:
            # Word embeddings are the only input features built here; without
            # them there is nothing to concatenate and no sequence lengths.
            raise ValueError(
                "word_vocab is required to build the siamese LSTM graph")

        options = self.options

        # ======word representation layer======
        # Frozen word vectors are placed on the CPU, trainable ones on GPU 0.
        word_vec_trainable = not options.fix_word_vec
        cur_device = '/cpu:0' if options.fix_word_vec else '/gpu:0'
        with tf.device(cur_device):
            # The embedding matrix is fed through a placeholder at
            # initialisation time instead of being baked in as a constant.
            self.embedding = tf.placeholder(tf.float32,
                                            shape=word_vocab.word_vecs.shape)
            self.word_embedding = tf.get_variable(
                "word_embedding",
                trainable=word_vec_trainable,
                initializer=self.embedding,
                dtype=tf.float32)  # tf.constant(word_vocab.word_vecs)

        # [batch_size, question_len, word_dim]
        in_question_word_repres = tf.nn.embedding_lookup(
            self.word_embedding, self.in_question_words)
        # [batch_size, passage_len, word_dim]
        in_passage_word_repres = tf.nn.embedding_lookup(
            self.word_embedding, self.in_passage_words)

        question_shape = tf.shape(self.in_question_words)
        # Not referenced again in this method; the slice is retained so the
        # sequence of ops added to the graph stays the same as before.
        batch_size = question_shape[0]
        question_len = question_shape[1]
        passage_len = tf.shape(self.in_passage_words)[1]
        input_dim = word_vocab.word_dim

        # [batch_size, question_len, dim]
        in_question_repres = tf.concat(axis=2,
                                       values=[in_question_word_repres])
        # [batch_size, passage_len, dim]
        in_passage_repres = tf.concat(axis=2, values=[in_passage_word_repres])

        if is_training:
            keep_prob = 1 - options.dropout_rate
            in_question_repres = tf.nn.dropout(in_question_repres, keep_prob)
            in_passage_repres = tf.nn.dropout(in_passage_repres, keep_prob)

        # [batch_size, passage_len]
        passage_mask = tf.sequence_mask(self.passage_lengths,
                                        passage_len,
                                        dtype=tf.float32)
        # [batch_size, question_len]
        question_mask = tf.sequence_mask(self.question_lengths,
                                         question_len,
                                         dtype=tf.float32)

        # ======Highway layer======
        if options.with_highway:
            with tf.variable_scope("input_highway"):
                in_question_repres = match_utils.multi_highway_layer(
                    in_question_repres, input_dim, options.highway_layer_num)
                # Second call reuses the highway weights created above.
                tf.get_variable_scope().reuse_variables()
                in_passage_repres = match_utils.multi_highway_layer(
                    in_passage_repres, input_dim, options.highway_layer_num)

        # ======BiLSTM context layer======
        # Multiple context layers are supported; each layer consumes the third
        # value returned by the previous one, and only the last layer's
        # forward/backward outputs are used after the loop.
        for layer_idx in range(options.context_layer_num):
            with tf.variable_scope('bilstm-layer-{}'.format(layer_idx)):
                # contextual lstm for both passage and question
                in_question_repres = tf.multiply(
                    in_question_repres, tf.expand_dims(question_mask, axis=-1))
                (question_context_representation_fw,
                 question_context_representation_bw,
                 in_question_repres) = layer_utils.my_lstm_layer(
                     in_question_repres,
                     options.context_lstm_dim,
                     input_lengths=self.question_lengths,
                     scope_name="context_represent",
                     reuse=False,
                     is_training=is_training,
                     dropout_rate=options.dropout_rate,
                     use_cudnn=options.use_cudnn)

                # Encode the second sentence, using the same LSTM weights.
                tf.get_variable_scope().reuse_variables()
                in_passage_repres = tf.multiply(
                    in_passage_repres, tf.expand_dims(passage_mask, axis=-1))
                (passage_context_representation_fw,
                 passage_context_representation_bw,
                 in_passage_repres) = layer_utils.my_lstm_layer(
                     in_passage_repres,
                     options.context_lstm_dim,
                     input_lengths=self.passage_lengths,
                     scope_name="context_represent",
                     reuse=True,
                     is_training=is_training,
                     dropout_rate=options.dropout_rate,
                     use_cudnn=options.use_cudnn)

        # Reduce each direction's per-step outputs to one vector per example.
        # NOTE(review): any other lstm_out_type value leaves the outputs
        # un-reduced before the concat below - confirm whether that is intended.
        if options.lstm_out_type == 'mean':
            question_context_representation_fw = layer_utils.collect_mean_step_of_lstm(
                question_context_representation_fw)
            question_context_representation_bw = layer_utils.collect_mean_step_of_lstm(
                question_context_representation_bw)
            passage_context_representation_fw = layer_utils.collect_mean_step_of_lstm(
                passage_context_representation_fw)
            passage_context_representation_bw = layer_utils.collect_mean_step_of_lstm(
                passage_context_representation_bw)
        elif options.lstm_out_type == 'end':
            # Forward direction: output at the last valid step (length - 1);
            # backward direction: output at step 0.
            question_context_representation_fw = layer_utils.collect_final_step_of_lstm(
                question_context_representation_fw, self.question_lengths - 1)
            question_context_representation_bw = (
                question_context_representation_bw[:, 0, :])
            passage_context_representation_fw = layer_utils.collect_final_step_of_lstm(
                passage_context_representation_fw, self.passage_lengths - 1)
            passage_context_representation_bw = (
                passage_context_representation_bw[:, 0, :])

        question_context_outputs = tf.concat(
            axis=1,
            values=[
                question_context_representation_fw,
                question_context_representation_bw
            ])
        passage_context_outputs = tf.concat(
            axis=1,
            values=[
                passage_context_representation_fw,
                passage_context_representation_bw
            ])

        (match_representation, match_dim) = match_utils.siameseLSTM_match_func(
            question_context_outputs, passage_context_outputs,
            options.context_lstm_dim)

        #========Prediction Layer=========
        hidden_dim = int(match_dim / 2)
        w_0 = tf.get_variable("w_0", [match_dim, hidden_dim],
                              dtype=tf.float32)
        b_0 = tf.get_variable("b_0", [hidden_dim], dtype=tf.float32)
        w_1 = tf.get_variable("w_1", [hidden_dim, num_classes],
                              dtype=tf.float32)
        b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

        # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
        logits = tf.matmul(match_representation, w_0) + b_0
        logits = tf.nn.relu(logits)
        if is_training:
            logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
        logits = tf.matmul(logits, w_1) + b_1

        self.prob = tf.nn.softmax(logits)
        self.predictions = tf.argmax(self.prob, 1)

        gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=gold_matrix))

        correct = tf.nn.in_top_k(logits, self.truth, 1)
        self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))

        # Everything below is only needed for training.
        if not is_training:
            return

        tvars = tf.trainable_variables()
        # Regularise only variables of rank > 1 (rank-1 variables such as the
        # biases created above are skipped).
        if options.lambda_l1 > 0.0:
            l1_loss = tf.add_n([
                tf.contrib.layers.l1_regularizer(options.lambda_l1)(v)
                for v in tvars if v.get_shape().ndims > 1
            ])
            self.loss = self.loss + l1_loss
        if options.lambda_l2 > 0.0:
            # l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            l2_loss = tf.add_n([
                tf.contrib.layers.l2_regularizer(options.lambda_l2)(v)
                for v in tvars if v.get_shape().ndims > 1
            ])
            self.loss = self.loss + l2_loss

        optimize_type = options.optimize_type
        if optimize_type == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(
                learning_rate=options.learning_rate)
        elif optimize_type == 'adam':
            optimizer = tf.train.AdamOptimizer(
                learning_rate=options.learning_rate)
        else:
            # Previously this fell through and failed later with a NameError
            # on the unbound ``optimizer``.
            raise ValueError(
                "Unsupported optimize_type {!r}; expected 'adadelta' or "
                "'adam'".format(optimize_type))

        grads = layer_utils.compute_gradients(self.loss, tvars)
        grads, _ = tf.clip_by_global_norm(grads, options.grad_clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                  global_step=global_step)
        # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        if options.with_moving_average:
            # Track the moving averages of all trainable variables.
            MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
            variable_averages = tf.train.ExponentialMovingAverage(
                MOVING_AVERAGE_DECAY, global_step)
            variables_averages_op = variable_averages.apply(
                tf.trainable_variables())
            # Run the parameter update and the averaging together.
            self.train_op = tf.group(self.train_op, variables_averages_op)
    def create_mpcnn_model_graph(self,
                                 num_classes,
                                 word_vocab=None,
                                 char_vocab=None,
                                 is_training=True,
                                 global_step=None):
        """
        """
        options = self.options
        # ======word representation layer======
        in_question_repres = []
        in_passage_repres = []
        input_dim = 0
        if word_vocab is not None:
            word_vec_trainable = True
            cur_device = '/gpu:0'
            if options.fix_word_vec:
                word_vec_trainable = False
                cur_device = '/cpu:0'
            with tf.device(cur_device):
                self.embedding = tf.placeholder(
                    tf.float32, shape=word_vocab.word_vecs.shape)
                self.word_embedding = tf.get_variable(
                    "word_embedding",
                    trainable=word_vec_trainable,
                    initializer=self.embedding,
                    dtype=tf.float32)  # tf.constant(word_vocab.word_vecs)

            in_question_word_repres = tf.nn.embedding_lookup(
                self.word_embedding,
                self.in_question_words)  # [batch_size, question_len, word_dim]
            in_passage_word_repres = tf.nn.embedding_lookup(
                self.word_embedding,
                self.in_passage_words)  # [batch_size, passage_len, word_dim]
            in_question_repres.append(in_question_word_repres)
            in_passage_repres.append(in_passage_word_repres)

            input_shape = tf.shape(self.in_question_words)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_words)
            passage_len = input_shape[1]
            input_dim += word_vocab.word_dim

        in_question_repres = tf.concat(
            axis=2,
            values=in_question_repres)  # [batch_size, question_len, dim]
        in_passage_repres = tf.concat(
            axis=2, values=in_passage_repres)  # [batch_size, passage_len, dim]

        if is_training:
            in_question_repres = tf.nn.dropout(in_question_repres,
                                               (1 - options.dropout_rate))
            in_passage_repres = tf.nn.dropout(in_passage_repres,
                                              (1 - options.dropout_rate))

        mask = tf.sequence_mask(self.passage_lengths,
                                passage_len,
                                dtype=tf.float32)  # [batch_size, passage_len]
        question_mask = tf.sequence_mask(
            self.question_lengths, question_len,
            dtype=tf.float32)  # [batch_size, question_len]

        # ======Highway layer======
        if options.with_highway:
            with tf.variable_scope("input_highway"):
                in_question_repres = match_utils.multi_highway_layer(
                    in_question_repres, input_dim, options.highway_layer_num)
                tf.get_variable_scope().reuse_variables()
                in_passage_repres = match_utils.multi_highway_layer(
                    in_passage_repres, input_dim, options.highway_layer_num)

        in_question_repres = tf.expand_dims(
            in_question_repres, -1)  # [batch_size, question_len, word_dim, 1]
        in_passage_repres = tf.expand_dims(
            in_passage_repres, -1)  # [batch_size, passage_len, word_dim, 1]

        # ======Multi-perspective CNN Matching======
        filter_sizes = options.filter_sizes
        num_filters = options.num_filters
        poolings = list([tf.reduce_max, tf.reduce_min,
                         tf.reduce_mean])[:options.num_poolings]

        W1 = [
            tf.get_variable(
                "W1_%s" % i,
                initializer=tf.truncated_normal(
                    [filter_sizes[i], input_dim, 1, num_filters[0]],
                    stddev=0.1),
                dtype=tf.float32) for i in range(len(filter_sizes))
        ]
        b1 = [
            tf.get_variable("b1_%s" % i,
                            initializer=tf.constant(0.01,
                                                    shape=[num_filters[0]]),
                            dtype=tf.float32) for i in range(len(filter_sizes))
        ]

        W2 = [
            tf.get_variable(
                "W2_%s" % i,
                initializer=tf.truncated_normal(
                    [filter_sizes[i], input_dim, 1, num_filters[1]],
                    stddev=0.1),
                dtype=tf.float32) for i in range(len(filter_sizes) - 1)
        ]
        b2 = [
            tf.get_variable(
                "b2_%s" % i,
                initializer=tf.constant(0.01,
                                        shape=[num_filters[1], input_dim]),
                dtype=tf.float32) for i in range(len(filter_sizes) - 1)
        ]

        sent1_blockA = layer_utils.build_block_A(
            in_question_repres, filter_sizes, poolings, W1, b1, is_training
        )  # len(poolings) * len(filter_sizes) * [batch_size, 1, num_filters_A]
        sent2_blockA = layer_utils.build_block_A(
            in_passage_repres, filter_sizes, poolings, W1, b1, is_training
        )  # len(poolings) * len(filter_sizes) * [batch_size, 1, num_filters_A]

        sent1_blockB = layer_utils.build_block_B(
            in_question_repres, filter_sizes, poolings, W2, b2, is_training
        )  # (len(poolings))-1 * (len(filter_sizes)-1) * [batch_size, embed_size, num_filters_B]
        sent2_blockB = layer_utils.build_block_B(
            in_passage_repres, filter_sizes, poolings, W2, b2, is_training
        )  # (len(poolings))-1 * (len(filter_sizes)-1) * [batch_size, embed_size, num_filters_B]

        (match_representation, match_dim) = match_utils.mpcnn_match_func(
            sent1_blockA, sent2_blockA, sent1_blockB, sent2_blockB, poolings,
            filter_sizes, num_filters)

        #========Prediction Layer=========
        w_0 = tf.get_variable("w_0", [match_dim, int(match_dim / 2)],
                              dtype=tf.float32)
        b_0 = tf.get_variable("b_0", [int(match_dim / 2)], dtype=tf.float32)
        w_1 = tf.get_variable("w_1", [int(match_dim / 2), num_classes],
                              dtype=tf.float32)
        b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

        # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
        logits = tf.matmul(match_representation, w_0) + b_0
        logits = tf.nn.relu(logits)
        if is_training:
            logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
        logits = tf.matmul(logits, w_1) + b_1

        self.prob = tf.nn.softmax(logits)
        self.predictions = tf.argmax(self.prob, 1)

        gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=gold_matrix))

        correct = tf.nn.in_top_k(logits, self.truth, 1)
        self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))

        if not is_training: return

        if options.with_f1_metric:
            # acc, acc_op = tf.metrics.accuracy(labels=self.truth, predictions=self.predictions)
            precision, pre_op = tf.metrics.precision(
                labels=self.truth, predictions=self.predictions)
            recall, rec_op = tf.metrics.recall(labels=self.truth,
                                               predictions=self.predictions)
            f1 = 2 * precision * recall / (precision + recall + 1e-6)
            self.loss = self.loss - 0.1 * tf.reduce_mean(f1)

        tvars = tf.trainable_variables()
        if self.options.lambda_l1 > 0.0:
            l1_loss = tf.add_n([
                tf.contrib.layers.l1_regularizer(self.options.lambda_l1)(v)
                for v in tvars if v.get_shape().ndims > 1
            ])
            self.loss = self.loss + l1_loss
        if self.options.lambda_l2 > 0.0:
            # l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            l2_loss = tf.add_n([
                tf.contrib.layers.l2_regularizer(self.options.lambda_l2)(v)
                for v in tvars if v.get_shape().ndims > 1
            ])
            self.loss = self.loss + l2_loss

        if self.options.optimize_type == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(
                learning_rate=self.options.learning_rate)
        elif self.options.optimize_type == 'adam':
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.options.learning_rate)

        grads = layer_utils.compute_gradients(self.loss, tvars)
        grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                  global_step=global_step)
        # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        if self.options.with_moving_average:
            # Track the moving averages of all trainable variables.
            MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
            variable_averages = tf.train.ExponentialMovingAverage(
                MOVING_AVERAGE_DECAY, global_step)
            variables_averages_op = variable_averages.apply(
                tf.trainable_variables())
            train_ops = [self.train_op, variables_averages_op]
            self.train_op = tf.group(*train_ops)
예제 #12
0
    def gcn_encode(self,
                   batch_nodes,
                   embedded_node_rep,
                   fw_adj_info,
                   bw_adj_info,
                   input_node_dim,
                   output_node_dim,
                   fw_aggregators,
                   bw_aggregators,
                   window_size,
                   layer_size,
                   scope,
                   agg_type,
                   sample_size_per_layer,
                   keep_inter_state=False):
        """Encode a batch of graphs with stacked neighbourhood aggregators.

        For every layer and every hop the current node states are combined
        with the states of sampled neighbours by an aggregator. This is done
        in the forward direction and, when ``self.graph_encode_direction`` is
        ``"bi"``, also in the backward direction.

        Args:
            batch_nodes: node-id tensor; it is flattened for the lookups and
                its second dimension is used as the per-graph node count.
            embedded_node_rep: table the initial node states (and the
                first-hop neighbour states) are looked up from.
            fw_adj_info: adjacency data handed to the forward
                ``UniformNeighborSampler``.
            bw_adj_info: adjacency data handed to the backward
                ``UniformNeighborSampler``.
            input_node_dim: width of the initial node states.
            output_node_dim: output width handed to every new aggregator.
            fw_aggregators: list (per layer) of lists (per hop) of forward
                aggregators. It is extended IN PLACE with any aggregator
                built here, so a later call given the same list reuses them.
            bw_aggregators: same as ``fw_aggregators`` for the backward
                direction.
            window_size: number of hops run inside each layer.
            layer_size: number of layers.
            scope: name of the variable scope everything is built under.
            agg_type: one of ``"GCN"``, ``"mean_pooling"``,
                ``"max_pooling"``, ``"lstm"`` or ``"att"``.
            sample_size_per_layer: sample size passed to both samplers.
            keep_inter_state: when true, the state after every hop is also
                returned.

        Returns:
            A list ``[hidden, max_graph_embedding, mean_graph_embedding,
            graph_dim]`` where ``hidden`` is reshaped to
            ``[-1, single_graph_nodes_size, graph_dim]`` and the two graph
            embeddings are the max / mean of ``hidden`` over axis 1.
            With ``keep_inter_state`` three more items are appended:
            ``inter_node_reps``, ``inter_graph_reps`` (a list of
            ``(max, mean)`` tuples) and ``inter_graph_dims``.

        Raises:
            ValueError: if a new aggregator has to be built and ``agg_type``
                is not one of the supported names.
        """
        is_bidirectional = self.graph_encode_direction == "bi"

        with tf.variable_scope(scope):
            single_graph_nodes_size = tf.shape(batch_nodes)[1]
            # ============ encode graph structure ==========
            fw_sampler = UniformNeighborSampler(fw_adj_info)
            bw_sampler = UniformNeighborSampler(bw_adj_info)
            nodes = tf.reshape(batch_nodes, [-1])

            # fw_hidden and bw_hidden start out as the initial node embedding
            # [node_size, dim_size]
            fw_hidden = tf.nn.embedding_lookup(embedded_node_rep, nodes)
            bw_hidden = tf.nn.embedding_lookup(embedded_node_rep, nodes)

            # [node_size, adj_size]
            fw_sampled_neighbors = fw_sampler((nodes, sample_size_per_layer))
            bw_sampled_neighbors = bw_sampler((nodes, sample_size_per_layer))

            inter_fw_hiddens = []
            inter_bw_hiddens = []
            inter_dims = []

            if scope == "first_gcn":
                self.watch["node_1_rep_in_first_gcn"] = []

            fw_hidden_dim = input_node_dim
            # layer is the index of convolution and hop is used to combine
            # information
            for layer in range(layer_size):
                # NOTE(review): this key is only (re)initialised when
                # scope == "first_gcn", yet every call appends to it; a call
                # under another scope before any "first_gcn" call would raise
                # KeyError. Left unchanged to preserve what gets recorded.
                self.watch["node_1_rep_in_first_gcn"].append(fw_hidden)

                if len(fw_aggregators) <= layer:
                    fw_aggregators.append([])
                if len(bw_aggregators) <= layer:
                    bw_aggregators.append([])

                for hop in range(window_size):
                    is_first_step = layer == 0 and hop == 0

                    # ---------------- forward direction ----------------
                    fw_aggregator = self._select_or_create_aggregator(
                        fw_aggregators[layer], hop, agg_type, fw_hidden_dim,
                        output_node_dim)

                    # [node_size, adj_size, word_embedding_dim]
                    if is_first_step:
                        neigh_vec_hidden = tf.nn.embedding_lookup(
                            embedded_node_rep, fw_sampled_neighbors)
                    else:
                        # One extra all-zero row is appended below the
                        # current states before the neighbour lookup.
                        neigh_vec_hidden = tf.nn.embedding_lookup(
                            tf.concat(
                                [fw_hidden,
                                 tf.zeros([1, fw_hidden_dim])], 0),
                            fw_sampled_neighbors)

                    # Remember the pre-aggregation width: the backward state
                    # has not been aggregated for this hop yet.
                    bw_hidden_dim = fw_hidden_dim

                    fw_hidden, fw_hidden_dim = fw_aggregator(
                        (fw_hidden, neigh_vec_hidden))

                    if keep_inter_state:
                        inter_fw_hiddens.append(fw_hidden)
                        inter_dims.append(fw_hidden_dim)

                    if not is_bidirectional:
                        continue

                    # ---------------- backward direction ----------------
                    bw_aggregator = self._select_or_create_aggregator(
                        bw_aggregators[layer], hop, agg_type, bw_hidden_dim,
                        output_node_dim)

                    if is_first_step:
                        neigh_vec_hidden = tf.nn.embedding_lookup(
                            embedded_node_rep, bw_sampled_neighbors)
                    else:
                        # The padding row must match bw_hidden's own width;
                        # fw_hidden_dim already holds the post-aggregation
                        # forward width at this point.
                        neigh_vec_hidden = tf.nn.embedding_lookup(
                            tf.concat(
                                [bw_hidden,
                                 tf.zeros([1, bw_hidden_dim])], 0),
                            bw_sampled_neighbors)

                    if self.with_gcn_highway:
                        # NOTE(review): `options` is neither a parameter nor
                        # a local; it has to resolve at module level.
                        with tf.variable_scope("bw_hidden_highway"):
                            # Sized with bw_hidden_dim for the same reason as
                            # the padding row above.
                            bw_hidden = multi_highway_layer(
                                bw_hidden, bw_hidden_dim,
                                options['highway_layer_num'])

                    bw_hidden, bw_hidden_dim = bw_aggregator(
                        (bw_hidden, neigh_vec_hidden))

                    if keep_inter_state:
                        inter_bw_hiddens.append(bw_hidden)

            node_dim = fw_hidden_dim

            # hidden stores the representation for all nodes
            fw_hidden = tf.reshape(fw_hidden,
                                   [-1, single_graph_nodes_size, node_dim])
            if is_bidirectional:
                bw_hidden = tf.reshape(bw_hidden,
                                       [-1, single_graph_nodes_size, node_dim])
                hidden = tf.concat([fw_hidden, bw_hidden], axis=2)
                graph_dim = 2 * node_dim
            else:
                hidden = fw_hidden
                graph_dim = node_dim

            hidden = tf.nn.relu(hidden)
            max_pooled = tf.reduce_max(hidden, 1)
            mean_pooled = tf.reduce_mean(hidden, 1)
            res = [hidden]

            max_graph_embedding = tf.reshape(max_pooled, [-1, graph_dim])
            mean_graph_embedding = tf.reshape(mean_pooled, [-1, graph_dim])
            res.append(max_graph_embedding)
            res.append(mean_graph_embedding)
            res.append(graph_dim)

            if keep_inter_state:
                inter_node_reps = []
                inter_graph_reps = []
                inter_graph_dims = []
                # process the intermediate hidden states, one per hop
                for idx, (inter_fw_hidden, inter_dim) in enumerate(
                        zip(inter_fw_hiddens, inter_dims)):
                    inter_fw_hidden = tf.reshape(
                        inter_fw_hidden,
                        [-1, single_graph_nodes_size, inter_dim])

                    if is_bidirectional:
                        # Backward states exist only in "bi" mode, so they
                        # are only indexed here.
                        inter_bw_hidden = tf.reshape(
                            inter_bw_hiddens[idx],
                            [-1, single_graph_nodes_size, inter_dim])
                        inter_hidden = tf.concat(
                            [inter_fw_hidden, inter_bw_hidden], axis=2)
                        inter_graph_dim = inter_dim * 2
                    else:
                        inter_hidden = inter_fw_hidden
                        inter_graph_dim = inter_dim

                    inter_node_rep = tf.nn.relu(inter_hidden)
                    inter_node_reps.append(inter_node_rep)
                    inter_graph_dims.append(inter_graph_dim)

                    max_pooled_tmp = tf.reduce_max(inter_node_rep, 1)
                    # Mean pooling, matching how the final state is pooled.
                    mean_pooled_tmp = tf.reduce_mean(inter_node_rep, 1)
                    max_graph_embedding = tf.reshape(max_pooled_tmp,
                                                     [-1, inter_graph_dim])
                    mean_graph_embedding = tf.reshape(mean_pooled_tmp,
                                                      [-1, inter_graph_dim])
                    inter_graph_reps.append(
                        (max_graph_embedding, mean_graph_embedding))

                res.append(inter_node_reps)
                res.append(inter_graph_reps)
                res.append(inter_graph_dims)

            return res

    def _select_or_create_aggregator(self, layer_aggregators, hop, agg_type,
                                     input_dim, output_dim):
        """Return the aggregator for ``hop``, building and caching it if absent.

        ``layer_aggregators`` is the per-hop list of one layer; a newly built
        aggregator is appended to it in place.
        """
        # Every hop past index 6 shares the aggregator stored at index 6.
        if hop > 6:
            return layer_aggregators[6]
        if len(layer_aggregators) > hop:
            return layer_aggregators[hop]

        shared_kwargs = dict(concat=self.concat,
                             dropout=self.dropout,
                             mode=self.mode)
        if agg_type == "GCN":
            aggregator = GCNAggregator(input_dim, output_dim, **shared_kwargs)
        elif agg_type == "mean_pooling":
            aggregator = MeanAggregator(
                input_dim,
                output_dim,
                if_use_high_way=self.with_gcn_highway,
                **shared_kwargs)
        elif agg_type == "max_pooling":
            aggregator = MaxPoolingAggregator(input_dim, output_dim,
                                              **shared_kwargs)
        elif agg_type == "lstm":
            aggregator = SeqAggregator(input_dim, output_dim, **shared_kwargs)
        elif agg_type == "att":
            aggregator = AttentionAggregator(input_dim, output_dim,
                                             **shared_kwargs)
        else:
            raise ValueError("unsupported agg_type: %r" % (agg_type, ))

        layer_aggregators.append(aggregator)
        return aggregator
예제 #13
0
    def _build_graph(self):
        node_1_mask = self.batch_mask_first
        node_2_mask = self.batch_mask_second
        node_1_looking_table = self.looking_table_first
        node_2_looking_table = self.looking_table_second

        node_2_aware_representations = []
        node_2_aware_dim = 0
        node_1_aware_representations = []
        node_1_aware_dim = 0

        pad_word_embedding = tf.zeros([1, self.word_embedding_dim
                                       ])  # this is for the PAD symbol
        self.word_embeddings = tf.concat([
            pad_word_embedding,
            tf.get_variable(
                'pretrained_embedding',
                shape=[self.pretrained_word_size, self.word_embedding_dim],
                initializer=tf.constant_initializer(
                    self.pretrained_word_embeddings),
                trainable=True),
            tf.get_variable(
                'W_train',
                shape=[self.learned_word_size, self.word_embedding_dim],
                initializer=tf.contrib.layers.xavier_initializer(),
                trainable=True)
        ], 0)

        self.watch['word_embeddings'] = self.word_embeddings

        # ============ encode node feature by looking up word embedding =============
        with tf.variable_scope('node_rep_gen'):
            # [node_size, hidden_layer_dim]
            feature_embedded_chars_first = tf.nn.embedding_lookup(
                self.word_embeddings, self.feature_info_first)
            graph_1_size = tf.shape(feature_embedded_chars_first)[0]

            feature_embedded_chars_second = tf.nn.embedding_lookup(
                self.word_embeddings, self.feature_info_second)
            graph_2_size = tf.shape(feature_embedded_chars_second)[0]

            if self.node_vec_method == "lstm":
                cell = self.build_encoder_cell(1, self.hidden_layer_dim)

                outputs, hidden_states = tf.nn.dynamic_rnn(
                    cell=cell,
                    inputs=feature_embedded_chars_first,
                    sequence_length=self.feature_len_first,
                    dtype=tf.float32)
                node_1_rep = layer_utils.collect_final_step_of_lstm(
                    outputs, self.feature_len_first - 1)

                outputs, hidden_states = tf.nn.dynamic_rnn(
                    cell=cell,
                    inputs=feature_embedded_chars_second,
                    sequence_length=self.feature_len_second,
                    dtype=tf.float32)
                node_2_rep = layer_utils.collect_final_step_of_lstm(
                    outputs, self.feature_len_second - 1)

            elif self.node_vec_method == "word_emb":
                node_1_rep = tf.reshape(feature_embedded_chars_first,
                                        [graph_1_size, -1])
                node_2_rep = tf.reshape(feature_embedded_chars_second,
                                        [graph_2_size, -1])

            self.watch["node_1_rep_initial"] = node_1_rep

        # ============ encode node feature by GCN =============
        with tf.variable_scope('first_gcn') as first_gcn_scope:
            # shape of node embedding: [batch_size, single_graph_nodes_size, node_embedding_dim]
            # shape of node size: [batch_size]
            gcn_1_res = self.gcn_encode(
                self.batch_nodes_first,
                node_1_rep,
                self.fw_adj_info_first,
                self.bw_adj_info_first,
                input_node_dim=self.word_embedding_dim,
                output_node_dim=self.aggregator_dim_first,
                fw_aggregators=self.fw_aggregators_first,
                bw_aggregators=self.bw_aggregators_first,
                window_size=self.gcn_window_size_first,
                layer_size=self.gcn_layer_size_first,
                scope="first_gcn",
                agg_type=self.agg_type_first,
                sample_size_per_layer=self.sample_size_per_layer_first,
                keep_inter_state=self.if_use_multiple_gcn_1_state)

            node_1_rep = gcn_1_res[0]
            node_1_rep_dim = gcn_1_res[3]

            gcn_2_res = self.gcn_encode(
                self.batch_nodes_second,
                node_2_rep,
                self.fw_adj_info_second,
                self.bw_adj_info_second,
                input_node_dim=self.word_embedding_dim,
                output_node_dim=self.aggregator_dim_first,
                fw_aggregators=self.fw_aggregators_first,
                bw_aggregators=self.bw_aggregators_first,
                window_size=self.gcn_window_size_first,
                layer_size=self.gcn_layer_size_first,
                scope="first_gcn",
                agg_type=self.agg_type_first,
                sample_size_per_layer=self.sample_size_per_layer_second,
                keep_inter_state=self.if_use_multiple_gcn_1_state)

            node_2_rep = gcn_2_res[0]
            node_2_rep_dim = gcn_2_res[3]

        self.watch["node_1_rep_first_GCN"] = node_1_rep
        self.watch["node_1_mask"] = node_1_mask

        # mask
        node_1_rep = tf.multiply(node_1_rep, tf.expand_dims(node_1_mask, 2))
        node_2_rep = tf.multiply(node_2_rep, tf.expand_dims(node_2_mask, 2))

        self.watch["node_1_rep_first_GCN_masked"] = node_1_rep

        if self.pred_method == "node_level":
            entity_1_rep = tf.reshape(
                tf.nn.embedding_lookup(tf.transpose(node_1_rep, [1, 0, 2]),
                                       tf.constant(0)), [-1, node_1_rep_dim])
            entity_2_rep = tf.reshape(
                tf.nn.embedding_lookup(tf.transpose(node_2_rep, [1, 0, 2]),
                                       tf.constant(0)), [-1, node_2_rep_dim])

            entity_1_2_diff = entity_1_rep - entity_2_rep
            entity_1_2_sim = entity_1_rep * entity_2_rep

            aggregation = tf.concat(
                [entity_1_rep, entity_2_rep, entity_1_2_diff, entity_1_2_sim],
                axis=1)
            aggregation_dim = 4 * node_1_rep_dim

            w_0 = tf.get_variable("w_0",
                                  [aggregation_dim, aggregation_dim / 2],
                                  dtype=tf.float32)
            b_0 = tf.get_variable("b_0", [aggregation_dim / 2],
                                  dtype=tf.float32)
            w_1 = tf.get_variable("w_1", [aggregation_dim / 2, 2],
                                  dtype=tf.float32)
            b_1 = tf.get_variable("b_1", [2], dtype=tf.float32)

            # ====== Prediction Layer ===============
            logits = tf.matmul(aggregation, w_0) + b_0
            logits = tf.tanh(logits)
            logits = tf.matmul(logits, w_1) + b_1

        elif self.pred_method == "graph_level":
            # if the prediction method is graph_level, we perform the graph matching based prediction

            assert node_1_rep_dim == node_2_rep_dim
            input_dim = node_1_rep_dim

            with tf.variable_scope('node_level_matching') as matching_scope:
                # ========= node level matching ===============
                (match_reps,
                 match_dim) = match_graph_1_with_graph_2(node_1_rep,
                                                         node_2_rep,
                                                         node_1_mask,
                                                         node_2_mask,
                                                         input_dim,
                                                         options=options,
                                                         watch=self.watch)

                matching_scope.reuse_variables()

                node_2_aware_representations.append(match_reps)
                node_2_aware_dim += match_dim

                (match_reps,
                 match_dim) = match_graph_1_with_graph_2(node_2_rep,
                                                         node_1_rep,
                                                         node_2_mask,
                                                         node_1_mask,
                                                         input_dim,
                                                         options=options,
                                                         watch=self.watch)

                node_1_aware_representations.append(match_reps)
                node_1_aware_dim += match_dim

            # TODO: add one more MP matching over the graph representation
            # with tf.variable_scope('context_MP_matching'):
            #     for i in range(options['context_layer_num']):
            #         with tf.variable_scope('layer-{}',format(i)):

            # [batch_size, single_graph_nodes_size, node_2_aware_dim]
            node_2_aware_representations = tf.concat(
                axis=2, values=node_2_aware_representations)

            # [batch_size, single_graph_nodes_size, node_1_aware_dim]
            node_1_aware_representations = tf.concat(
                axis=2, values=node_1_aware_representations)

            # if self.mode == "train":
            #     node_2_aware_representations = tf.nn.dropout(node_2_aware_representations, (1 - options['dropout_rate']))
            #     node_1_aware_representations = tf.nn.dropout(node_1_aware_representations, (1 - options['dropout_rate']))

            # ========= Highway layer ==============
            if self.with_match_highway:
                with tf.variable_scope("left_matching_highway"):
                    node_2_aware_representations = multi_highway_layer(
                        node_2_aware_representations, node_2_aware_dim,
                        options['highway_layer_num'])
                with tf.variable_scope("right_matching_highway"):
                    node_1_aware_representations = multi_highway_layer(
                        node_1_aware_representations, node_1_aware_dim,
                        options['highway_layer_num'])

            self.watch["node_1_rep_match"] = node_2_aware_representations

            # ========= Aggregation Layer ==============
            aggregation_representation = []
            aggregation_dim = 0

            node_2_aware_aggregation_input = node_2_aware_representations
            node_1_aware_aggregation_input = node_1_aware_representations

            self.watch[
                "node_1_rep_match_layer"] = node_2_aware_aggregation_input

            with tf.variable_scope('aggregation_layer'):
                # TODO: now we only have 1 aggregation layer; need to change this part if support more aggregation layers
                # [batch_size, single_graph_nodes_size, node_2_aware_dim]
                node_2_aware_aggregation_input = tf.multiply(
                    node_2_aware_aggregation_input,
                    tf.expand_dims(node_1_mask, axis=-1))

                # [batch_size, single_graph_nodes_size, node_1_aware_dim]
                node_1_aware_aggregation_input = tf.multiply(
                    node_1_aware_aggregation_input,
                    tf.expand_dims(node_2_mask, axis=-1))

                if self.agg_sim_method == "GCN":
                    # [batch_size*single_graph_nodes_size, node_2_aware_dim]
                    node_2_aware_aggregation_input = tf.reshape(
                        node_2_aware_aggregation_input,
                        shape=[-1, node_2_aware_dim])

                    # [batch_size*single_graph_nodes_size, node_1_aware_dim]
                    node_1_aware_aggregation_input = tf.reshape(
                        node_1_aware_aggregation_input,
                        shape=[-1, node_1_aware_dim])

                    # [node_1_size, node_2_aware_dim]
                    node_1_rep = tf.concat([
                        tf.nn.embedding_lookup(node_2_aware_aggregation_input,
                                               node_1_looking_table),
                        tf.zeros([1, node_2_aware_dim])
                    ], 0)

                    # [node_2_size, node_1_aware_dim]
                    node_2_rep = tf.concat([
                        tf.nn.embedding_lookup(node_1_aware_aggregation_input,
                                               node_2_looking_table),
                        tf.zeros([1, node_1_aware_dim])
                    ], 0)

                    gcn_1_res = self.gcn_encode(
                        self.batch_nodes_first,
                        node_1_rep,
                        self.fw_adj_info_first,
                        self.bw_adj_info_first,
                        input_node_dim=node_2_aware_dim,
                        output_node_dim=self.aggregator_dim_second,
                        fw_aggregators=self.fw_aggregators_second,
                        bw_aggregators=self.bw_aggregators_second,
                        window_size=self.gcn_window_size_second,
                        layer_size=self.gcn_layer_size_second,
                        scope="second_gcn",
                        agg_type=self.agg_type_second,
                        sample_size_per_layer=self.sample_size_per_layer_first,
                        keep_inter_state=self.if_use_multiple_gcn_2_state)

                    max_graph_1_rep = gcn_1_res[1]
                    mean_graph_1_rep = gcn_1_res[2]
                    graph_1_rep_dim = gcn_1_res[3]

                    gcn_2_res = self.gcn_encode(
                        self.batch_nodes_second,
                        node_2_rep,
                        self.fw_adj_info_second,
                        self.bw_adj_info_second,
                        input_node_dim=node_1_aware_dim,
                        output_node_dim=self.aggregator_dim_second,
                        fw_aggregators=self.fw_aggregators_second,
                        bw_aggregators=self.bw_aggregators_second,
                        window_size=self.gcn_window_size_second,
                        layer_size=self.gcn_layer_size_second,
                        scope="second_gcn",
                        agg_type=self.agg_type_second,
                        sample_size_per_layer=self.
                        sample_size_per_layer_second,
                        keep_inter_state=self.if_use_multiple_gcn_2_state)

                    max_graph_2_rep = gcn_2_res[1]
                    mean_graph_2_rep = gcn_2_res[2]
                    graph_2_rep_dim = gcn_2_res[3]

                    assert graph_1_rep_dim == graph_2_rep_dim

                    if self.if_use_multiple_gcn_2_state:
                        graph_1_reps = gcn_1_res[5]
                        graph_2_reps = gcn_2_res[5]
                        inter_dims = gcn_1_res[6]
                        for idx in range(len(graph_1_reps)):
                            (max_graph_1_rep_tmp,
                             mean_graph_1_rep_tmp) = graph_1_reps[idx]
                            (max_graph_2_rep_tmp,
                             mean_graph_2_rep_tmp) = graph_2_reps[idx]
                            inter_dim = inter_dims[idx]
                            aggregation_representation.append(
                                max_graph_1_rep_tmp)
                            aggregation_representation.append(
                                mean_graph_1_rep_tmp)
                            aggregation_representation.append(
                                max_graph_2_rep_tmp)
                            aggregation_representation.append(
                                mean_graph_2_rep_tmp)
                            aggregation_dim += 4 * inter_dim

                    else:
                        aggregation_representation.append(max_graph_1_rep)
                        aggregation_representation.append(mean_graph_1_rep)
                        aggregation_representation.append(max_graph_2_rep)
                        aggregation_representation.append(mean_graph_2_rep)
                        aggregation_dim = 4 * graph_1_rep_dim

                    # aggregation_representation = tf.concat(aggregation_representation, axis=1)

                    gcn_2_window_size = int(
                        len(aggregation_representation) / 4)
                    aggregation_dim = aggregation_dim / gcn_2_window_size

                    w_0 = tf.get_variable(
                        "w_0", [aggregation_dim, aggregation_dim / 2],
                        dtype=tf.float32)
                    b_0 = tf.get_variable("b_0", [aggregation_dim / 2],
                                          dtype=tf.float32)
                    w_1 = tf.get_variable("w_1", [aggregation_dim / 2, 2],
                                          dtype=tf.float32)
                    b_1 = tf.get_variable("b_1", [2], dtype=tf.float32)

                    weights = tf.get_variable("gcn_2_window_weights",
                                              [gcn_2_window_size],
                                              dtype=tf.float32)

                    # shape: [gcn_2_window_size, batch_size, 2]
                    logits = []
                    for layer_idx in range(gcn_2_window_size):
                        max_graph_1_rep = aggregation_representation[
                            layer_idx * 4 + 0]
                        mean_graph_1_rep = aggregation_representation[
                            layer_idx * 4 + 1]
                        max_graph_2_rep = aggregation_representation[
                            layer_idx * 4 + 2]
                        mean_graph_2_rep = aggregation_representation[
                            layer_idx * 4 + 3]

                        aggregation_representation_single = tf.concat([
                            max_graph_1_rep, mean_graph_1_rep, max_graph_2_rep,
                            mean_graph_2_rep
                        ],
                                                                      axis=1)

                        # ====== Prediction Layer ===============
                        logit = tf.matmul(aggregation_representation_single,
                                          w_0) + b_0
                        logit = tf.tanh(logit)
                        logit = tf.matmul(logit, w_1) + b_1
                        logits.append(logit)

                    if len(logits) != 1:
                        logits = tf.reshape(tf.concat(logits, axis=0),
                                            [gcn_2_window_size, -1, 2])
                        logits = tf.transpose(logits, [1, 0, 2])
                        logits = tf.multiply(logits,
                                             tf.expand_dims(weights, axis=-1))
                        logits = tf.reduce_sum(logits, axis=1)
                    else:
                        logits = tf.reshape(logits, [-1, 2])

        # ====== Highway layer ============
        # if options['with_aggregation_highway']:

        with tf.name_scope("loss"):
            self.y_pred = tf.nn.softmax(logits)
            self.loss = tf.reduce_sum(
                tf.nn.softmax_cross_entropy_with_logits(
                    labels=self.y_true,
                    logits=logits, name="xentropy_loss")) / tf.cast(
                        self.batch_size, tf.float32)

        # ============  Training Objective ===========================
        if self.mode == "train" and not self.if_pred_on_dev:
            optimizer = tf.train.AdamOptimizer()
            params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, params)
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1)
            self.training_op = optimizer.apply_gradients(
                zip(clipped_gradients, params))
예제 #14
0
    def encode(self, is_training=True):
        """Build the passage encoder graph and the decoder's initial state.

        Pipeline, in graph-construction order:
          1. Word embeddings (plus ``positional_encoding``) and/or a
             char-level LSTM summary per token, concatenated on the last axis.
          2. Optional tanh projection to ``options.compress_input_dim``.
          3. Dropout on the concatenated input.
          4. ``options.num_blocks`` self-attention + feed-forward blocks.
          5. ``options.context_layer_num`` stacked bidirectional LSTM layers,
             whose per-layer outputs are concatenated.
          6. Optional highway layers, then masking of padded positions.
          7. A tanh projection of the last LSTM layer's final (c, h) states
             into the decoder's initial ``LSTMStateTuple``.

        Args:
            is_training: when True, dropout wrappers are added to the LSTM
                cells and dropout is applied to the context output; it is
                also forwarded to ``multihead_attention`` as ``training``.

        Returns:
            A tuple ``(passage_dim, all_passage_representation, init_state,
            memory)`` where ``passage_dim`` is the accumulated last-axis size
            of the concatenated LSTM outputs, ``init_state`` is the
            ``LSTMStateTuple`` built in step 7 and ``memory`` is the output
            of the last attention block.
        """
        options = self.options

        # ======word representation layer======
        # Per-feature representations; concatenated on axis 2 further down.
        in_passage_repres = []
        input_dim = 0
        if options.with_word and self.word_vocab is not None:
            # Trainable embeddings go on the GPU; frozen ones are pinned to CPU.
            word_vec_trainable = True
            cur_device = '/gpu:0'
            if options.fix_word_vec:
                word_vec_trainable = False
                cur_device = '/cpu:0'
            with tf.variable_scope("embedding"), tf.device(cur_device):
                self.word_embedding = tf.get_variable(
                    "word_embedding",
                    trainable=word_vec_trainable,
                    initializer=tf.constant(self.word_vocab.word_vecs),
                    dtype=tf.float32)

            in_passage_word_repres = tf.nn.embedding_lookup(
                self.word_embedding, self.in_passage_words)

            ## Position encoding
            # print('in_passage_word_repres: ', tf.shape(in_passage_word_repres))
            in_passage_word_repres += positional_encoding(
                in_passage_word_repres, options.max_answer_len)
            # print('in_passage_word_repres: ', tf.shape(in_passage_word_repres)[2])
            ## Position encoding

            # [batch_size, passage_len, word_dim]
            in_passage_repres.append(in_passage_word_repres)
            # print('in_passage_repres: ', tf.shape(in_passage_repres))

            # Dynamic (run-time) sizes, reused by the reshapes below.
            input_shape = tf.shape(self.in_passage_words)
            batch_size = input_shape[0]
            passage_len = input_shape[1]
            input_dim += self.word_vocab.word_dim

        if options.with_char and self.char_vocab is not None:
            input_shape = tf.shape(self.in_passage_chars)
            batch_size = input_shape[0]
            passage_len = input_shape[1]
            p_char_len = input_shape[2]
            char_dim = self.char_vocab.word_dim
            self.char_embedding = tf.get_variable(
                "char_embedding",
                initializer=tf.constant(self.char_vocab.word_vecs),
                dtype=tf.float32)
            in_passage_char_repres = tf.nn.embedding_lookup(
                self.char_embedding, self.in_passage_chars
            )  # [batch_size, passage_len, p_char_len, char_dim]
            # Flatten tokens into the batch axis so each token's characters
            # form one sequence for the char LSTM.
            in_passage_char_repres = tf.reshape(
                in_passage_char_repres, shape=[-1, p_char_len, char_dim])
            passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1])
            with tf.variable_scope('char_lstm'):
                # lstm cell
                char_lstm_cell = tf.contrib.rnn.BasicLSTMCell(
                    options.char_lstm_dim)
                # dropout
                if is_training:
                    char_lstm_cell = tf.contrib.rnn.DropoutWrapper(
                        char_lstm_cell,
                        output_keep_prob=(1 - options.dropout_rate))
                char_lstm_cell = tf.contrib.rnn.MultiRNNCell([char_lstm_cell])
                # passage representation
                passage_char_outputs = tf.nn.dynamic_rnn(
                    char_lstm_cell,
                    in_passage_char_repres,
                    sequence_length=passage_char_lengths,
                    dtype=tf.float32)[0]
                # [batch_size*passage_len, p_char_len, char_lstm_dim]
                # Select the output at index (length - 1) of every sequence;
                # presumably the last valid step -- confirm against
                # collect_final_step_lstm.
                passage_char_outputs = collect_final_step_lstm(
                    passage_char_outputs, passage_char_lengths - 1)
                passage_char_outputs = tf.reshape(
                    passage_char_outputs,
                    [batch_size, passage_len, options.char_lstm_dim])

            in_passage_repres.append(passage_char_outputs)
            input_dim += options.char_lstm_dim

        in_passage_repres = tf.concat(in_passage_repres,
                                      2)  # [batch_size, passage_len, dim]

        if options.compress_input:  # compress input word vector into smaller vectors
            w_compress = tf.get_variable(
                "w_compress_input", [input_dim, options.compress_input_dim],
                dtype=tf.float32)
            b_compress = tf.get_variable("b_compress_input",
                                         [options.compress_input_dim],
                                         dtype=tf.float32)

            # Flatten to 2-D for the matmul, then restore the 3-D layout.
            in_passage_repres = tf.reshape(in_passage_repres, [-1, input_dim])
            in_passage_repres = tf.matmul(in_passage_repres,
                                          w_compress) + b_compress
            in_passage_repres = tf.tanh(in_passage_repres)
            in_passage_repres = tf.reshape(
                in_passage_repres,
                [batch_size, passage_len, options.compress_input_dim])
            input_dim = options.compress_input_dim

        # NOTE(review): this dropout is applied regardless of is_training; the
        # conditional variant is left commented out below -- confirm that
        # dropout at inference time is intended.
        in_passage_repres = tf.nn.dropout(in_passage_repres,
                                          (1 - options.dropout_rate))
        # if is_training:
        #     in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate))
        # else:
        #     in_passage_repres = tf.multiply(in_passage_repres, (1 - options.dropout_rate))

        passage_mask = tf.sequence_mask(
            self.passage_lengths, passage_len,
            dtype=tf.float32)  # [batch_size, passage_len]

        ## Blocks
        # NOTE(review): in_passage_word_repres is only bound inside the
        # with_word branch above, so this loop needs word features enabled.
        # NOTE(review): every block reads the same queries/keys/values; `enc`
        # from the previous iteration is not fed into the next one, so only
        # the last block's output is used downstream -- confirm this is
        # intended rather than a stacked encoder.
        for i in range(options.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i),
                                   reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=in_passage_word_repres,
                                          keys=in_passage_repres,
                                          values=in_passage_repres,
                                          num_heads=options.num_heads,
                                          dropout_rate=options.dropout_rate,
                                          training=is_training,
                                          causality=False)
                # feed forward
                enc = ff(enc, num_units=[options.d_ff, options.d_model])
        ## Blocks
        # Output of the last attention block, returned to the caller as-is.
        memory = enc
        # sequential context matching
        passage_forward = None
        passage_backward = None
        all_passage_representation = []
        passage_dim = 0
        # Hard-coded switch: the biLSTM stack is always built.
        with_lstm = True
        if with_lstm:
            with tf.variable_scope('biLSTM'):
                # cur_in_passage_repres = in_passage_repres
                cur_in_passage_repres = enc
                # NOTE(review): xrange is a Python 2 builtin; presumably the
                # file targets Python 2 or imports it from a compat module.
                for i in xrange(options.context_layer_num):
                    with tf.variable_scope('layer-{}'.format(i)):
                        with tf.variable_scope('context_represent'):
                            # parameters
                            context_lstm_cell_fw = tf.contrib.rnn.LSTMCell(
                                options.context_lstm_dim)
                            context_lstm_cell_bw = tf.contrib.rnn.LSTMCell(
                                options.context_lstm_dim)
                            if is_training:
                                context_lstm_cell_fw = tf.contrib.rnn.DropoutWrapper(
                                    context_lstm_cell_fw,
                                    output_keep_prob=(1 -
                                                      options.dropout_rate))
                                context_lstm_cell_bw = tf.contrib.rnn.DropoutWrapper(
                                    context_lstm_cell_bw,
                                    output_keep_prob=(1 -
                                                      options.dropout_rate))

                            # passage representation
                            # passage_forward / passage_backward are overwritten
                            # on every layer, so after the loop they hold the
                            # final states of the last layer.
                            ((passage_context_representation_fw,
                              passage_context_representation_bw),
                             (passage_forward, passage_backward
                              )) = tf.nn.bidirectional_dynamic_rnn(
                                  context_lstm_cell_fw,
                                  context_lstm_cell_bw,
                                  cur_in_passage_repres,
                                  dtype=tf.float32,
                                  sequence_length=self.passage_lengths
                              )  # [batch_size, passage_len, context_lstm_dim]
                            # Choose which direction(s) feed the next layer and
                            # the concatenated output.
                            if options.direction == 'forward':
                                # [batch_size, passage_len, context_lstm_dim]
                                cur_in_passage_repres = passage_context_representation_fw
                                passage_dim += options.context_lstm_dim
                            elif options.direction == 'backward':
                                # [batch_size, passage_len, context_lstm_dim]
                                cur_in_passage_repres = passage_context_representation_bw
                                passage_dim += options.context_lstm_dim
                            elif options.direction == 'bidir':
                                # [batch_size, passage_len, 2*context_lstm_dim]
                                cur_in_passage_repres = tf.concat([
                                    passage_context_representation_fw,
                                    passage_context_representation_bw
                                ], 2)
                                passage_dim += 2 * options.context_lstm_dim
                            else:
                                assert False
                            all_passage_representation.append(
                                cur_in_passage_repres)

        all_passage_representation = tf.concat(
            all_passage_representation,
            2)  # [batch_size, passage_len, passage_dim]

        # NOTE(review): tf.nn.dropout already rescales kept units by
        # 1/keep_prob, so the eval-time multiplication by keep_prob in the
        # else branch may be unintended -- confirm before changing, since
        # trained checkpoints depend on it.
        if is_training:
            all_passage_representation = tf.nn.dropout(
                all_passage_representation, (1 - options.dropout_rate))
        else:
            all_passage_representation = tf.multiply(
                all_passage_representation, (1 - options.dropout_rate))

        # ======Highway layer======
        if options.with_match_highway:
            with tf.variable_scope("context_highway"):
                all_passage_representation = match_utils.multi_highway_layer(
                    all_passage_representation, passage_dim,
                    options.highway_layer_num)

        # Zero out positions beyond each passage's true length.
        all_passage_representation = all_passage_representation * tf.expand_dims(
            passage_mask, axis=-1)

        # initial state for the LSTM decoder
        #'''
        with tf.variable_scope('initial_state_for_decoder'):
            # Define weights and biases to reduce the cell and reduce the state
            # The input size is 2 * context_lstm_dim because the forward and
            # backward final states are concatenated below, independently of
            # options.direction.
            w_reduce_c = tf.get_variable(
                'w_reduce_c',
                [2 * options.context_lstm_dim, options.gen_hidden_size],
                dtype=tf.float32)
            w_reduce_h = tf.get_variable(
                'w_reduce_h',
                [2 * options.context_lstm_dim, options.gen_hidden_size],
                dtype=tf.float32)
            bias_reduce_c = tf.get_variable('bias_reduce_c',
                                            [options.gen_hidden_size],
                                            dtype=tf.float32)
            bias_reduce_h = tf.get_variable('bias_reduce_h',
                                            [options.gen_hidden_size],
                                            dtype=tf.float32)

            # NOTE(review): passage_forward / passage_backward stay None when
            # options.context_layer_num is 0, which would fail here.
            old_c = tf.concat(values=[passage_forward.c, passage_backward.c],
                              axis=1)
            old_h = tf.concat(values=[passage_forward.h, passage_backward.h],
                              axis=1)
            new_c = tf.nn.tanh(tf.matmul(old_c, w_reduce_c) + bias_reduce_c)
            new_h = tf.nn.tanh(tf.matmul(old_h, w_reduce_h) + bias_reduce_h)

            init_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h)
        '''
        new_c = tf.zeros([batch_size, options.gen_hidden_size])
        new_h = tf.zeros([batch_size, options.gen_hidden_size])
        init_state = LSTMStateTuple(new_c, new_h)
        '''
        return (passage_dim, all_passage_representation, init_state, memory)
예제 #15
0
    def __init__(self, num_classes, word_vocab=None, char_vocab=None, POS_vocab=None, NER_vocab=None,
                 dropout_rate=0.5, learning_rate=0.001, optimize_type='adam',lambda_l2=1e-5,
                 with_word=True, with_char=True, with_POS=True, with_NER=True,
                 char_lstm_dim=20, context_lstm_dim=100, aggregation_lstm_dim=200, is_training=True,filter_layer_threshold=0.2,
                 MP_dim=50, context_layer_num=1,aggregation_layer_num=1, fix_word_vec=False,with_filter_layer=True, with_highway=False,
                 with_lex_features=False,lex_dim=100,word_level_MP_dim=-1,sep_endpoint=False,end_model_combine=False,with_match_highway=False,
                 with_aggregation_highway=False,highway_layer_num=1,with_lex_decomposition=False, lex_decompsition_dim=-1,
                 with_left_match=True, with_right_match=True,
                 with_full_match=True, with_maxpool_match=True, with_attentive_match=True, with_max_attentive_match=True):
        """Build the question/passage matching classifier graph.

        The constructor creates the input placeholders, embeds both sentences
        (word / POS / NER embeddings and a char-LSTM summary, each optional),
        runs ``match_utils.bilateral_match_func2`` on them, adds a two-layer
        prediction head and finally builds the training op.

        NOTE: the body uses the pre-1.0 TensorFlow call conventions
        (``tf.concat(dim, values)``, ``tf.mul``, positional arguments to
        ``softmax_cross_entropy_with_logits``); these are kept unchanged.

        Args:
            num_classes: number of output classes.
            word_vocab, char_vocab, POS_vocab, NER_vocab: vocabulary objects
                exposing ``word_vecs`` (initial embedding matrix) and
                ``word_dim``; a feature is skipped when its vocab is None.
            dropout_rate: dropout rate; the keep probability is
                ``1 - dropout_rate``.
            learning_rate: (initial) learning rate of the optimizer.
            optimize_type: one of ``'adadelta'``, ``'sgd'``, ``'ema'`` or
                ``'adam'``.
            lambda_l2: weight of the L2 penalty (``'adadelta'`` and ``'adam'``
                branches only).
            with_word, with_char, with_POS, with_NER: enable the matching
                input feature.
            char_lstm_dim: size of the character LSTM.
            is_training: selects training-time dropout vs. the eval-time
                rescaling used by this model.
            fix_word_vec: when True the word embedding is non-trainable and
                placed on the CPU.
            with_highway, highway_layer_num: apply ``highway_layer_num``
                highway layers to the inputs (``highway_layer_num`` is also
                forwarded to the matching function).
            MP_dim, with_filter_layer, context_layer_num, context_lstm_dim,
            with_match_highway, aggregation_layer_num, aggregation_lstm_dim,
            with_aggregation_highway, with_lex_decomposition,
            lex_decompsition_dim, with_full_match, with_maxpool_match,
            with_attentive_match, with_max_attentive_match, with_left_match,
            with_right_match: forwarded unchanged to
                ``match_utils.bilateral_match_func2``.
            filter_layer_threshold, with_lex_features, lex_dim,
            word_level_MP_dim, sep_endpoint, end_model_combine: accepted for
                interface compatibility; not read by this constructor.

        Raises:
            ValueError: if ``optimize_type`` is not one of the supported
                values.

        Attributes set: the input placeholders (``question_lengths``,
        ``passage_lengths``, ``truth`` and the per-feature ``in_*``
        placeholders), the embedding variables, ``prob``, ``loss``,
        ``eval_correct``, ``predictions`` and ``train_op``.
        """

        # ======word representation layer======
        # Per-feature representations; concatenated on axis 2 further down.
        in_question_repres = []
        in_passage_repres = []
        self.question_lengths = tf.placeholder(tf.int32, [None])
        self.passage_lengths = tf.placeholder(tf.int32, [None])
        self.truth = tf.placeholder(tf.int32, [None]) # [batch_size]
        input_dim = 0
        if with_word and word_vocab is not None:
            self.in_question_words = tf.placeholder(tf.int32, [None, None]) # [batch_size, question_len]
            self.in_passage_words = tf.placeholder(tf.int32, [None, None]) # [batch_size, passage_len]
            # Trainable embeddings go on the GPU; frozen ones are pinned to CPU.
            word_vec_trainable = True
            cur_device = '/gpu:0'
            if fix_word_vec:
                word_vec_trainable = False
                cur_device = '/cpu:0'
            with tf.device(cur_device):
                self.word_embedding = tf.get_variable("word_embedding", trainable=word_vec_trainable,
                                                  initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32)

            in_question_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_question_words) # [batch_size, question_len, word_dim]
            in_passage_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_passage_words) # [batch_size, passage_len, word_dim]
            in_question_repres.append(in_question_word_repres)
            in_passage_repres.append(in_passage_word_repres)

            # Dynamic (run-time) sizes, reused by the masks and reshapes below.
            input_shape = tf.shape(self.in_question_words)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_words)
            passage_len = input_shape[1]
            input_dim += word_vocab.word_dim

        if with_POS and POS_vocab is not None:
            self.in_question_POSs = tf.placeholder(tf.int32, [None, None]) # [batch_size, question_len]
            self.in_passage_POSs = tf.placeholder(tf.int32, [None, None]) # [batch_size, passage_len]
            self.POS_embedding = tf.get_variable("POS_embedding", initializer=tf.constant(POS_vocab.word_vecs), dtype=tf.float32)

            in_question_POS_repres = tf.nn.embedding_lookup(self.POS_embedding, self.in_question_POSs) # [batch_size, question_len, POS_dim]
            in_passage_POS_repres = tf.nn.embedding_lookup(self.POS_embedding, self.in_passage_POSs) # [batch_size, passage_len, POS_dim]
            in_question_repres.append(in_question_POS_repres)
            in_passage_repres.append(in_passage_POS_repres)

            input_shape = tf.shape(self.in_question_POSs)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_POSs)
            passage_len = input_shape[1]
            input_dim += POS_vocab.word_dim

        if with_NER and NER_vocab is not None:
            self.in_question_NERs = tf.placeholder(tf.int32, [None, None]) # [batch_size, question_len]
            self.in_passage_NERs = tf.placeholder(tf.int32, [None, None]) # [batch_size, passage_len]
            self.NER_embedding = tf.get_variable("NER_embedding", initializer=tf.constant(NER_vocab.word_vecs), dtype=tf.float32)

            in_question_NER_repres = tf.nn.embedding_lookup(self.NER_embedding, self.in_question_NERs) # [batch_size, question_len, NER_dim]
            in_passage_NER_repres = tf.nn.embedding_lookup(self.NER_embedding, self.in_passage_NERs) # [batch_size, passage_len, NER_dim]
            in_question_repres.append(in_question_NER_repres)
            in_passage_repres.append(in_passage_NER_repres)

            input_shape = tf.shape(self.in_question_NERs)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_NERs)
            passage_len = input_shape[1]
            input_dim += NER_vocab.word_dim

        if with_char and char_vocab is not None:
            self.question_char_lengths = tf.placeholder(tf.int32, [None,None]) # [batch_size, question_len]
            self.passage_char_lengths = tf.placeholder(tf.int32, [None,None]) # [batch_size, passage_len]
            self.in_question_chars = tf.placeholder(tf.int32, [None, None, None]) # [batch_size, question_len, q_char_len]
            self.in_passage_chars = tf.placeholder(tf.int32, [None, None, None]) # [batch_size, passage_len, p_char_len]
            input_shape = tf.shape(self.in_question_chars)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            q_char_len = input_shape[2]
            input_shape = tf.shape(self.in_passage_chars)
            passage_len = input_shape[1]
            p_char_len = input_shape[2]
            char_dim = char_vocab.word_dim
            self.char_embedding = tf.get_variable("char_embedding", initializer=tf.constant(char_vocab.word_vecs), dtype=tf.float32)

            # Flatten tokens into the batch axis so each token's characters
            # form one sequence for the char LSTM.
            in_question_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_question_chars) # [batch_size, question_len, q_char_len, char_dim]
            in_question_char_repres = tf.reshape(in_question_char_repres, shape=[-1, q_char_len, char_dim])
            question_char_lengths = tf.reshape(self.question_char_lengths, [-1])
            in_passage_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_passage_chars) # [batch_size, passage_len, p_char_len, char_dim]
            in_passage_char_repres = tf.reshape(in_passage_char_repres, shape=[-1, p_char_len, char_dim])
            passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1])
            with tf.variable_scope('char_lstm'):
                # lstm cell
                char_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(char_lstm_dim)
                # dropout
                if is_training: char_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(char_lstm_cell, output_keep_prob=(1 - dropout_rate))
                char_lstm_cell = tf.nn.rnn_cell.MultiRNNCell([char_lstm_cell])

                # question_representation
                question_char_outputs = my_rnn.dynamic_rnn(char_lstm_cell, in_question_char_repres,
                        sequence_length=question_char_lengths,dtype=tf.float32)[0] # [batch_size*question_len, q_char_len, char_lstm_dim]
                # NOTE(review): this takes the output at the last padded time
                # step, not at each sequence's own length; presumably
                # my_rnn.dynamic_rnn carries the final output forward -- confirm.
                question_char_outputs = question_char_outputs[:,-1,:]
                question_char_outputs = tf.reshape(question_char_outputs, [batch_size, question_len, char_lstm_dim])

                # Share the char-LSTM weights between question and passage.
                tf.get_variable_scope().reuse_variables()
                # passage representation
                passage_char_outputs = my_rnn.dynamic_rnn(char_lstm_cell, in_passage_char_repres,
                        sequence_length=passage_char_lengths,dtype=tf.float32)[0] # [batch_size*passage_len, p_char_len, char_lstm_dim]
                passage_char_outputs = passage_char_outputs[:,-1,:]
                passage_char_outputs = tf.reshape(passage_char_outputs, [batch_size, passage_len, char_lstm_dim])

            in_question_repres.append(question_char_outputs)
            in_passage_repres.append(passage_char_outputs)

            input_dim += char_lstm_dim

        in_question_repres = tf.concat(2, in_question_repres) # [batch_size, question_len, dim]
        in_passage_repres = tf.concat(2, in_passage_repres) # [batch_size, passage_len, dim]

        # NOTE(review): the eval branch rescales by the keep probability even
        # though tf.nn.dropout already rescales kept units at training time;
        # left unchanged because trained checkpoints depend on it -- confirm.
        if is_training:
            in_question_repres = tf.nn.dropout(in_question_repres, (1 - dropout_rate))
            in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - dropout_rate))
        else:
            in_question_repres = tf.mul(in_question_repres, (1 - dropout_rate))
            in_passage_repres = tf.mul(in_passage_repres, (1 - dropout_rate))

        mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32) # [batch_size, passage_len]
        question_mask = tf.sequence_mask(self.question_lengths, question_len, dtype=tf.float32) # [batch_size, question_len]

        # ======Highway layer======
        if with_highway:
            with tf.variable_scope("input_highway"):
                in_question_repres = match_utils.multi_highway_layer(in_question_repres, input_dim, highway_layer_num)
                # Question and passage share the same highway weights.
                tf.get_variable_scope().reuse_variables()
                in_passage_repres = match_utils.multi_highway_layer(in_passage_repres, input_dim, highway_layer_num)

        # ========Bilateral Matching=====
        (match_representation, match_dim) = match_utils.bilateral_match_func2(in_question_repres, in_passage_repres,
                        self.question_lengths, self.passage_lengths, question_mask, mask, MP_dim, input_dim,
                        with_filter_layer, context_layer_num, context_lstm_dim,is_training,dropout_rate,
                        with_match_highway,aggregation_layer_num, aggregation_lstm_dim,highway_layer_num,
                        with_aggregation_highway,with_lex_decomposition,lex_decompsition_dim,
                        with_full_match, with_maxpool_match, with_attentive_match, with_max_attentive_match,
                        with_left_match, with_right_match)

        #========Prediction Layer=========
        # Two-layer head: match_dim -> match_dim/2 (tanh) -> num_classes.
        w_0 = tf.get_variable("w_0", [match_dim, match_dim/2], dtype=tf.float32)
        b_0 = tf.get_variable("b_0", [match_dim/2], dtype=tf.float32)
        w_1 = tf.get_variable("w_1", [match_dim/2, num_classes],dtype=tf.float32)
        b_1 = tf.get_variable("b_1", [num_classes],dtype=tf.float32)

        logits = tf.matmul(match_representation, w_0) + b_0
        logits = tf.tanh(logits)
        if is_training:
            logits = tf.nn.dropout(logits, (1 - dropout_rate))
        else:
            logits = tf.mul(logits, (1 - dropout_rate))
        logits = tf.matmul(logits, w_1) + b_1

        self.prob = tf.nn.softmax(logits)

        # Cross-entropy against one-hot gold labels, averaged over the batch.
        gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
        self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, gold_matrix))

        correct = tf.nn.in_top_k(logits, self.truth, 1)
        self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
        self.predictions = tf.arg_max(self.prob, 1)

        if optimize_type == 'adadelta':
            clipper = 50
            optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate)
            tvars = tf.trainable_variables()
            # L2 penalty on weight matrices only (rank > 1), not on biases.
            l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            self.loss = self.loss + lambda_l2 * l2_loss
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper)
            self.train_op = optimizer.apply_gradients(zip(grads, tvars))
        elif optimize_type == 'sgd':
            self.global_step = tf.Variable(0, name='global_step', trainable=False) # Create a variable to track the global step.
            min_lr = 0.000001
            self._lr_rate = tf.maximum(min_lr, tf.train.exponential_decay(learning_rate, self.global_step, 30000, 0.98))
            # global_step must be handed to minimize() so that it is
            # incremented on every update; otherwise it stays at 0 and the
            # exponential decay above never takes effect.
            self.train_op = tf.train.GradientDescentOptimizer(learning_rate=self._lr_rate).minimize(
                self.loss, global_step=self.global_step)
        elif optimize_type == 'ema':
            tvars = tf.trainable_variables()
            train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)
            # Create an ExponentialMovingAverage object
            ema = tf.train.ExponentialMovingAverage(decay=0.9999)
            # Create the shadow variables and the op that maintains the moving
            # averages of all trainable variables.
            maintain_averages_op = ema.apply(tvars)
            # Create an op that will update the moving averages after each training
            # step.  This is what we will use in place of the usual training op.
            with tf.control_dependencies([train_op]):
                self.train_op = tf.group(maintain_averages_op)
        elif optimize_type == 'adam':
            clipper = 50
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            tvars = tf.trainable_variables()
            # L2 penalty on weight matrices only (rank > 1), not on biases.
            l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            self.loss = self.loss + lambda_l2 * l2_loss
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper)
            self.train_op = optimizer.apply_gradients(zip(grads, tvars))
        else:
            # Fail with a clear message instead of an AttributeError on the
            # missing self.train_op a few lines below.
            raise ValueError("Unknown optimize_type: {!r}; expected one of "
                             "'adadelta', 'sgd', 'ema', 'adam'".format(optimize_type))

        extra_train_ops = []
        train_ops = [self.train_op] + extra_train_ops
        self.train_op = tf.group(*train_ops)
예제 #16
0
    def __init__(self,
                 num_classes,
                 word_vocab=None,
                 dropout_rate=0.5,
                 learning_rate=0.001,
                 optimize_type='adam',
                 lambda_l2=1e-5,
                 with_word=True,
                 context_lstm_dim=100,
                 aggregation_lstm_dim=200,
                 is_training=True,
                 MP_dim=50,
                 context_layer_num=1,
                 aggregation_layer_num=1,
                 fix_word_vec=True,
                 with_filter_layer=True,
                 with_highway=True,
                 with_match_highway=False,
                 with_aggregation_highway=False,
                 highway_layer_num=1,
                 with_lex_decomposition=False,
                 lex_decompsition_dim=-1,
                 with_left_match=True,
                 with_right_match=True,
                 with_full_match=True,
                 with_maxpool_match=True,
                 with_attentive_match=True,
                 with_max_attentive_match=True):
        """Build the TF1 graph of a question/passage matching classifier.

        The graph embeds both word-id sequences with a shared embedding
        table, optionally runs them through a shared highway stack, feeds
        them to ``match_utils.bilateral_match_func2`` and classifies the
        resulting match vector with a two-layer (tanh + linear) head.

        Args:
            num_classes: Number of output classes of the softmax head.
            word_vocab: Vocabulary object; ``word_vocab.word_vecs`` initialises
                the embedding table. Required together with ``with_word``.
            dropout_rate: Drop probability; ``1 - dropout_rate`` is used as
                the keep probability.
            learning_rate: Base learning rate handed to the optimizer.
            optimize_type: One of ``'adadelta'``, ``'sgd'``, ``'ema'``,
                ``'adam'``.
            lambda_l2: Weight of the L2 penalty on trainable variables of
                rank > 1 (applied for ``'adadelta'`` and ``'adam'`` only).
            with_word: Whether to use word embeddings as input.
            is_training: Selects the ``"Train"``/``"Test"`` name scope and
                whether real dropout is applied.
            fix_word_vec: If true the embedding table is non-trainable and
                pinned to ``/cpu:0``; otherwise it is trainable on ``/gpu:0``.
            with_highway: Whether to apply the input highway layers.
            highway_layer_num: Number of highway layers.
            context_lstm_dim, aggregation_lstm_dim, MP_dim, context_layer_num,
            aggregation_layer_num, with_filter_layer, with_match_highway,
            with_aggregation_highway, with_lex_decomposition,
            lex_decompsition_dim, with_left_match, with_right_match,
            with_full_match, with_maxpool_match, with_attentive_match,
            with_max_attentive_match: Forwarded unchanged to
                ``match_utils.bilateral_match_func2``.

        Raises:
            ValueError: If no input representation is enabled, or if
                ``optimize_type`` is not one of the supported values.
        """
        keep_prob = 1 - dropout_rate

        with tf.name_scope("Train" if is_training else "Test"):
            # ======word representation layer======
            in_question_repres = []
            in_passage_repres = []

            self.question_lengths = tf.placeholder(tf.int32, [None],
                                                   name="question_lengths")  # [batch_size]
            self.passage_lengths = tf.placeholder(tf.int32, [None],
                                                  name="passage_lengths")  # [batch_size]
            self.truth = tf.placeholder(tf.int32, [None], name="truth")  # [batch_size]

            print("self.truth.name: ", self.truth.name)

            # Word embeddings are the only input representation this model
            # supports; without them the sequence lengths used for masking
            # would be undefined, so fail early with a clear message.
            if not with_word or word_vocab is None:
                raise ValueError(
                    "with_word must be true and word_vocab must be provided")

            self.in_question_words = tf.placeholder(tf.int32, [None, None],
                                                    name="question_words")  # [batch_size, question_len]
            self.in_passage_words = tf.placeholder(tf.int32, [None, None],
                                                   name="passage_words")  # [batch_size, passage_len]
            print("self.in_passage_words.name: ", self.in_passage_words.name)

            # A frozen table lives on the CPU, a trainable one on the GPU.
            word_vec_trainable = not fix_word_vec
            cur_device = '/cpu:0' if fix_word_vec else '/gpu:0'
            with tf.device(cur_device):
                word_initial = tf.constant(word_vocab.word_vecs)
                self.word_embedding = tf.get_variable("word_embedding",
                                                      trainable=word_vec_trainable,
                                                      initializer=word_initial,
                                                      dtype=tf.float32)

            # [batch_size, question_len, word_dim]
            in_question_repres.append(
                tf.nn.embedding_lookup(self.word_embedding, self.in_question_words))
            # [batch_size, passage_len, word_dim]
            in_passage_repres.append(
                tf.nn.embedding_lookup(self.word_embedding, self.in_passage_words))

            # Dynamic sequence lengths of the current batch.
            question_len = tf.shape(self.in_question_words)[1]
            passage_len = tf.shape(self.in_passage_words)[1]  # [batch_size, passage_len]
            input_dim = len(word_vocab.word_vecs[0])
            print("input_dim:", input_dim)

            self.in_ques = in_question_repres
            # [batch_size, question_len, dim]
            self.in_question_repres = in_question_repres = tf.concat(in_question_repres, 2)
            # [batch_size, passage_len, dim]
            in_passage_repres = tf.concat(in_passage_repres, 2)

            # NOTE(review): tf.nn.dropout already rescales the kept units by
            # 1/keep_prob, so the eval-time multiplication looks like a double
            # compensation. It is left unchanged to stay compatible with
            # models trained against this graph -- confirm before changing.
            if is_training:
                in_question_repres = tf.nn.dropout(in_question_repres, keep_prob)
                in_passage_repres = tf.nn.dropout(in_passage_repres, keep_prob)
            else:
                in_question_repres = tf.multiply(in_question_repres, keep_prob)
                in_passage_repres = tf.multiply(in_passage_repres, keep_prob)

            # Masks are 1.0 for real tokens and 0.0 for padding up to the
            # batch maximum length, e.g.
            #   [[1. 1. 1. 1. 1. 1. 0. 0.]
            #    [1. 1. 1. 1. 1. 1. 0. 0.]
            #    [1. 1. 1. 1. 1. 1. 0. 0.]]
            mask = tf.sequence_mask(self.passage_lengths, passage_len,
                                    dtype=tf.float32)  # [batch_size, passage_len]
            question_mask = tf.sequence_mask(self.question_lengths, question_len,
                                             dtype=tf.float32)  # [batch_size, question_len]

            # ======Highway layer======
            if with_highway:
                with tf.variable_scope("input_highway"):
                    in_question_repres = match_utils.multi_highway_layer(
                        in_question_repres, input_dim, highway_layer_num)
                    # The passage reuses the highway weights of the question.
                    tf.get_variable_scope().reuse_variables()
                    in_passage_repres = match_utils.multi_highway_layer(
                        in_passage_repres, input_dim, highway_layer_num)

            # ========Bilateral Matching=====
            (match_representation, match_dim) = match_utils.bilateral_match_func2(
                in_question_repres, in_passage_repres,
                self.question_lengths, self.passage_lengths,
                question_mask, mask, MP_dim, input_dim,
                with_filter_layer, context_layer_num,
                context_lstm_dim, is_training, dropout_rate,
                with_match_highway,
                aggregation_layer_num, aggregation_lstm_dim,
                highway_layer_num,
                with_aggregation_highway,
                with_lex_decomposition, lex_decompsition_dim,
                with_full_match, with_maxpool_match,
                with_attentive_match, with_max_attentive_match,
                with_left_match, with_right_match)

            # ========Prediction Layer=========
            # Floor division keeps the shape an integer on Python 3 as well.
            hidden_dim = match_dim // 2
            w_0 = tf.get_variable("w_0", [match_dim, hidden_dim], dtype=tf.float32)
            b_0 = tf.get_variable("b_0", [hidden_dim], dtype=tf.float32)

            w_1 = tf.get_variable("w_1", [hidden_dim, num_classes], dtype=tf.float32)
            b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

            logits = tf.tanh(tf.matmul(match_representation, w_0) + b_0)
            if is_training:
                logits = tf.nn.dropout(logits, keep_prob)
            else:
                logits = tf.multiply(logits, keep_prob)

            logits = tf.matmul(logits, w_1) + b_1
            self.prob = tf.nn.softmax(logits, name='prob')
            print("prob: ", self.prob.name)

            gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
            self.loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix))

            correct = tf.nn.in_top_k(logits, self.truth, 1)
            self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
            self.predictions = tf.argmax(self.prob, 1)

            # ========Training op=========
            # Optimizers that share the "L2 penalty + global-norm clipping" recipe.
            clipped_optimizers = {
                'adadelta': tf.train.AdadeltaOptimizer,
                'adam': tf.train.AdamOptimizer,
            }
            if optimize_type in clipped_optimizers:
                clipper = 50
                optimizer = clipped_optimizers[optimize_type](learning_rate=learning_rate)
                tvars = tf.trainable_variables()
                # Only matrices (rank > 1) are regularised; biases are skipped.
                l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
                self.loss = self.loss + lambda_l2 * l2_loss
                grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper)
                self.train_op = optimizer.apply_gradients(zip(grads, tvars))
            elif optimize_type == 'sgd':
                # Variable tracking the global step; it drives the LR decay.
                self.global_step = tf.Variable(0, name='global_step', trainable=False)
                min_lr = 0.000001
                self._lr_rate = tf.maximum(
                    min_lr,
                    tf.train.exponential_decay(learning_rate, self.global_step, 30000, 0.98))
                # global_step must be handed to minimize(), otherwise it is
                # never incremented and the decay schedule stays frozen.
                self.train_op = tf.train.GradientDescentOptimizer(
                    learning_rate=self._lr_rate).minimize(self.loss, global_step=self.global_step)
            elif optimize_type == 'ema':
                tvars = tf.trainable_variables()
                train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)
                # Shadow variables holding the moving averages of tvars.
                ema = tf.train.ExponentialMovingAverage(decay=0.9999)
                maintain_averages_op = ema.apply(tvars)
                # Update the moving averages after each training step; this
                # grouped op replaces the plain training op.
                with tf.control_dependencies([train_op]):
                    self.train_op = tf.group(maintain_averages_op)
            else:
                raise ValueError("Unsupported optimize_type: %r" % (optimize_type,))

            self.train_op = tf.group(self.train_op)
예제 #17
0
    def __init__(self,
                 word_vocab=None,
                 edge_label_vocab=None,
                 char_vocab=None,
                 is_training=True,
                 options=None):
        """Build the TF1 graph of a gated graph encoder over passage nodes.

        Each node is embedded (word vector, optionally concatenated with a
        character-LSTM state), optionally compressed and passed through
        highway layers. Incoming and outgoing neighbours (node + edge-label
        embeddings) are summed per node, and an LSTM-style gated update is
        unrolled ``options.num_syntax_match_layer`` times.

        Args:
            word_vocab: Vocabulary providing ``word_vecs`` and ``word_dim``
                for node words.
            edge_label_vocab: Vocabulary providing ``word_vecs`` and
                ``word_dim`` for edge labels.
            char_vocab: Character vocabulary; only read when
                ``options.with_char`` is set.
            is_training: When true, dropout is applied to node inputs.
            options: Configuration object; the attributes read here are
                ``with_char``, ``char_lstm_dim``, ``fix_word_vec``,
                ``compress_input``, ``compress_input_dim``, ``dropout_rate``,
                ``with_highway``, ``highway_layer_num``,
                ``neighbor_vector_dim`` and ``num_syntax_match_layer``.

        Attributes set:
            graph_representations: List with the node hidden states after
                every update step.
            node_representations: Input node representations.
            graph_hiddens / graph_cells: Final hidden and cell states.
            input_dim, batch_size: Node input width and dynamic batch size.
        """
        assert options is not None

        # ---- placeholders ----
        self.passage_nodes_size = tf.placeholder(tf.int32, [None])  # [batch_size]
        # [batch_size, passage_nodes_size_max]
        self.passage_nodes = tf.placeholder(tf.int32, [None, None])
        if options.with_char:
            self.passage_nodes_chars_size = tf.placeholder(tf.int32, [None, None])
            self.passage_nodes_chars = tf.placeholder(tf.int32, [None, None, None])

        # [batch_size, passage_nodes_size_max, passage_neighbors_size_max]
        self.passage_in_neighbor_indices = tf.placeholder(tf.int32, [None, None, None])
        self.passage_in_neighbor_edges = tf.placeholder(tf.int32, [None, None, None])
        self.passage_in_neighbor_mask = tf.placeholder(tf.float32, [None, None, None])

        # [batch_size, passage_nodes_size_max, passage_neighbors_size_max]
        self.passage_out_neighbor_indices = tf.placeholder(tf.int32, [None, None, None])
        self.passage_out_neighbor_edges = tf.placeholder(tf.int32, [None, None, None])
        self.passage_out_neighbor_mask = tf.placeholder(tf.float32, [None, None, None])

        # ---- dynamic shapes ----
        input_shape = tf.shape(self.passage_in_neighbor_indices)
        batch_size = input_shape[0]
        passage_nodes_size_max = input_shape[1]
        if options.with_char:
            passage_nodes_chars_size_max = tf.shape(self.passage_nodes_chars)[2]

        # ---- masks ----
        # [batch_size, passage_nodes_size_max]
        self.passage_nodes_mask = tf.sequence_mask(self.passage_nodes_size,
                                                   passage_nodes_size_max,
                                                   dtype=tf.float32)
        # Broadcastable over the feature axis; reused by every masking step.
        node_mask = tf.expand_dims(self.passage_nodes_mask, axis=-1)

        # ---- embeddings ----
        # A frozen word table lives on the CPU, a trainable one on the GPU.
        word_vec_trainable = not options.fix_word_vec
        cur_device = '/cpu:0' if options.fix_word_vec else '/gpu:0'
        with tf.device(cur_device):
            self.word_embedding = tf.get_variable(
                "word_embedding",
                trainable=word_vec_trainable,
                initializer=tf.constant(word_vocab.word_vecs),
                dtype=tf.float32)

        self.edge_embedding = tf.get_variable(
            "edge_embedding",
            initializer=tf.constant(edge_label_vocab.word_vecs),
            dtype=tf.float32)

        word_dim = word_vocab.word_dim
        edge_dim = edge_label_vocab.word_dim

        if options.with_char:
            self.char_embedding = tf.get_variable(
                "char_embedding",
                initializer=tf.constant(char_vocab.word_vecs),
                dtype=tf.float32)
            char_dim = char_vocab.word_dim

        # Word representation for nodes, where each node only includes one word.
        # [batch_size, passage_nodes_size_max, word_dim]
        passage_node_representation = tf.nn.embedding_lookup(
            self.word_embedding, self.passage_nodes)
        input_dim = word_dim

        if options.with_char:
            # [batch_size, passage_nodes_size_max, passage_nodes_chars_size_max, char_dim]
            chars_representation = tf.nn.embedding_lookup(
                self.char_embedding, self.passage_nodes_chars)
            # Fold batch and node axes so every node is one LSTM sequence.
            chars_representation = tf.reshape(
                chars_representation,
                shape=[
                    batch_size * passage_nodes_size_max,
                    passage_nodes_chars_size_max, char_dim
                ])
            passage_nodes_chars_size = tf.reshape(
                self.passage_nodes_chars_size,
                [batch_size * passage_nodes_size_max])
            with tf.variable_scope('node_char_lstm'):
                node_char_lstm_cell = tf.contrib.rnn.LSTMCell(options.char_lstm_dim)
                node_char_lstm_cell = tf.contrib.rnn.MultiRNNCell([node_char_lstm_cell])
                # [batch_size*node_num, char_num, char_lstm_dim]
                node_char_outputs = tf.nn.dynamic_rnn(
                    node_char_lstm_cell,
                    chars_representation,
                    sequence_length=passage_nodes_chars_size,
                    dtype=tf.float32)[0]
                # Pick the output at the last real character of each node.
                node_char_outputs = collect_final_step_lstm(
                    node_char_outputs, passage_nodes_chars_size - 1)
                # [batch_size, node_num, char_lstm_dim]
                node_char_outputs = tf.reshape(
                    node_char_outputs,
                    [batch_size, passage_nodes_size_max, options.char_lstm_dim])

            input_dim = word_dim + options.char_lstm_dim
            passage_node_representation = tf.concat(
                [passage_node_representation, node_char_outputs], 2)

        # Zero out padded nodes.
        passage_node_representation = passage_node_representation * node_mask

        if options.compress_input:  # compress input word vector into smaller vectors
            w_compress = tf.get_variable(
                "w_compress_input", [input_dim, options.compress_input_dim],
                dtype=tf.float32)
            b_compress = tf.get_variable(
                "b_compress_input", [options.compress_input_dim],
                dtype=tf.float32)

            flat_nodes = tf.reshape(passage_node_representation, [-1, input_dim])
            flat_nodes = tf.tanh(tf.matmul(flat_nodes, w_compress) + b_compress)
            passage_node_representation = tf.reshape(
                flat_nodes,
                [batch_size, passage_nodes_size_max, options.compress_input_dim])
            input_dim = options.compress_input_dim

        if is_training:
            passage_node_representation = tf.nn.dropout(
                passage_node_representation, (1 - options.dropout_rate))

        # ======Highway layer======
        if options.with_highway:
            with tf.variable_scope("input_highway"):
                passage_node_representation = match_utils.multi_highway_layer(
                    passage_node_representation, input_dim,
                    options.highway_layer_num)

        self.input_dim = input_dim

        with tf.variable_scope('graph_encoder'):
            grn_hidden_dim = options.neighbor_vector_dim
            neighbor_dim = input_dim + edge_dim
            state_shape = [batch_size, passage_nodes_size_max, grn_hidden_dim]

            # Neighbour masks broadcastable over the feature axis.
            in_neighbor_mask = tf.expand_dims(self.passage_in_neighbor_mask, axis=-1)
            out_neighbor_mask = tf.expand_dims(self.passage_out_neighbor_mask, axis=-1)

            def sum_neighbor_inputs(indices, edges, neighbor_mask):
                # Masked sum of [neighbour node ; edge label] vectors per node.
                # Result: [batch_size, passage_len, node_dim + edge_dim]
                edge_repr = tf.nn.embedding_lookup(self.edge_embedding, edges)
                node_repr = collect_neighbor_node_representations(
                    passage_node_representation, indices)
                combined = tf.concat([node_repr, edge_repr], 3)
                combined = tf.multiply(combined, neighbor_mask)
                return tf.reduce_sum(combined, axis=2)

            passage_in_neighbor_representations = sum_neighbor_inputs(
                self.passage_in_neighbor_indices,
                self.passage_in_neighbor_edges, in_neighbor_mask)
            passage_out_neighbor_representations = sum_neighbor_inputs(
                self.passage_out_neighbor_indices,
                self.passage_out_neighbor_edges, out_neighbor_mask)

            # =====project neighbor representations to the hidden size
            w_trans = tf.get_variable("w_trans", [neighbor_dim, grn_hidden_dim],
                                      dtype=tf.float32)
            b_trans = tf.get_variable("b_trans", [grn_hidden_dim],
                                      dtype=tf.float32)

            def project_neighbors(summed):
                # Flatten to 2-D, then apply the shared tanh projection.
                flat = tf.reshape(summed, [-1, neighbor_dim])
                return tf.tanh(tf.matmul(flat, w_trans) + b_trans)

            passage_in_neighbor_representations = project_neighbors(
                passage_in_neighbor_representations)
            passage_out_neighbor_representations = project_neighbors(
                passage_out_neighbor_representations)

            # Every node starts with an all-zero hidden and cell state.
            passage_node_hidden = tf.zeros(state_shape)
            passage_node_cell = tf.zeros(state_shape)

            def gate_params(gate):
                # Create the five parameters of one gate; the creation order
                # and names match the original hand-written variables.
                square = [grn_hidden_dim, grn_hidden_dim]
                w_in = tf.get_variable("w_in_" + gate, square, dtype=tf.float32)
                u_in = tf.get_variable("u_in_" + gate, square, dtype=tf.float32)
                bias = tf.get_variable("b_in_" + gate, [grn_hidden_dim], dtype=tf.float32)
                w_out = tf.get_variable("w_out_" + gate, square, dtype=tf.float32)
                u_out = tf.get_variable("u_out_" + gate, square, dtype=tf.float32)
                return w_in, u_in, w_out, u_out, bias

            ingate_params = gate_params("ingate")
            forgetgate_params = gate_params("forgetgate")
            outgate_params = gate_params("outgate")
            cell_params = gate_params("cell")

            def sum_prev_hidden(node_hidden, indices, neighbor_mask):
                # Masked sum of the neighbours' previous hidden states,
                # flattened to [batch_size * node_len, neighbor_vector_dim].
                prev = collect_neighbor_node_representations(node_hidden, indices)
                prev = tf.multiply(prev, neighbor_mask)
                prev = tf.reduce_sum(prev, axis=2)
                prev = tf.multiply(prev, node_mask)
                return tf.reshape(prev, [-1, grn_hidden_dim])

            def gate(params, activation, in_prev, out_prev):
                # activation(W_in x_in + U_in h_in + W_out x_out + U_out h_out + b)
                w_in, u_in, w_out, u_out, bias = params
                pre_activation = (
                    tf.matmul(passage_in_neighbor_representations, w_in) +
                    tf.matmul(in_prev, u_in) +
                    tf.matmul(passage_out_neighbor_representations, w_out) +
                    tf.matmul(out_prev, u_out) +
                    bias)
                return tf.reshape(activation(pre_activation), state_shape)

            # Unroll the gated state-transition steps.
            graph_representations = []
            for _ in range(options.num_syntax_match_layer):
                in_prev_hidden = sum_prev_hidden(
                    passage_node_hidden, self.passage_in_neighbor_indices,
                    in_neighbor_mask)
                out_prev_hidden = sum_prev_hidden(
                    passage_node_hidden, self.passage_out_neighbor_indices,
                    out_neighbor_mask)

                edge_ingate = gate(ingate_params, tf.sigmoid,
                                   in_prev_hidden, out_prev_hidden)
                edge_forgetgate = gate(forgetgate_params, tf.sigmoid,
                                       in_prev_hidden, out_prev_hidden)
                edge_outgate = gate(outgate_params, tf.sigmoid,
                                    in_prev_hidden, out_prev_hidden)
                edge_cell_input = gate(cell_params, tf.tanh,
                                       in_prev_hidden, out_prev_hidden)

                edge_cell = edge_forgetgate * passage_node_cell + edge_ingate * edge_cell_input
                edge_hidden = edge_outgate * tf.tanh(edge_cell)

                # Keep padded nodes at zero.
                # [batch_size, passage_len, neighbor_vector_dim]
                passage_node_cell = tf.multiply(edge_cell, node_mask)
                passage_node_hidden = tf.multiply(edge_hidden, node_mask)

                graph_representations.append(passage_node_hidden)

            # Expose the results; callers decide how to use them.
            self.graph_representations = graph_representations
            self.node_representations = passage_node_representation
            self.graph_hiddens = passage_node_hidden
            self.graph_cells = passage_node_cell

            self.batch_size = batch_size
예제 #18
0
    def __init__(self,
                 num_classes,
                 word_vocab=None,
                 char_vocab=None,
                 POS_vocab=None,
                 NER_vocab=None,
                 dropout_rate=0.5,
                 learning_rate=0.001,
                 optimize_type='adam',
                 lambda_l2=1e-5,
                 with_word=True,
                 with_char=True,
                 with_POS=True,
                 with_NER=True,
                 char_lstm_dim=20,
                 context_lstm_dim=100,
                 aggregation_lstm_dim=200,
                 is_training=True,
                 filter_layer_threshold=0.2,
                 MP_dim=50,
                 context_layer_num=1,
                 aggregation_layer_num=1,
                 fix_word_vec=False,
                 with_filter_layer=True,
                 with_highway=False,
                 word_level_MP_dim=-1,
                 sep_endpoint=False,
                 end_model_combine=False,
                 with_match_highway=False,
                 with_aggregation_highway=False,
                 highway_layer_num=1,
                 match_to_passage=True,
                 match_to_question=False,
                 match_to_choice=False,
                 with_no_match=False,
                 with_full_match=True,
                 with_maxpool_match=True,
                 with_attentive_match=True,
                 with_max_attentive_match=True,
                 use_options=False,
                 num_options=-1,
                 verbose=False,
                 matching_option=0,
                 concat_context=False,
                 tied_aggre=False,
                 rl_training_method='contrastive',
                 rl_matches=None):
        """Build the question / passage / choice matching graph and its train op.

        The constructor creates the input placeholders, the word / POS / NER /
        char embedding layers, an optional input highway layer, the matching
        layer (``match_utils.trilateral_match`` or, for ``matching_option == 7``,
        ``gated_trilateral_match``), the two-layer prediction head, the loss
        and the optimizer.

        Matching options (forwarded to the matching layer):
            0: a1=q->p, a2=c->p, [concat(a1->a2, a2->a1)]
            1: a1=q->p, a2=c->p, [a1->a2, a2->a1]
            2: [q->p, c->p]
            3: a1=p->q, a2=p->c, [a1->a2, a2->a1]
            4: [q->p, p->q, p->c]
            5: a1=q->p, a2=p->q, a3=p->c, [a3->a1, a3->a2]
            6: [p->q, p->c]
            7: gated matching

        Args:
            num_classes: Output width of the prediction head when
                ``use_options`` is False (also the one-hot depth of the labels).
            word_vocab, char_vocab, POS_vocab, NER_vocab: Vocabulary objects
                exposing ``word_vecs`` (initial embedding matrix) and
                ``word_dim``. Each one is used only when it is not None and the
                matching ``with_*`` flag is set.
            dropout_rate: Dropout rate; ``1 - dropout_rate`` is the keep
                probability.
            learning_rate: Learning rate handed to the optimizer.
            optimize_type: One of ``'adadelta'``, ``'sgd'``, ``'ema'``,
                ``'adam'``.
            lambda_l2: Weight of the L2 penalty (``'adadelta'`` and ``'adam'``
                only).
            with_word, with_char, with_POS, with_NER: Enable the respective
                input feature.
            char_lstm_dim: Hidden size of the character LSTM.
            context_lstm_dim: Forwarded to the matching layer; also fixes the
                gate input width (``2 * context_lstm_dim``) for option 7.
            aggregation_lstm_dim, MP_dim, context_layer_num,
            aggregation_layer_num, with_match_highway,
            with_aggregation_highway, with_full_match, with_maxpool_match,
            with_attentive_match, with_max_attentive_match: Forwarded
                unchanged to the matching layer.
            match_to_passage, match_to_question, match_to_choice,
            with_no_match: Forwarded to ``match_utils.trilateral_match``
                (options other than 7).
            is_training: Enables dropout; when False activations are scaled by
                ``1 - dropout_rate`` instead.
            fix_word_vec: When True the word embedding is not trainable and is
                placed on ``/cpu:0`` (otherwise ``/gpu:0``).
            with_highway: Apply a shared highway layer to the three inputs.
            highway_layer_num: Number of highway layers.
            use_options: When True the head emits one score per row and the
                scores are regrouped as ``[-1, num_options]``.
            num_options: Number of options per example (``use_options`` only).
            verbose: Ask the matching layer for debug output, stored in
                ``self.matching_vectors``.
            matching_option: See the table above.
            concat_context: Option 7 only; concatenate question and choice
                before the context LSTM (adds the split-index placeholders).
            tied_aggre: Option 7 only; tie the aggregation layer weights.
            rl_training_method: Option 7 only; ``'contrastive'`` or
                ``'soft_voting'``.
            rl_matches: Option 7 only; matchers the gate chooses between.
                Defaults to ``[0, 1, 2]``.
            filter_layer_threshold, with_filter_layer, word_level_MP_dim,
            sep_endpoint, end_model_combine: Accepted for interface
                compatibility; not read by this constructor.
        """
        # A fresh list per call avoids sharing one default list between models.
        if rl_matches is None:
            rl_matches = [0, 1, 2]

        # ======word representation layer======
        in_question_repres = []
        in_passage_repres = []
        in_choice_repres = []
        self.question_lengths = tf.placeholder(tf.int32, [None])
        self.passage_lengths = tf.placeholder(tf.int32, [None])
        self.choice_lengths = tf.placeholder(tf.int32, [None])
        self.truth = tf.placeholder(tf.int32, [None])  # [batch_size]
        self.concat_idx_mat = None
        self.split_idx_mat_q = None
        self.split_idx_mat_c = None
        if matching_option == 7:
            self.concat_idx_mat = tf.placeholder(tf.int32, [None, None, 2])
            if concat_context:
                self.split_idx_mat_q = tf.placeholder(tf.int32,
                                                      [None, None, 2])
                self.split_idx_mat_c = tf.placeholder(tf.int32,
                                                      [None, None, 2])
        input_dim = 0
        if with_word and word_vocab is not None:
            self.in_question_words = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, question_len]
            self.in_passage_words = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, passage_len]
            self.in_choice_words = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, choice_len]
            word_vec_trainable = True
            cur_device = '/gpu:0'
            if fix_word_vec:
                word_vec_trainable = False
                cur_device = '/cpu:0'
            print('!!!shape=', word_vocab.word_vecs.shape)
            with tf.device(cur_device):
                self.word_embedding = tf.get_variable(
                    "word_embedding",
                    trainable=word_vec_trainable,
                    initializer=tf.constant(word_vocab.word_vecs),
                    dtype=tf.float32)

            in_question_word_repres = tf.nn.embedding_lookup(
                self.word_embedding,
                self.in_question_words)  # [batch_size, question_len, word_dim]
            in_passage_word_repres = tf.nn.embedding_lookup(
                self.word_embedding,
                self.in_passage_words)  # [batch_size, passage_len, word_dim]
            in_choice_word_repres = tf.nn.embedding_lookup(
                self.word_embedding,
                self.in_choice_words)  # [batch_size, choice_len, word_dim]
            in_question_repres.append(in_question_word_repres)
            in_passage_repres.append(in_passage_word_repres)
            in_choice_repres.append(in_choice_word_repres)

            input_shape = tf.shape(self.in_question_words)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_words)
            passage_len = input_shape[1]
            input_shape = tf.shape(self.in_choice_words)
            choice_len = input_shape[1]
            input_dim += word_vocab.word_dim

        # NOTE(review): the POS and NER features are added to the question and
        # passage only, yet they still increase input_dim, which is later used
        # for the choice input as well -- confirm this is intended.
        if with_POS and POS_vocab is not None:
            self.in_question_POSs = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, question_len]
            self.in_passage_POSs = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, passage_len]
            self.POS_embedding = tf.get_variable("POS_embedding",
                                                 initializer=tf.constant(
                                                     POS_vocab.word_vecs),
                                                 dtype=tf.float32)

            in_question_POS_repres = tf.nn.embedding_lookup(
                self.POS_embedding,
                self.in_question_POSs)  # [batch_size, question_len, POS_dim]
            in_passage_POS_repres = tf.nn.embedding_lookup(
                self.POS_embedding,
                self.in_passage_POSs)  # [batch_size, passage_len, POS_dim]
            in_question_repres.append(in_question_POS_repres)
            in_passage_repres.append(in_passage_POS_repres)

            input_shape = tf.shape(self.in_question_POSs)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_POSs)
            passage_len = input_shape[1]
            input_dim += POS_vocab.word_dim

        if with_NER and NER_vocab is not None:
            self.in_question_NERs = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, question_len]
            self.in_passage_NERs = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, passage_len]
            self.NER_embedding = tf.get_variable("NER_embedding",
                                                 initializer=tf.constant(
                                                     NER_vocab.word_vecs),
                                                 dtype=tf.float32)

            in_question_NER_repres = tf.nn.embedding_lookup(
                self.NER_embedding,
                self.in_question_NERs)  # [batch_size, question_len, NER_dim]
            in_passage_NER_repres = tf.nn.embedding_lookup(
                self.NER_embedding,
                self.in_passage_NERs)  # [batch_size, passage_len, NER_dim]
            in_question_repres.append(in_question_NER_repres)
            in_passage_repres.append(in_passage_NER_repres)

            input_shape = tf.shape(self.in_question_NERs)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_NERs)
            passage_len = input_shape[1]
            input_dim += NER_vocab.word_dim

        if with_char and char_vocab is not None:
            self.question_char_lengths = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, question_len]
            self.passage_char_lengths = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, passage_len]
            self.choice_char_lengths = tf.placeholder(
                tf.int32, [None, None])  # [batch_size, choice_len]
            self.in_question_chars = tf.placeholder(
                tf.int32,
                [None, None, None])  # [batch_size, question_len, q_char_len]
            self.in_passage_chars = tf.placeholder(
                tf.int32,
                [None, None, None])  # [batch_size, passage_len, p_char_len]
            self.in_choice_chars = tf.placeholder(
                tf.int32,
                [None, None, None])  # [batch_size, choice_len, c_char_len]
            input_shape = tf.shape(self.in_question_chars)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            q_char_len = input_shape[2]
            input_shape = tf.shape(self.in_passage_chars)
            passage_len = input_shape[1]
            p_char_len = input_shape[2]
            input_shape = tf.shape(self.in_choice_chars)
            choice_len = input_shape[1]
            c_char_len = input_shape[2]

            char_dim = char_vocab.word_dim

            self.char_embedding = tf.get_variable("char_embedding",
                                                  initializer=tf.constant(
                                                      char_vocab.word_vecs),
                                                  dtype=tf.float32)

            # Every word becomes its own character sequence, so the batch and
            # word axes are flattened together before the character LSTM.
            in_question_char_repres = tf.nn.embedding_lookup(
                self.char_embedding, self.in_question_chars
            )  # [batch_size, question_len, q_char_len, char_dim]
            in_question_char_repres = tf.reshape(
                in_question_char_repres, shape=[-1, q_char_len, char_dim])
            question_char_lengths = tf.reshape(self.question_char_lengths,
                                               [-1])
            in_passage_char_repres = tf.nn.embedding_lookup(
                self.char_embedding, self.in_passage_chars
            )  # [batch_size, passage_len, p_char_len, char_dim]
            in_passage_char_repres = tf.reshape(
                in_passage_char_repres, shape=[-1, p_char_len, char_dim])
            passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1])
            in_choice_char_repres = tf.nn.embedding_lookup(
                self.char_embedding, self.in_choice_chars
            )  # [batch_size, choice_len, c_char_len, char_dim]
            in_choice_char_repres = tf.reshape(
                in_choice_char_repres, shape=[-1, c_char_len, char_dim])
            choice_char_lengths = tf.reshape(self.choice_char_lengths, [-1])

            with tf.variable_scope('char_lstm'):
                # lstm cell
                char_lstm_cell = tf.contrib.rnn.BasicLSTMCell(char_lstm_dim)
                # dropout
                if is_training:
                    char_lstm_cell = tf.contrib.rnn.DropoutWrapper(
                        char_lstm_cell, output_keep_prob=(1 - dropout_rate))
                char_lstm_cell = tf.contrib.rnn.MultiRNNCell([char_lstm_cell])

                # question representation: keep the output at the last step
                question_char_outputs = my_rnn.dynamic_rnn(
                    char_lstm_cell,
                    in_question_char_repres,
                    sequence_length=question_char_lengths,
                    dtype=tf.float32
                )[0]  # [batch_size*question_len, q_char_len, char_lstm_dim]
                question_char_outputs = question_char_outputs[:, -1, :]
                question_char_outputs = tf.reshape(
                    question_char_outputs,
                    [batch_size, question_len, char_lstm_dim])

                # The passage and choice reuse the question's LSTM weights.
                tf.get_variable_scope().reuse_variables()
                # passage representation
                passage_char_outputs = my_rnn.dynamic_rnn(
                    char_lstm_cell,
                    in_passage_char_repres,
                    sequence_length=passage_char_lengths,
                    dtype=tf.float32
                )[0]  # [batch_size*passage_len, p_char_len, char_lstm_dim]
                passage_char_outputs = passage_char_outputs[:, -1, :]
                passage_char_outputs = tf.reshape(
                    passage_char_outputs,
                    [batch_size, passage_len, char_lstm_dim])

                tf.get_variable_scope().reuse_variables()
                # choice representation
                choice_char_outputs = my_rnn.dynamic_rnn(
                    char_lstm_cell,
                    in_choice_char_repres,
                    sequence_length=choice_char_lengths,
                    dtype=tf.float32
                )[0]  # [batch_size*choice_len, c_char_len, char_lstm_dim]
                choice_char_outputs = choice_char_outputs[:, -1, :]
                choice_char_outputs = tf.reshape(
                    choice_char_outputs,
                    [batch_size, choice_len, char_lstm_dim])

            in_question_repres.append(question_char_outputs)
            in_passage_repres.append(passage_char_outputs)
            in_choice_repres.append(choice_char_outputs)

            input_dim += char_lstm_dim

        in_question_repres = tf.concat(in_question_repres,
                                       2)  # [batch_size, question_len, dim]
        in_passage_repres = tf.concat(in_passage_repres,
                                      2)  # [batch_size, passage_len, dim]
        in_choice_repres = tf.concat(in_choice_repres,
                                     2)  # [batch_size, choice_len, dim]

        # NOTE(review): tf.nn.dropout already rescales kept units at training
        # time, so the extra (1 - dropout_rate) scaling at inference looks
        # redundant. It is kept because changing it would alter the outputs of
        # already-trained models -- confirm before touching.
        if is_training:
            in_question_repres = tf.nn.dropout(in_question_repres,
                                               (1 - dropout_rate))
            in_passage_repres = tf.nn.dropout(in_passage_repres,
                                              (1 - dropout_rate))
            in_choice_repres = tf.nn.dropout(in_choice_repres,
                                             (1 - dropout_rate))
        else:
            in_question_repres = tf.multiply(in_question_repres,
                                             (1 - dropout_rate))
            in_passage_repres = tf.multiply(in_passage_repres,
                                            (1 - dropout_rate))
            in_choice_repres = tf.multiply(in_choice_repres,
                                           (1 - dropout_rate))

        mask = tf.sequence_mask(self.passage_lengths,
                                passage_len,
                                dtype=tf.float32)  # [batch_size, passage_len]
        question_mask = tf.sequence_mask(
            self.question_lengths, question_len,
            dtype=tf.float32)  # [batch_size, question_len]
        choice_mask = tf.sequence_mask(
            self.choice_lengths, choice_len,
            dtype=tf.float32)  # [batch_size, choice_len]

        # ======Highway layer======
        if with_highway:
            with tf.variable_scope("input_highway"):
                in_question_repres = match_utils.multi_highway_layer(
                    in_question_repres, input_dim, highway_layer_num)
                tf.get_variable_scope().reuse_variables()
                in_passage_repres = match_utils.multi_highway_layer(
                    in_passage_repres, input_dim, highway_layer_num)
                tf.get_variable_scope().reuse_variables()
                in_choice_repres = match_utils.multi_highway_layer(
                    in_choice_repres, input_dim, highway_layer_num)

        # ========Bilateral Matching=====
        # Both matchers take the same leading inputs and the same run of
        # hyper-parameters; only the middle and trailing arguments differ.
        match_inputs = (in_question_repres, in_passage_repres,
                        in_choice_repres, self.question_lengths,
                        self.passage_lengths, self.choice_lengths,
                        question_mask, mask, choice_mask)
        match_hparams = (MP_dim, input_dim, context_layer_num,
                         context_lstm_dim, is_training, dropout_rate,
                         with_match_highway, aggregation_layer_num,
                         aggregation_lstm_dim, highway_layer_num,
                         with_aggregation_highway, with_full_match,
                         with_maxpool_match, with_attentive_match,
                         with_max_attentive_match)
        # In verbose mode the matcher is called with debug=True and returns
        # one extra value, which is stored in self.matching_vectors.
        debug_kwargs = {'debug': True} if verbose else {}
        if matching_option == 7:
            gated_args = (match_inputs +
                          (self.concat_idx_mat, self.split_idx_mat_q,
                           self.split_idx_mat_c) + match_hparams +
                          (concat_context, tied_aggre, rl_matches))
            gated_outputs = gated_trilateral_match(*gated_args,
                                                   **debug_kwargs)
            if verbose:
                (all_match_templates, match_dim, gate_input,
                 self.matching_vectors) = gated_outputs
            else:
                all_match_templates, match_dim, gate_input = gated_outputs
        else:
            plain_args = (match_inputs + match_hparams +
                          (match_to_passage, match_to_question,
                           match_to_choice, with_no_match))
            plain_outputs = match_utils.trilateral_match(
                *plain_args, matching_option=matching_option, **debug_kwargs)
            if verbose:
                (match_representation, match_dim,
                 self.matching_vectors) = plain_outputs
            else:
                match_representation, match_dim = plain_outputs

        if matching_option == 7:
            with tf.variable_scope('rl_decision_gate'):
                if use_options:
                    # keep one gate input row per group of num_options rows
                    gate_input = gate_input[::num_options, :]
                w_gate = tf.get_variable(
                    'w_gate', [2 * context_lstm_dim,
                               len(rl_matches)],
                    dtype=tf.float32)
                b_gate = tf.get_variable('b_gate', [len(rl_matches)],
                                         dtype=tf.float32)
                gate_logits = tf.matmul(gate_input, w_gate) + b_gate

                gate_prob = tf.nn.softmax(gate_logits)

                gate_log_prob = tf.nn.log_softmax(gate_logits)

        print('check: match_dim=', match_dim)
        #========Prediction Layer=========
        # Floor division keeps the hidden width an integer; true division
        # yields a float under Python 3, which is not a safe shape entry.
        hidden_dim = match_dim // 2
        w_0 = tf.get_variable("w_0", [match_dim, hidden_dim],
                              dtype=tf.float32)
        b_0 = tf.get_variable("b_0", [hidden_dim], dtype=tf.float32)

        if use_options:
            w_1 = tf.get_variable("w_1", [hidden_dim, 1], dtype=tf.float32)
            b_1 = tf.get_variable("b_1", [1], dtype=tf.float32)
        else:
            w_1 = tf.get_variable("w_1", [hidden_dim, num_classes],
                                  dtype=tf.float32)
            b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

        if matching_option == 7:
            # One gate column per matcher, used to weight that matcher's
            # prediction.
            sliced_gate_probs = tf.split(gate_prob, len(rl_matches), axis=1)
            sliced_gate_log_probs = tf.split(gate_log_prob,
                                             len(rl_matches),
                                             axis=1)
            weighted_probs = []
            weighted_log_probs = []
            for mid, matcher in enumerate(all_match_templates):

                matcher.add_softmax_pred(w_0, b_0, w_1, b_1, is_training,
                                         dropout_rate, use_options,
                                         num_options)
                weighted_probs.append(
                    tf.multiply(matcher.prob, sliced_gate_probs[mid]))
                weighted_log_probs.append(
                    tf.add(matcher.log_prob, sliced_gate_log_probs[mid]))

            if verbose:
                self.all_probs = tf.stack(weighted_probs, axis=0)

            self.prob = tf.add_n(weighted_probs)
            if use_options:
                gold_matrix = tf.reshape(self.truth, [-1, num_options])
                gold_matrix = tf.cast(gold_matrix, tf.float32)
                correct = tf.equal(tf.argmax(self.prob, 1),
                                   tf.argmax(gold_matrix, 1))
            else:
                gold_matrix = tf.one_hot(self.truth,
                                         num_classes,
                                         dtype=tf.float32)
                # No `logits` tensor exists on the gated path, so rank the
                # gate-weighted probabilities instead.
                correct = tf.nn.in_top_k(self.prob, self.truth, 1)
            self.correct = correct
            self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
            self.predictions = tf.arg_max(self.prob, 1)

            # NOTE(review): any other rl_training_method leaves self.loss
            # unset and fails later in the optimizer section.
            if rl_training_method == 'soft_voting':
                stacked_log_prob = tf.stack(weighted_log_probs, axis=2)
                self.log_prob = tf.reduce_logsumexp(stacked_log_prob, axis=2)
                # NOTE(review): this is a log-likelihood without a minus
                # sign, so minimizing it looks inverted -- confirm.
                self.loss = tf.reduce_mean(
                    tf.multiply(gold_matrix, self.log_prob))
            elif rl_training_method == 'contrastive':
                weighted_log_probs = tf.stack(weighted_log_probs, axis=0)
                weighted_probs = tf.stack(weighted_probs, axis=0)
                reward_matrix = gold_matrix
                baseline = tf.reduce_sum(tf.multiply(weighted_probs,
                                                     reward_matrix),
                                         axis=[0, 2],
                                         keep_dims=True)
                log_coeffs = tf.multiply(weighted_probs,
                                         reward_matrix - baseline)
                # The coefficients act as fixed weights: no gradient flows
                # through them.
                log_coeffs = tf.stop_gradient(log_coeffs)
                self.loss = tf.negative(
                    tf.reduce_sum(tf.multiply(weighted_log_probs, log_coeffs)))

        else:

            logits = tf.matmul(match_representation, w_0) + b_0
            logits = tf.tanh(logits)
            if is_training:
                logits = tf.nn.dropout(logits, (1 - dropout_rate))
            else:
                logits = tf.multiply(logits, (1 - dropout_rate))
            logits = tf.matmul(logits, w_1) + b_1

            self.final_logits = logits
            if use_options:
                # Regroup the per-row scores so that each row holds the
                # num_options scores of one example.
                logits = tf.reshape(logits, [-1, num_options])
                gold_matrix = tf.reshape(self.truth, [-1, num_options])

                self.prob = tf.nn.softmax(logits)

                self.loss = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(
                        logits=logits, labels=gold_matrix))

                correct = tf.equal(tf.argmax(logits, 1),
                                   tf.argmax(gold_matrix, 1))
                self.correct = correct

            else:
                self.prob = tf.nn.softmax(logits)

                gold_matrix = tf.one_hot(self.truth,
                                         num_classes,
                                         dtype=tf.float32)
                self.loss = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(
                        logits=logits, labels=gold_matrix))

                correct = tf.nn.in_top_k(logits, self.truth, 1)
                self.correct = correct
            self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
            self.predictions = tf.arg_max(self.prob, 1)

        # NOTE(review): an unrecognised optimize_type leaves self.train_op
        # unset and fails at the final tf.group below.
        if optimize_type in ('adadelta', 'adam'):
            # Both variants add the L2 penalty over all trainable tensors of
            # rank > 1 and clip the global gradient norm.
            clipper = 50
            if optimize_type == 'adadelta':
                optimizer = tf.train.AdadeltaOptimizer(
                    learning_rate=learning_rate)
            else:
                optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            tvars = tf.trainable_variables()
            l2_loss = tf.add_n(
                [tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            self.loss = self.loss + lambda_l2 * l2_loss
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                              clipper)
            self.train_op = optimizer.apply_gradients(list(zip(grads, tvars)))
        elif optimize_type == 'sgd':
            self.global_step = tf.Variable(
                0, name='global_step',
                trainable=False)  # Create a variable to track the global step.
            min_lr = 0.000001
            self._lr_rate = tf.maximum(
                min_lr,
                tf.train.exponential_decay(learning_rate, self.global_step,
                                           30000, 0.98))
            # NOTE(review): global_step is not passed to minimize(), so the
            # decay only takes effect if the caller increments it -- confirm.
            self.train_op = tf.train.GradientDescentOptimizer(
                learning_rate=self._lr_rate).minimize(self.loss)
        elif optimize_type == 'ema':
            tvars = tf.trainable_variables()
            train_op = tf.train.AdamOptimizer(
                learning_rate=learning_rate).minimize(self.loss)
            # Create an ExponentialMovingAverage object
            ema = tf.train.ExponentialMovingAverage(decay=0.9999)
            # Create the shadow variables and the op that maintains their
            # moving averages.
            maintain_averages_op = ema.apply(tvars)
            # Create an op that will update the moving averages after each training
            # step.  This is what we will use in place of the usual training op.
            with tf.control_dependencies([train_op]):
                self.train_op = tf.group(maintain_averages_op)

        extra_train_ops = []
        train_ops = [self.train_op] + extra_train_ops
        self.train_op = tf.group(*train_ops)
예제 #19
0
    def create_model_graph(self, num_classes, word_vocab=None, char_vocab=None, is_training=True, global_step=None):
        """Build the sentence-matching graph and, when training, the train op.

        The graph embeds the question and passage (word embeddings and,
        optionally, a char-level BiLSTM), applies optional highway layers,
        runs ``match_utils.bilateral_match_func`` and feeds the resulting
        representation through a two-layer prediction head.

        Reads the input tensors stored on ``self`` (``in_question_words``,
        ``in_passage_words``, ``in_question_chars``, ``in_passage_chars``,
        ``question_char_lengths``, ``passage_char_lengths``,
        ``question_lengths``, ``passage_lengths``, ``truth``) and the
        configuration in ``self.options``.

        Sets ``self.word_embedding`` / ``self.char_embedding`` (when the
        corresponding vocab is used), ``self.prob``, ``self.loss``,
        ``self.eval_correct``, ``self.predictions`` and, when ``is_training``
        is true, ``self.train_op``.

        Args:
            num_classes: Number of output classes; size of the final layer
                and depth of the one-hot gold labels.
            word_vocab: Object exposing ``word_vecs`` and ``word_dim``; when
                ``None`` no word-level representation is built.
            char_vocab: Object exposing ``word_vecs`` and ``word_dim``; only
                used when ``options.with_char`` is set.
            is_training: When true, dropout is applied and the optimizer /
                train op are created.
            global_step: Passed to ``optimizer.apply_gradients`` and to the
                optional ``ExponentialMovingAverage``.

        Returns:
            None.

        Raises:
            ValueError: If neither a word nor a char representation can be
                built, or if ``options.optimize_type`` is not ``'adadelta'``
                or ``'adam'``.
        """
        options = self.options
        # ======word representation layer======
        in_question_repres = []  # word and char
        in_passage_repres = []  # word and char
        input_dim = 0
        if word_vocab is not None:
            # Fixed (non-trainable) word vectors are placed on the CPU,
            # trainable ones on the first GPU.
            word_vec_trainable = True
            cur_device = '/gpu:0'
            if options.fix_word_vec:
                word_vec_trainable = False
                cur_device = '/cpu:0'
            with tf.device(cur_device):
                self.word_embedding = tf.get_variable("word_embedding", trainable=word_vec_trainable,
                                                  initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32)

            in_question_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_question_words) # [batch_size, question_len, word_dim]
            in_passage_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_passage_words) # [batch_size, passage_len, word_dim]
            in_question_repres.append(in_question_word_repres)
            in_passage_repres.append(in_passage_word_repres)

            input_shape = tf.shape(self.in_question_words)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_words)
            passage_len = input_shape[1]
            input_dim += word_vocab.word_dim

        if options.with_char and char_vocab is not None:
            input_shape = tf.shape(self.in_question_chars)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            q_char_len = input_shape[2]
            input_shape = tf.shape(self.in_passage_chars)
            passage_len = input_shape[1]
            p_char_len = input_shape[2]
            char_dim = char_vocab.word_dim
            self.char_embedding = tf.get_variable("char_embedding", initializer=tf.constant(char_vocab.word_vecs), dtype=tf.float32)

            # Flatten words into the batch axis so each word is one char sequence,
            # then zero out the embeddings at padded char positions.
            in_question_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_question_chars) # [batch_size, question_len, q_char_len, char_dim]
            in_question_char_repres = tf.reshape(in_question_char_repres, shape=[-1, q_char_len, char_dim])
            question_char_lengths = tf.reshape(self.question_char_lengths, [-1])
            question_char_mask = tf.sequence_mask(question_char_lengths, q_char_len, dtype=tf.float32)  # [batch_size*question_len, q_char_len]
            in_question_char_repres = tf.multiply(in_question_char_repres, tf.expand_dims(question_char_mask, axis=-1))

            in_passage_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_passage_chars) # [batch_size, passage_len, p_char_len, char_dim]
            in_passage_char_repres = tf.reshape(in_passage_char_repres, shape=[-1, p_char_len, char_dim])
            passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1])
            passage_char_mask = tf.sequence_mask(passage_char_lengths, p_char_len, dtype=tf.float32)  # [batch_size*passage_len, p_char_len]
            in_passage_char_repres = tf.multiply(in_passage_char_repres, tf.expand_dims(passage_char_mask, axis=-1))

            # Question and passage share the "char_lstm" scope: the question call
            # creates the variables (reuse=False), the passage call reuses them.
            (question_char_outputs_fw, question_char_outputs_bw, _) = layer_utils.my_lstm_layer(in_question_char_repres, options.char_lstm_dim,
                    input_lengths=question_char_lengths,scope_name="char_lstm", reuse=False,
                    is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
            # Forward direction: output at index (length - 1); backward direction:
            # output at time step 0.
            question_char_outputs_fw = layer_utils.collect_final_step_of_lstm(question_char_outputs_fw, question_char_lengths - 1)
            question_char_outputs_bw = question_char_outputs_bw[:, 0, :]
            question_char_outputs = tf.concat(axis=1, values=[question_char_outputs_fw, question_char_outputs_bw])
            question_char_outputs = tf.reshape(question_char_outputs, [batch_size, question_len, 2*options.char_lstm_dim])

            (passage_char_outputs_fw, passage_char_outputs_bw, _) = layer_utils.my_lstm_layer(in_passage_char_repres, options.char_lstm_dim,
                    input_lengths=passage_char_lengths, scope_name="char_lstm", reuse=True,
                    is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
            passage_char_outputs_fw = layer_utils.collect_final_step_of_lstm(passage_char_outputs_fw, passage_char_lengths - 1)
            passage_char_outputs_bw = passage_char_outputs_bw[:, 0, :]
            passage_char_outputs = tf.concat(axis=1, values=[passage_char_outputs_fw, passage_char_outputs_bw])
            passage_char_outputs = tf.reshape(passage_char_outputs, [batch_size, passage_len, 2*options.char_lstm_dim])

            in_question_repres.append(question_char_outputs)
            in_passage_repres.append(passage_char_outputs)

            input_dim += 2*options.char_lstm_dim

        # Without at least one representation there is nothing to concatenate and
        # question_len / passage_len were never defined; fail with a clear message.
        if not in_question_repres:
            raise ValueError(
                "create_model_graph needs word_vocab and/or "
                "(options.with_char and char_vocab) to build input representations")

        in_question_repres = tf.concat(axis=2, values=in_question_repres) # [batch_size, question_len, dim] # concat word and char
        in_passage_repres = tf.concat(axis=2, values=in_passage_repres) # [batch_size, passage_len, dim] # concat word and char

        if is_training:
            in_question_repres = tf.nn.dropout(in_question_repres, (1 - options.dropout_rate))
            in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate))

        mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32) # [batch_size, passage_len]
        question_mask = tf.sequence_mask(self.question_lengths, question_len, dtype=tf.float32) # [batch_size, question_len]

        # ======Highway layer======
        if options.with_highway:
            with tf.variable_scope("input_highway"):
                in_question_repres = match_utils.multi_highway_layer(in_question_repres, input_dim, options.highway_layer_num)
                # Share the highway weights between question and passage.
                tf.get_variable_scope().reuse_variables()
                in_passage_repres = match_utils.multi_highway_layer(in_passage_repres, input_dim, options.highway_layer_num)

        # in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
        # in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(mask, axis=-1))

        # ========Bilateral Matching=====
        (match_representation, match_dim) = match_utils.bilateral_match_func(in_question_repres, in_passage_repres,
                        self.question_lengths, self.passage_lengths, question_mask, mask, input_dim, is_training, options=options)

        #========Prediction Layer=========
        # match_dim = 4 * self.options.aggregation_lstm_dim
        # Floor division keeps the hidden size an int on Python 3; true division
        # would hand a float dimension to tf.get_variable.
        hidden_dim = match_dim // 2
        w_0 = tf.get_variable("w_0", [match_dim, hidden_dim], dtype=tf.float32)
        b_0 = tf.get_variable("b_0", [hidden_dim], dtype=tf.float32)
        w_1 = tf.get_variable("w_1", [hidden_dim, num_classes], dtype=tf.float32)
        b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

        # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
        logits = tf.matmul(match_representation, w_0) + b_0
        logits = tf.tanh(logits)
        if is_training:
            logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
        logits = tf.matmul(logits, w_1) + b_1

        self.prob = tf.nn.softmax(logits)

        gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
        self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix))

        correct = tf.nn.in_top_k(logits, self.truth, 1)
        self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
        self.predictions = tf.argmax(self.prob, 1)

        # Everything below only concerns optimisation.
        if not is_training:
            return

        tvars = tf.trainable_variables()
        if options.lambda_l2 > 0.0:
            # L2 penalty on variables of rank > 1 only (rank-1 variables such as
            # biases are skipped).
            l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            self.loss = self.loss + options.lambda_l2 * l2_loss

        if options.optimize_type == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(learning_rate=options.learning_rate)
        elif options.optimize_type == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate=options.learning_rate)
        else:
            # Previously this fell through and failed later with an unbound
            # `optimizer`; report the real problem instead.
            raise ValueError(
                "Unsupported optimize_type: %r (expected 'adadelta' or 'adam')"
                % (options.optimize_type,))

        grads = layer_utils.compute_gradients(self.loss, tvars)
        grads, _ = tf.clip_by_global_norm(grads, options.grad_clipper)
        self.train_op = optimizer.apply_gradients(list(zip(grads, tvars)), global_step=global_step)
        # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        if options.with_moving_average:
            # Track the moving averages of all trainable variables.
            MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
            variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
            variables_averages_op = variable_averages.apply(tf.trainable_variables())
            train_ops = [self.train_op, variables_averages_op]
            self.train_op = tf.group(*train_ops)
예제 #20
0
    def _build(self, in_passage_words, passage_lengths, in_question_words_soft,
               question_lengths, truth):
        """Build the matching graph for a hard passage and a soft question.

        The passage is embedded with a regular embedding lookup, while the
        question is embedded with ``tx.utils.soft_sequence_embedding``. Both
        then go through optional highway layers,
        ``match_utils.bilateral_match_func`` and a two-layer prediction head.

        Sets ``self.word_embedding`` (when ``self.word_vocab`` is not
        ``None``), ``self.prob``, ``self.loss``, ``self.eval_correct``,
        ``self.predictions`` and, when ``self.is_training`` is true,
        ``self.train_op``.

        Args:
            in_passage_words: Passage word ids, looked up in the word
                embedding; axis 1 is used as the passage length.
            passage_lengths: Per-example passage lengths used to build the
                passage mask.
            in_question_words_soft: Soft question input passed to
                ``tx.utils.soft_sequence_embedding``; axis 1 is used as the
                question length. NOTE(review): presumably per-token
                distributions over the vocabulary -- confirm with callers.
            question_lengths: Per-example question lengths used to build the
                question mask.
            truth: Integer class ids in ``[0, num_classes)``, used for the
                one-hot gold labels and the top-1 accuracy check.

        Returns:
            A dict with keys ``"logits"``, ``"prob"``, ``"loss"``,
            ``"correct"``, ``"eval_correct"`` and ``"predictions"``.

        Raises:
            ValueError: If ``self.word_vocab`` is ``None`` (no input
                representation can be built), or if
                ``options.optimize_type`` is not ``'adadelta'`` or
                ``'adam'`` while training.
        """
        num_classes = self.num_classes
        word_vocab = self.word_vocab
        is_training = self.is_training
        global_step = self.global_step
        options = self.options
        # ======word representation layer======
        in_question_repres = []
        in_passage_repres = []
        input_dim = 0
        if word_vocab is not None:
            # Fixed (non-trainable) word vectors are placed on the CPU,
            # trainable ones on the first GPU.
            word_vec_trainable = True
            cur_device = '/gpu:0'
            if options.fix_word_vec:
                word_vec_trainable = False
                cur_device = '/cpu:0'
            with tf.device(cur_device):
                self.word_embedding = tf.get_variable(
                    "word_embedding",
                    trainable=word_vec_trainable,
                    initializer=tf.constant(word_vocab.word_vecs),
                    dtype=tf.float32)

            #in_question_word_repres = tf.nn.embedding_lookup(self.word_embedding, in_question_words_soft) # [batch_size, question_len, word_dim]
            in_question_word_repres = tx.utils.soft_sequence_embedding(
                self.word_embedding, in_question_words_soft)
            in_passage_word_repres = tf.nn.embedding_lookup(
                self.word_embedding,
                in_passage_words)  # [batch_size, passage_len, word_dim]
            in_question_repres.append(in_question_word_repres)
            in_passage_repres.append(in_passage_word_repres)

            question_len = tf.shape(in_question_words_soft)[1]
            passage_len = tf.shape(in_passage_words)[1]
            input_dim += word_vocab.word_dim

        # Without a word vocab there is nothing to concatenate and
        # question_len / passage_len were never defined; fail clearly.
        if not in_question_repres:
            raise ValueError(
                "_build requires self.word_vocab to build input representations")

        in_question_repres = tf.concat(
            axis=2,
            values=in_question_repres)  # [batch_size, question_len, dim]
        in_passage_repres = tf.concat(
            axis=2, values=in_passage_repres)  # [batch_size, passage_len, dim]

        if is_training:
            in_question_repres = tf.nn.dropout(in_question_repres,
                                               (1 - options.dropout_rate))
            in_passage_repres = tf.nn.dropout(in_passage_repres,
                                              (1 - options.dropout_rate))

        mask = tf.sequence_mask(passage_lengths, passage_len,
                                dtype=tf.float32)  # [batch_size, passage_len]
        question_mask = tf.sequence_mask(
            question_lengths, question_len,
            dtype=tf.float32)  # [batch_size, question_len]

        # ======Highway layer======
        if options.with_highway:
            with tf.variable_scope("input_highway"):
                in_question_repres = match_utils.multi_highway_layer(
                    in_question_repres, input_dim, options.highway_layer_num)
                # Share the highway weights between question and passage.
                tf.get_variable_scope().reuse_variables()
                in_passage_repres = match_utils.multi_highway_layer(
                    in_passage_repres, input_dim, options.highway_layer_num)

        # in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
        # in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(mask, axis=-1))

        # ========Bilateral Matching=====
        (match_representation,
         match_dim) = match_utils.bilateral_match_func(in_question_repres,
                                                       in_passage_repres,
                                                       question_lengths,
                                                       passage_lengths,
                                                       question_mask,
                                                       mask,
                                                       input_dim,
                                                       is_training,
                                                       options=options)

        #========Prediction Layer=========
        # match_dim = 4 * self.options.aggregation_lstm_dim
        # Floor division keeps the hidden size an int on Python 3; true
        # division would hand a float dimension to tf.get_variable.
        hidden_dim = match_dim // 2
        w_0 = tf.get_variable("w_0", [match_dim, hidden_dim],
                              dtype=tf.float32)
        b_0 = tf.get_variable("b_0", [hidden_dim], dtype=tf.float32)
        w_1 = tf.get_variable("w_1", [hidden_dim, num_classes],
                              dtype=tf.float32)
        b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

        # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
        logits = tf.matmul(match_representation, w_0) + b_0
        logits = tf.tanh(logits)
        if is_training:
            logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
        logits = tf.matmul(logits, w_1) + b_1

        self.prob = tf.nn.softmax(logits)

        gold_matrix = tf.one_hot(truth, num_classes, dtype=tf.float32)
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=gold_matrix))

        correct = tf.nn.in_top_k(logits, truth, 1)
        self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
        self.predictions = tf.argmax(self.prob, 1)

        if is_training:
            tvars = tf.trainable_variables()
            if options.lambda_l2 > 0.0:
                # L2 penalty on variables of rank > 1 only (rank-1 variables
                # such as biases are skipped).
                l2_loss = tf.add_n([
                    tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1
                ])
                self.loss = self.loss + options.lambda_l2 * l2_loss

            if options.optimize_type == 'adadelta':
                optimizer = tf.train.AdadeltaOptimizer(
                    learning_rate=options.learning_rate)
            elif options.optimize_type == 'adam':
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=options.learning_rate)
            else:
                # Previously this fell through and failed later with an
                # unbound `optimizer`; report the real problem instead.
                raise ValueError(
                    "Unsupported optimize_type: %r "
                    "(expected 'adadelta' or 'adam')" %
                    (options.optimize_type,))

            grads = layer_utils.compute_gradients(self.loss, tvars)
            grads, _ = tf.clip_by_global_norm(grads, options.grad_clipper)
            self.train_op = optimizer.apply_gradients(list(zip(grads, tvars)),
                                                      global_step=global_step)
            # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

            if options.with_moving_average:
                # Track the moving averages of all trainable variables.
                MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
                variable_averages = tf.train.ExponentialMovingAverage(
                    MOVING_AVERAGE_DECAY, global_step)
                variables_averages_op = variable_averages.apply(
                    tf.trainable_variables())
                train_ops = [self.train_op, variables_averages_op]
                self.train_op = tf.group(*train_ops)

        # Note: "loss" includes the L2 term when training with lambda_l2 > 0.
        return {
            "logits": logits,
            "prob": self.prob,
            "loss": self.loss,
            "correct": correct,
            "eval_correct": self.eval_correct,
            "predictions": self.predictions,
        }