def build_train_proc(self):
        # input layer (batch_size, n_steps, input_dim)
        self.input_q = tf.placeholder(
            tf.float32, [None, self.max_n_q_words, self.input_ques_dim])
        self.input_q_len = tf.placeholder(tf.int32, [None])
        self.input_x = tf.placeholder(
            tf.float32, [None, self.input_n_frames, self.input_frame_dim])
        self.input_x_len = tf.placeholder(tf.int32, [None])
        self.y = tf.placeholder(tf.int32, [None, self.max_n_a_words])
        self.y_mask = tf.placeholder(tf.float32, [None, self.max_n_a_words])
        self.ans_vec = tf.placeholder(
            tf.float32, [None, self.max_n_a_words, self.input_ques_dim])
        self.batch_size = tf.placeholder(tf.int32, [])
        self.is_training = tf.placeholder(tf.bool)
        self.reward = tf.placeholder(tf.float32, [None])
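        # `reward` is a per-example scalar, presumably consumed by a
        # reinforcement-style objective defined outside this method; it is not
        # referenced directly below.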

        self.Wsi = tf.get_variable(
            'Wsi',
            shape=[self.input_frame_dim, self.ref_dim],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        self.Wsh = tf.get_variable(
            'Wsh',
            shape=[self.lstm_dim, self.ref_dim],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        self.Wsq = tf.get_variable(
            'Wsq',
            shape=[self.lstm_dim, self.ref_dim],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        self.bias = tf.get_variable(
            'bias',
            shape=[self.ref_dim],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        self.Vs = tf.get_variable(
            'Vs',
            shape=[self.ref_dim, 1],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
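        # Wsi, Wsh, bias, and Vs parameterize the learned boundary detector
        # used in the frame loop below:
        #   s_t = sigmoid(Vs^T (Wsh h_t + Wsi x_{t+1} + bias))
        # Wsq is declared here but appears unused in this variant.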

        # apply dropout to a local copy so the placeholder itself stays intact
        # (note: tf.contrib.layers.dropout's second argument is keep_prob)
        input_x = tf.contrib.layers.dropout(self.input_x,
                                            self.dropout_prob,
                                            is_training=self.is_training)

        # Question LSTM encoder over (batch_size, max_n_q_words, input_ques_dim)
        input_q = tf.contrib.layers.dropout(self.input_q,
                                            self.dropout_prob,
                                            is_training=self.is_training)
        q_lstm_output, q_lstm_state = layers.dynamic_origin_lstm_layer(
            input_q, self.lstm_dim, 'q_lstm', input_len=self.input_q_len)
        self.q_last_state = tf.contrib.layers.dropout(
            q_lstm_state[1], self.dropout_prob, is_training=self.is_training)

        cell = tf.contrib.rnn.BasicLSTMCell(self.lstm_dim,
                                            forget_bias=0.0,
                                            state_is_tuple=True)
        state = cell.zero_state(self.batch_size, tf.float32)

        cell_second = tf.contrib.rnn.BasicLSTMCell(self.second_lstm_dim,
                                                   forget_bias=0.0,
                                                   state_is_tuple=True)
        state_second = cell_second.zero_state(self.batch_size, tf.float32)

        img_first_outputs = []
        img_second_outputs = []
        mask_output = []
        cond_list = []
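
        # Two-layer hierarchical video encoder: `cell` runs at frame level and
        # `cell_second` at segment level, with the hand-off between the two
        # gated by the learned boundary detector.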

        with tf.variable_scope("img_first_layer"):
            for time_step in range(self.input_n_frames - 1):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()

                (cell_output, state) = cell(input_x[:, time_step, :], state)

                ref = tf.matmul(state[1], self.Wsh) + tf.matmul(
                    input_x[:, time_step + 1, :], self.Wsi) + self.bias
                condition = tf.sigmoid(tf.matmul(ref, self.Vs))
                keep = tf.squeeze(condition, 1) > 0.3  # boundary threshold
                cond_list.append(condition)

                state = tf.contrib.rnn.LSTMStateTuple(
                    tf.where(keep, state[0], tf.zeros_like(state[0])),
                    tf.where(keep, state[1], tf.zeros_like(state[1])))
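
                # keep == True: no boundary at this step, so the frame-level
                # state carries over and the segment-level LSTM below stays
                # frozen; keep == False: a boundary was detected, so the
                # frame-level state resets and the segment-level LSTM absorbs
                # the finished segment.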
                img_first_outputs.append(cell_output)

                with tf.variable_scope("img_second_layer"):
                    (cell_output_second_tmp,
                     state_second_tmp) = cell_second(cell_output, state_second)
                cell_output_second = tf.where(
                    prod, tf.zeros_like(cell_output_second_tmp),
                    cell_output_second_tmp)
                img_second_outputs.append(cell_output_second)
                state_second = (tf.where(prod, state_second[0],
                                         state_second_tmp[0]),
                                tf.where(prod, state_second[1],
                                         state_second_tmp[1]))

                mask_value = tf.expand_dims(tf.to_float(prod), 1)
                mask_output.append(mask_value)

        mask_output = tf.concat(mask_output, 1)
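        # mask_output: (batch_size, input_n_frames - 1) float mask, 1.0 at
        # steps the detector marked as non-boundary; consumed by the masked
        # attention layer below.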

        # stack per-step outputs into (batch_size, input_n_frames - 1, lstm_dim)
        self.v_first_lstm_output = tf.stack(img_first_outputs, axis=1)
        self.v_second_lstm_output = tf.stack(img_second_outputs, axis=1)
        self.v_first_lstm_output = tf.contrib.layers.dropout(
            self.v_first_lstm_output,
            self.dropout_prob,
            is_training=self.is_training)
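
        # Two question-conditioned attention summaries: one over the
        # frame-level outputs and one over the segment-level outputs (the
        # latter presumably restricted to non-boundary steps via mask_output).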

        self.v_first_attention_output, self.first_attention_score = layers.matrix_attention_layer(
            self.v_first_lstm_output, self.q_last_state, self.attention_dim,
            'v_first_attention')
        self.v_second_attention_output, self.second_attention_score = layers.mask_matrix_attention_layer(
            self.v_second_lstm_output, self.q_last_state, self.attention_dim,
            mask_output, 'v_second_attention')

        concat_output = tf.concat([
            self.q_last_state, self.v_first_attention_output,
            self.v_second_attention_output
        ],
                                  axis=1)

        # decoder

        # output -> first_atten
        # self.decoder_cell = tf.contrib.rnn.BasicLSTMCell(self.decode_dim)
        self.decoder_cell = tf.contrib.rnn.GRUCell(self.decode_dim)

        with tf.variable_scope('linear'):
            # alternative: initializer=tf.random_normal_initializer(stddev=0.03)
            decoder_input_W = tf.get_variable(
                'w',
                shape=[concat_output.shape[1], self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            decoder_input_b = tf.get_variable(
                'b',
                shape=[self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.decoder_input = tf.matmul(
                concat_output,
                decoder_input_W) + decoder_input_b  # [None, decode_dim]
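
        # decoder_input projects the fused question/video context into the
        # decoder space; presumably it seeds the GRU decoder in
        # generate_answer_on_training/testing.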

        # answer word prediction: decode_dim -> n_words logits
        self.embed_word_W = tf.Variable(tf.random_uniform(
            [self.decode_dim, self.n_words], -0.1, 0.1),
                                        name='embed_word_W')
        self.embed_word_b = tf.Variable(tf.random_uniform([self.n_words], -0.1,
                                                          0.1),
                                        name='embed_word_b')

        # word dim -> decode_dim
        self.word_to_lstm_w = tf.Variable(tf.random_uniform(
            [self.input_ques_dim, self.decode_dim], -0.1, 0.1),
                                          name='word_to_lstm_W')
        self.word_to_lstm_b = tf.Variable(tf.random_uniform([self.decode_dim],
                                                            -0.1, 0.1),
                                          name='word_to_lstm_b')

        # decoder attention layer
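        # (additive, Bahdanau-style: presumably scored as
        #  e_t = attention_a^T tanh(W_q q + W_x x_t + W_h h + attention_b),
        #  with attention_to_decoder mapping the attended vector into the
        #  decoder space)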
        with tf.variable_scope('decoder_attention'):
            self.attention_w_q = tf.get_variable(
                'attention_w_q',
                shape=[self.lstm_dim, self.attention_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_w_x = tf.get_variable(
                'attention_w_x',
                shape=[self.lstm_dim, self.attention_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_w_h = tf.get_variable(
                'attention_w_h',
                shape=[self.decode_dim, self.attention_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_b = tf.get_variable(
                'attention_b',
                shape=[self.attention_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_a = tf.get_variable(
                'attention_a',
                shape=[self.attention_dim, 1],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_to_decoder = tf.get_variable(
                'attention_to_decoder',
                shape=[self.lstm_dim, self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
        # decoder
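        # (hand-rolled GRU gates; the decode_dim * 3 input width suggests the
        # gate input concatenates three decode_dim-sized vectors, e.g.
        # previous word, context, and attended features)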
        with tf.variable_scope('decoder'):
            self.decoder_r = tf.get_variable(
                'decoder_r',
                shape=[self.decode_dim * 3, self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.decoder_z = tf.get_variable(
                'decoder_z',
                shape=[self.decode_dim * 3, self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.decoder_w = tf.get_variable(
                'decoder_w',
                shape=[self.decode_dim * 3, self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())

        # embedding layer
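        # (pretrained vectors loaded as a non-trainable tf.constant)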
        embeddings = load_file(self.params['word_embedding'])
        self.Wemb = tf.constant(embeddings, dtype=tf.float32)

        # build the training- and testing-time answer-generation branches
        answer_train, train_loss = self.generate_answer_on_training()
        answer_test, test_loss = self.generate_answer_on_testing()

        # final
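        # (L2 penalty over all trainable variables; Wemb is a constant and is
        # not penalized)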
        variables = tf.trainable_variables()
        regularization_cost = tf.add_n(
            [tf.nn.l2_loss(v) for v in variables])
        self.answer_word_train = answer_train
        self.train_loss = train_loss + self.regularization_beta * regularization_cost

        self.answer_word_test = answer_test
        self.test_loss = test_loss + self.regularization_beta * regularization_cost
        tf.summary.scalar('training_cross_entropy', self.train_loss)
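
    # Second variant: single-layer video LSTM with word-level (local) and
    # sentence-level (global) question-guided attention. Note that this
    # redefinition shadows the method above if both are kept in one class.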
    def build_train_proc(self):
        # input layer (batch_size, n_steps, input_dim)
        self.input_q = tf.placeholder(
            tf.float32, [None, self.max_n_q_words, self.input_ques_dim])
        self.input_q_len = tf.placeholder(tf.int32, [None])
        self.input_x = tf.placeholder(
            tf.float32, [None, self.input_n_frames, self.input_frame_dim])
        self.input_x_len = tf.placeholder(tf.int32, [None])
        self.y = tf.placeholder(tf.int32, [None, self.max_n_a_words])
        self.y_mask = tf.placeholder(tf.float32, [None, self.max_n_a_words])
        self.ans_vec = tf.placeholder(
            tf.float32, [None, self.max_n_a_words, self.input_ques_dim])
        self.batch_size = tf.placeholder(tf.int32, [])
        self.is_training = tf.placeholder(tf.bool)
        self.reward = tf.placeholder(tf.float32, [None])

        # video LSTM encoder over (batch_size, input_n_frames, input_frame_dim)
        input_x = tf.contrib.layers.dropout(self.input_x,
                                            self.dropout_prob,
                                            is_training=self.is_training)
        v_lstm_output, _ = layers.dynamic_origin_lstm_layer(
            input_x, self.lstm_dim, 'v_lstm', input_len=self.input_x_len)
        v_lstm_output = tf.contrib.layers.dropout(v_lstm_output,
                                                  self.dropout_prob,
                                                  is_training=self.is_training)

        # question LSTM encoder over (batch_size, max_n_q_words, input_ques_dim)
        input_q = tf.contrib.layers.dropout(self.input_q,
                                            self.dropout_prob,
                                            is_training=self.is_training)
        q_lstm_output, q_lstm_state = layers.dynamic_origin_lstm_layer(
            input_q, self.lstm_dim, 'q_lstm', input_len=self.input_q_len)
        q_lstm_output = tf.contrib.layers.dropout(q_lstm_output,
                                                  self.dropout_prob,
                                                  is_training=self.is_training)
        q_last_state = tf.contrib.layers.dropout(q_lstm_state[1],
                                                 self.dropout_prob,
                                                 is_training=self.is_training)

        # local attention: each question-word state attends over the video
        # states; global attention: the final question state attends over them
        v_first_attention_output, first_attention_score_list = layers.collective_matrix_attention_layer(
            v_lstm_output,
            q_lstm_output,
            self.attention_dim,
            'v_first_local_attention',
            context_len=self.input_q_len,
            use_maxpooling=False)
        v_global_attention_output, first_attention_score = layers.matrix_attention_layer(
            v_lstm_output, q_last_state, self.attention_dim,
            'v_global_attention')

        # video attention lstm
        v_input_att = tf.contrib.layers.dropout(v_first_attention_output,
                                                self.dropout_prob,
                                                is_training=self.is_training)
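
        # LSTM over the word-aligned attended video features; its time axis is
        # the question length, hence input_len=self.input_q_len.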
        v_att_lstm_output, _ = layers.dynamic_origin_lstm_layer(
            v_input_att,
            self.lstm_dim,
            'v_att_lstm',
            input_len=self.input_q_len)
        v_att_lstm_output = tf.contrib.layers.dropout(
            v_att_lstm_output, self.dropout_prob, is_training=self.is_training)

        #att_last_state = tf.contrib.layers.dropout(att_lstm_state[1], self.dropout_prob, is_training=self.is_training)

        # second attention over the attention-LSTM states -> (batch_size, lstm_dim)
        v_second_attention_output, second_attention_score = layers.matrix_attention_layer(
            v_att_lstm_output, q_last_state, self.attention_dim,
            'v_second_local_attention')
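
        # Fuse the per-word local attention maps, weighting each word's map by
        # its second-stage score, into one frame-level attention distribution.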

        self.attention = tf.reduce_sum(
            tf.multiply(first_attention_score_list,
                        tf.expand_dims(second_attention_score, 2)), 1)

        # dot product
        #qv_dot = tf.multiply(q_last_state, v_last_state)

        # concatenation
        concat_output = tf.concat([
            q_last_state, v_global_attention_output, v_second_attention_output
        ],
                                  axis=1)
        self.v_first_lstm_output = v_lstm_output
        self.q_last_state = q_last_state

        # decoder

        # output -> first_atten
        # self.decoder_cell = tf.contrib.rnn.BasicLSTMCell(self.decode_dim)
        self.decoder_cell = tf.contrib.rnn.GRUCell(self.decode_dim)

        with tf.variable_scope('linear'):
            # alternative: initializer=tf.random_normal_initializer(stddev=0.03)
            decoder_input_W = tf.get_variable(
                'w',
                shape=[concat_output.shape[1], self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            decoder_input_b = tf.get_variable(
                'b',
                shape=[self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.decoder_input = tf.matmul(
                concat_output,
                decoder_input_W) + decoder_input_b  # [None, decode_dim]

        # answer word prediction: decode_dim -> n_words logits
        self.embed_word_W = tf.Variable(tf.random_uniform(
            [self.decode_dim, self.n_words], -0.1, 0.1),
                                        name='embed_word_W')
        self.embed_word_b = tf.Variable(tf.random_uniform([self.n_words], -0.1,
                                                          0.1),
                                        name='embed_word_b')

        # word dim -> decode_dim
        self.word_to_lstm_w = tf.Variable(tf.random_uniform(
            [self.input_ques_dim, self.decode_dim], -0.1, 0.1),
                                          name='word_to_lstm_W')
        self.word_to_lstm_b = tf.Variable(tf.random_uniform([self.decode_dim],
                                                            -0.1, 0.1),
                                          name='word_to_lstm_b')

        # decoder attention layer
        with tf.variable_scope('decoder_attention'):
            self.attention_w_q = tf.get_variable(
                'attention_w_q',
                shape=[self.lstm_dim, self.attention_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_w_x = tf.get_variable(
                'attention_w_x',
                shape=[self.lstm_dim, self.attention_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_w_h = tf.get_variable(
                'attention_w_h',
                shape=[self.decode_dim, self.attention_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_b = tf.get_variable(
                'attention_b',
                shape=[self.attention_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_a = tf.get_variable(
                'attention_a',
                shape=[self.attention_dim, 1],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_to_decoder = tf.get_variable(
                'attention_to_decoder',
                shape=[self.lstm_dim, self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
        # decoder
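        # (as in the first variant, hand-rolled GRU gates, but with a
        # decode_dim * 2 input width, i.e. two concatenated decode_dim-sized
        # vectors)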
        with tf.variable_scope('decoder'):
            self.decoder_r = tf.get_variable(
                'decoder_r',
                shape=[self.decode_dim * 2, self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.decoder_z = tf.get_variable(
                'decoder_z',
                shape=[self.decode_dim * 2, self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.decoder_w = tf.get_variable(
                'decoder_w',
                shape=[self.decode_dim * 2, self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())

        # embedding layer
        embeddings = load_file(self.params['word_embedding'])
        self.Wemb = tf.constant(embeddings, dtype=tf.float32)

        # build the training- and testing-time answer-generation branches
        answer_train, train_loss = self.generate_answer_on_training()
        answer_test, test_loss = self.generate_answer_on_testing()

        # final
        variables = tf.trainable_variables()
        regularization_cost = tf.add_n(
            [tf.nn.l2_loss(v) for v in variables])
        self.answer_word_train = answer_train
        self.train_loss = train_loss + self.regularization_beta * regularization_cost

        self.answer_word_test = answer_test
        self.test_loss = test_loss + self.regularization_beta * regularization_cost
        tf.summary.scalar('training_cross_entropy', self.train_loss)
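
    # Third variant: bidirectional video encoder with a single question-guided
    # attention hop; again, this redefinition shadows the previous one if both
    # are kept in one class.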
    def build_train_proc(self):
        # input layer (batch_size, n_steps, input_dim)
        self.input_q = tf.placeholder(
            tf.float32, [None, self.max_n_q_words, self.input_ques_dim])
        self.input_q_len = tf.placeholder(tf.int32, [None])
        self.input_x = tf.placeholder(
            tf.float32, [None, self.input_n_frames, self.input_frame_dim])
        self.input_x_len = tf.placeholder(tf.int32, [None])
        self.y = tf.placeholder(tf.int32, [None, self.max_n_a_words])
        self.y_mask = tf.placeholder(tf.float32, [None, self.max_n_a_words])
        self.ans_vec = tf.placeholder(
            tf.float32, [None, self.max_n_a_words, self.input_ques_dim])
        self.batch_size = tf.placeholder(tf.int32, [])
        self.is_training = tf.placeholder(tf.bool)
        self.reward = tf.placeholder(tf.float32, [None])

        # bidirectional video encoder
        lstm_dim = self.lstm_dim
        input_x = tf.contrib.layers.dropout(self.input_x,
                                            self.dropout_prob,
                                            is_training=self.is_training)
        # integer division: lstm_dim / 2 is a float in Python 3, which is not
        # a valid layer width
        v_lstm_out, v_lstm_state = layers.dynamic_origin_bilstm_layer(
            input_x, lstm_dim // 2, 'v_lstm_bi', input_len=self.input_x_len)

        # question encoder
        _, q_lstm_state = layers.dynamic_origin_lstm_layer(
            self.input_q, lstm_dim, 'q_lstm', input_len=self.input_q_len)
        q_last_state = tf.contrib.layers.dropout(q_lstm_state[1],
                                                 self.dropout_prob,
                                                 is_training=self.is_training)

        v_first_attention_output, _ = layers.matrix_attention_layer(
            v_lstm_out, q_last_state, self.attention_dim,
            'v_first_local_attention')
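
        # The simplest fusion of the three variants: one attention hop over
        # the bidirectional video states, concatenated with the question
        # summary below.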

        concat_output = tf.concat([v_first_attention_output, q_last_state],
                                  axis=1)
        self.v_first_lstm_output = v_lstm_out
        self.q_last_state = q_last_state

        # decoder

        # output -> first_atten
        # self.decoder_cell = tf.contrib.rnn.BasicLSTMCell(self.decode_dim)
        self.decoder_cell = tf.contrib.rnn.GRUCell(self.decode_dim)

        with tf.variable_scope('linear'):
            # alternative: initializer=tf.random_normal_initializer(stddev=0.03)
            decoder_input_W = tf.get_variable(
                'w',
                shape=[concat_output.shape[1], self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            decoder_input_b = tf.get_variable(
                'b',
                shape=[self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.decoder_input = tf.matmul(
                concat_output,
                decoder_input_W) + decoder_input_b  # [None, decode_dim]

        # answer word prediction: decode_dim -> n_words logits
        self.embed_word_W = tf.Variable(tf.random_uniform(
            [self.decode_dim, self.n_words], -0.1, 0.1),
                                        name='embed_word_W')
        self.embed_word_b = tf.Variable(tf.random_uniform([self.n_words], -0.1,
                                                          0.1),
                                        name='embed_word_b')

        # word dim -> decode_dim
        self.word_to_lstm_w = tf.Variable(tf.random_uniform(
            [self.input_ques_dim, self.decode_dim], -0.1, 0.1),
                                          name='word_to_lstm_W')
        self.word_to_lstm_b = tf.Variable(tf.random_uniform([self.decode_dim],
                                                            -0.1, 0.1),
                                          name='word_to_lstm_b')

        # decoder attention layer
        with tf.variable_scope('decoder_attention'):
            self.attention_w_q = tf.get_variable(
                'attention_w_q',
                shape=[self.lstm_dim, self.attention_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_w_x = tf.get_variable(
                'attention_w_x',
                shape=[self.lstm_dim, self.attention_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_w_h = tf.get_variable(
                'attention_w_h',
                shape=[self.decode_dim, self.attention_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_b = tf.get_variable(
                'attention_b',
                shape=[self.attention_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_a = tf.get_variable(
                'attention_a',
                shape=[self.attention_dim, 1],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.attention_to_decoder = tf.get_variable(
                'attention_to_decoder',
                shape=[self.lstm_dim, self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
        # decoder
        with tf.variable_scope('decoder'):
            self.decoder_r = tf.get_variable(
                'decoder_r',
                shape=[self.decode_dim * 2, self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.decoder_z = tf.get_variable(
                'decoder_z',
                shape=[self.decode_dim * 2, self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            self.decoder_w = tf.get_variable(
                'decoder_w',
                shape=[self.decode_dim * 2, self.decode_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())

        # embedding layer
        embeddings = load_file(self.params['word_embedding'])
        self.Wemb = tf.constant(embeddings, dtype=tf.float32)

        # build the training- and testing-time answer-generation branches
        answer_train, train_loss = self.generate_answer_on_training()
        answer_test, test_loss = self.generate_answer_on_testing()

        # final
        variables = tf.trainable_variables()
        regularization_cost = tf.add_n(
            [tf.nn.l2_loss(v) for v in variables])
        self.answer_word_train = answer_train
        self.train_loss = train_loss + self.regularization_beta * regularization_cost

        self.answer_word_test = answer_test
        self.test_loss = test_loss + self.regularization_beta * regularization_cost
        tf.summary.scalar('training_cross_entropy', self.train_loss)
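
    # Hypothetical usage sketch (names such as Model, params, train_op, and
    # the *_batch arrays are illustrative, not part of this file):
    #
    #   model = Model(params)
    #   model.build_train_proc()
    #   train_op = tf.train.AdamOptimizer(1e-4).minimize(model.train_loss)
    #   with tf.Session() as sess:
    #       sess.run(tf.global_variables_initializer())
    #       _, loss = sess.run(
    #           [train_op, model.train_loss],
    #           feed_dict={model.input_q: q_batch,
    #                      model.input_q_len: q_lens,
    #                      model.input_x: frame_batch,
    #                      model.input_x_len: frame_lens,
    #                      model.y: answer_ids,
    #                      model.y_mask: answer_mask,
    #                      model.ans_vec: answer_vecs,
    #                      model.batch_size: len(q_batch),
    #                      model.is_training: True})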