示例#1
0
    def generate(self):
        """Build the generator decoder and return its L2-normalised outputs.

        ``self.input_data`` is embedded with ``self.embedding``, split into
        ``args.seq_length`` per-step tensors, and each step is L2-normalised
        along axis 1 before being passed to ``seq2seq.rnn_decoder``.  The
        decoder runs inside the ``'GEN'`` variable scope, opened with
        ``reuse=self.has_init_seq2seq``; the flag is set to ``True`` once the
        scope has been entered.

        Returns:
            A list of decoder outputs, each L2-normalised along axis 1.  The
            same list is also stored in ``self.outputs``.

        Raises:
            Exception: if ``args.num_layers`` is neither 1 nor 2.
        """
        # Validate first so a bad configuration neither adds ops to the graph
        # nor flips the variable-reuse flag.
        num_layers = self.args.num_layers
        if num_layers not in (1, 2):
            raise Exception(
                'Unsupported number of layers. Use 1 or 2 layers for now..'
            )

        step_inputs = tf.split(
            1, self.args.seq_length,
            tf.nn.embedding_lookup(self.embedding, self.input_data))
        step_inputs = [tf.squeeze(step, [1]) for step in step_inputs]
        # Real lists rather than ``map``: on Python 3 ``map`` returns a
        # one-shot iterator, which would be exhausted after a single pass.
        step_inputs = [tf.nn.l2_normalize(step, 1) for step in step_inputs]

        def loop(prev, i):
            # Feed the previous step's output straight back in as the input.
            return prev

        # One initial state per layer.
        initial_states = [self.initial_state1]
        if num_layers == 2:
            initial_states.append(self.initial_state2)

        with tf.variable_scope('GEN', reuse=self.has_init_seq2seq) as scope:
            self.has_init_seq2seq = True
            outputs, last_state = seq2seq.rnn_decoder(
                step_inputs, initial_states,
                self.cell,
                loop_function=loop,
                scope=scope)
            outputs = [tf.nn.l2_normalize(o, 1) for o in outputs]
        self.outputs = outputs
        return outputs
 def rnn_decode(self, cell, enc_memory):
     """Decode from ``enc_memory`` and project the outputs to feature space.

     The decoder is fed ``self.seq_len`` all-zero "GO" inputs of shape
     ``[batch_size, feat_dim]``.  The first layer is a
     ``seq2seq.rnn_decoder`` initialised with ``enc_memory``; for every
     ``i`` in ``range(2, self.stack_num)`` one more ``static_rnn`` layer is
     stacked on top of the previous layer's outputs.  Each layer uses its
     own deep copy of ``cell``.

     Args:
         cell: RNN cell used as the template for every decoder layer.
         enc_memory: initial state handed to the first decoder layer.

     Returns:
         The projected outputs ``W_p . dec^T + b_p``, a tensor of shape
         ``[feat_dim, seq_len * batch_size]``.
     """
     dec_inp = tf.unstack(
         tf.zeros([self.seq_len, self.batch_size, self.feat_dim],
                  dtype=tf.float32,
                  name="GO"))
     memory_dim = self.p_memory_dim + self.s_memory_dim
     with tf.variable_scope("stack_rnn_decoder"):
         dec_cell = copy.deepcopy(cell)
         dec_output, dec_state = seq2seq.rnn_decoder(
             dec_inp, enc_memory, dec_cell)
         for i in range(2, self.stack_num):
             with tf.variable_scope("stack_rnn_decoder_" + str(i)):
                 dec_cell = copy.deepcopy(cell)
                 dec_output, dec_state = core_rnn.static_rnn(
                     dec_cell, dec_output, dtype=dtypes.float32)
         # Flatten the per-step outputs to [seq_len * batch, memory_dim] and
         # transpose so that the projection below is a plain W . X product.
         dec_reshape = tf.transpose(
             tf.reshape(dec_output,
                        (self.seq_len * self.batch_size, memory_dim)))
         W_p = tf.get_variable("output_proj_w", [self.feat_dim, memory_dim])
         b_p = tf.get_variable("output_proj_b",
                               shape=(self.feat_dim),
                               initializer=tf.constant_initializer(0.0))
         # Add the bias as a [feat_dim, 1] column and let broadcasting expand
         # it, instead of stacking seq_len * batch_size copies in the graph.
         dec_proj_outputs = (tf.matmul(W_p, dec_reshape) +
                             tf.reshape(b_p, [-1, 1]))
     return dec_proj_outputs
示例#3
0
def basic_rnn_seq2seq_with_bottle_memory(encoder_inputs,
                                         decoder_inputs,
                                         cell,
                                         dtype=dtypes.float32,
                                         scope=None):
    """Encode, then decode, with one RNN cell; also return the encoder state.

    The encoder is a ``static_rnn`` over ``encoder_inputs``; its final state
    initialises an ``rnn_decoder`` over ``decoder_inputs``.  Both are built
    with the same ``cell`` object inside a single variable scope.

    Args:
      encoder_inputs: A list of 2D Tensors [batch_size x input_size]
      decoder_inputs: A list of 2D Tensors [batch_size x input_size]
      cell: core_rnn_cell.RNNCell defining the cell function and size.
      dtype: The dtype of the initial state of the RNN cell (default:
        tf.float32).
      scope: VariableScope for the created subgraph; default:
        "basic_rnn_seq2seq"

    Returns:
      A tuple ``(outputs, enc_state, dec_state)``:

      outputs: the decoder outputs, one per element of decoder_inputs.
      enc_state: the encoder's state after the final time-step.
      dec_state: the decoder's state after the final time-step.
    """
    scope_name = scope or "basic_rnn_seq2seq"
    with variable_scope.variable_scope(scope_name):
        # Only the final encoder state is kept; per-step outputs are unused.
        _enc_outputs, enc_state = core_rnn.static_rnn(
            cell, encoder_inputs, dtype=dtype)
        outputs, dec_state = seq2seq.rnn_decoder(
            decoder_inputs, enc_state, cell)
    return outputs, enc_state, dec_state
示例#4
0
    def __init__(self, args, infer=False):
        """Build the RNN language-model graph (inputs, loss and train op).

        Args:
            args: configuration object.  This constructor reads ``model``,
                ``rnn_size``, ``num_layers``, ``batch_size``, ``seq_length``,
                ``vocab_size`` and ``grad_clip``.  NOTE: when ``infer`` is
                true, ``batch_size`` and ``seq_length`` are overwritten with
                1 on the caller's object.
            infer: when true, the decoder is given a loop function that
                replaces each input with the embedding of the arg-max symbol
                predicted from the previous output.

        Raises:
            Exception: if ``args.model`` is not 'rnn', 'gru' or 'lstm'.
        """
        self.args = args
        if infer:
            args.batch_size = 1
            args.seq_length = 1

        # Only BasicLSTMCell accepts ``state_is_tuple``; handing it to
        # BasicRNNCell or GRUCell raises TypeError, so pass it selectively.
        cell_kwargs = {}
        if args.model == 'rnn':
            cell_fn = core_rnn_cell.BasicRNNCell
        elif args.model == 'gru':
            cell_fn = core_rnn_cell.GRUCell
        elif args.model == 'lstm':
            cell_fn = core_rnn_cell.BasicLSTMCell
            cell_kwargs['state_is_tuple'] = True
        else:
            raise Exception("model type not supported: {}".format(args.model))

        cell = cell_fn(args.rnn_size, **cell_kwargs)

        # NOTE(review): every layer is the same cell object; some TensorFlow
        # releases reject or weight-share this -- confirm for the version used.
        self.cell = cell = core_rnn_cell.MultiRNNCell(
            [cell] * args.num_layers, state_is_tuple=True)

        self.input_data = tf.placeholder(
            tf.int32, [args.batch_size, args.seq_length], name="input_data")
        self.targets = tf.placeholder(
            tf.int32, [args.batch_size, args.seq_length], name="targets")
        self.initial_state = cell.zero_state(args.batch_size, tf.float32)

        with tf.variable_scope('rnnlm'):
            softmax_w = tf.get_variable("softmax_w",
                                        [args.rnn_size, args.vocab_size])
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
            with tf.device("/cpu:0"):
                embedding = tf.get_variable("embedding",
                                            [args.vocab_size, args.rnn_size])
                # Look the inputs up once and reuse the tensor for both the
                # debug output and the split below.
                embedded = tf.nn.embedding_lookup(embedding, self.input_data)
                # Single-argument print(...) is valid on Python 2 and 3 alike.
                print("seq_length =  %s embedding_lookup =  %s" %
                      (args.seq_length, embedded))
                inputs = tf.split(embedded, args.seq_length, 1)
                print("inputs 1: %s" % (inputs,))
                inputs = [tf.squeeze(input_, [1]) for input_ in inputs]
                print("inputs 2: %s" % (inputs,))

        def loop(prev, _):
            # Project the previous output to vocabulary logits, take the
            # arg-max symbol (no gradient) and feed its embedding back in.
            prev = tf.matmul(prev, softmax_w) + softmax_b
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embedding, prev_symbol)

        outputs, last_state = seq2seq.rnn_decoder(
            inputs,
            self.initial_state,
            cell,
            loop_function=loop if infer else None,
            scope='rnnlm')
        # Concatenate the steps side by side, then flatten to one row per
        # (batch item, time step) pair.
        output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])
        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits, name="prob_results")
        loss = seq2seq.sequence_loss_by_example(
            [self.logits],
            [tf.reshape(self.targets, [-1])],
            [tf.ones([args.batch_size * args.seq_length])],
            args.vocab_size)
        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
        self.final_state = last_state
        # Non-trainable learning rate, starting at 0.0.
        self.lr = tf.Variable(0.0, trainable=False, name="LR_")
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                          args.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
示例#5
0
 def discriminate_wv(self, input_data_wv):
     """Run the discriminator RNN over ``input_data_wv`` and classify it.

     The decoder is built inside the ``'DISC'`` variable scope, opened with
     ``reuse=self.has_init_seq2seq``; the flag is switched on as soon as the
     scope has been entered.

     Args:
         input_data_wv: the decoder inputs passed to ``seq2seq.rnn_decoder``.

     Returns:
         The last decoder output multiplied by ``self.fc_layer``.
     """
     reuse_vars = self.has_init_seq2seq
     with tf.variable_scope('DISC', reuse=reuse_vars) as disc_scope:
         self.has_init_seq2seq = True
         step_outputs, _final_state = seq2seq.rnn_decoder(
             input_data_wv, self.initial_state, self.cell, scope=disc_scope)
         # Only the final step's output is used for classification.
         last_output = step_outputs[-1]
         predicted_classes_wv = tf.matmul(last_output, self.fc_layer)
     return predicted_classes_wv
示例#6
0
def stack_rnn_seq2seq_with_bottle_memory(encoder_inputs,
                                         decoder_inputs,
                                         cell,
                                         stack_num=3,
                                         dtype=dtypes.float32,
                                         scope=None):
    """Stacking RNN seq2seq model with bottleneck.

    The first encoder layer is a ``static_rnn`` over ``encoder_inputs`` and
    the first decoder layer an ``rnn_decoder`` initialised with the final
    encoder state.  For every ``i`` in ``range(2, stack_num)`` one further
    ``static_rnn`` layer is stacked on each side, so each side has
    ``stack_num - 1`` layers in total (never fewer than one).  Every layer
    uses its own shallow copy of ``cell``.

    Args:
      encoder_inputs: A list of 2D Tensors [batch_size x input_size]
      decoder_inputs: A list of 2D Tensors [batch_size x input_size]
      cell: core_rnn_cell.RNNCell defining the cell function and size.
      stack_num: the number to stack in seq2seq model
      dtype: The dtype of the initial state of the RNN cell (default:
        tf.float32)
      scope: optional variable scope.  When omitted, each layer gets its own
        default scope ("stack_rnn_enc_1", "stack_rnn_encoder_<i>",
        "stack_rnn_dec_1", "stack_rnn_decoder_<i>").  When given, the first
        encoder and decoder layers are built directly in it and the extra
        layers in per-layer sub-scopes of it.

    Returns:
      outputs: A list of the same length as decoder_inputs of 2D Tensors with
        shape [batch_size x output_size] containing the generated outputs.
      enc_state: The state of the last encoder layer in the final time-step.
      dec_state: The state of the last decoder layer in the final time-step.
    """

    def _extra_layer(layer_name, layer_inputs):
        """Run one additional static-RNN layer in its own variable scope."""

        def _build():
            with variable_scope.variable_scope(layer_name):
                return core_rnn.static_rnn(copy.copy(cell),
                                           layer_inputs,
                                           dtype=dtype)

        if not scope:
            return _build()
        # Nest under the caller's scope: re-entering ``scope`` itself for
        # every layer would make the layers collide on variable names.
        with variable_scope.variable_scope(scope):
            return _build()

    with variable_scope.variable_scope(scope or "stack_rnn_enc_1"):
        enc_cell = copy.copy(cell)
        enc_output, enc_state = core_rnn.static_rnn(enc_cell,
                                                    encoder_inputs,
                                                    dtype=dtype)
    for i in range(2, stack_num):
        enc_output, enc_state = _extra_layer("stack_rnn_encoder_" + str(i),
                                             enc_output)

    with variable_scope.variable_scope(scope or "stack_rnn_dec_1"):
        dec_cell = copy.copy(cell)
        dec_output, dec_state = seq2seq.rnn_decoder(decoder_inputs, enc_state,
                                                    dec_cell)
    for i in range(2, stack_num):
        dec_output, dec_state = _extra_layer("stack_rnn_decoder_" + str(i),
                                             dec_output)

    return dec_output, enc_state, dec_state
示例#7
0
  def testRNNDecoder(self):
    """rnn_decoder returns one projected output per input plus a final state."""
    with self.test_session() as sess:
      half_init = init_ops.constant_initializer(0.5)
      with variable_scope.variable_scope("root", initializer=half_init):
        # Encoder: two steps through a 2-unit GRU; only its state is kept.
        enc_inputs = [constant_op.constant(0.5, shape=[2, 2])] * 2
        _, enc_state = rnn.static_rnn(
            rnn_cell.GRUCell(2), enc_inputs, dtype=dtypes.float32)

        # Decoder: three steps through a 2-unit GRU projected to width 4.
        dec_inputs = [constant_op.constant(0.4, shape=[2, 2])] * 3
        dec_cell = core_rnn_cell.OutputProjectionWrapper(
            rnn_cell.GRUCell(2), 4)
        dec_outputs, dec_state = seq2seq_lib.rnn_decoder(
            dec_inputs, enc_state, dec_cell)
        sess.run([variables.global_variables_initializer()])

        output_values = sess.run(dec_outputs)
        self.assertEqual(3, len(output_values))
        self.assertEqual((2, 4), output_values[0].shape)

        state_values = sess.run([dec_state])
        self.assertEqual((2, 2), state_values[0].shape)
示例#8
0
  def testRNNDecoder(self):
    """Check the output count, output shape and state shape of rnn_decoder."""
    with self.test_session() as sess:
      initializer = init_ops.constant_initializer(0.5)
      with variable_scope.variable_scope("root", initializer=initializer):
        # Two encoder steps on a GRU with 2 units supply the initial state.
        encoder_steps = [constant_op.constant(0.5, shape=[2, 2])] * 2
        _, enc_state = rnn.static_rnn(
            rnn_cell.GRUCell(2), encoder_steps, dtype=dtypes.float32)

        # Three decoder steps on a GRU whose output is projected to 4 units.
        decoder_steps = [constant_op.constant(0.4, shape=[2, 2])] * 3
        projected_cell = core_rnn_cell.OutputProjectionWrapper(
            rnn_cell.GRUCell(2), 4)
        decoded, final_state = seq2seq_lib.rnn_decoder(
            decoder_steps, enc_state, projected_cell)
        sess.run([variables.global_variables_initializer()])

        decoded_values = sess.run(decoded)
        self.assertEqual(3, len(decoded_values))
        self.assertEqual((2, 4), decoded_values[0].shape)

        final_state_values = sess.run([final_state])
        self.assertEqual((2, 2), final_state_values[0].shape)
示例#9
0
    def __call__(self, img_ph, location_network, retina_sensor,
                 glimpse_network):
        """Unroll the LSTM over a sequence of glimpses of ``img_ph``.

        The first input is a glimpse taken at a uniformly random location in
        [-1, 1).  For the remaining inputs a loop function is handed to
        ``rnn_decoder``: it predicts a location from the previous output,
        crops patches there with ``retina_sensor`` and encodes them with
        ``glimpse_network``.

        Args:
            img_ph: image tensor passed to ``retina_sensor``.
            location_network: callable mapping an LSTM output to
                ``(location, mean)``.
            retina_sensor: callable ``(image, location) -> patches``.
            glimpse_network: callable ``(patches, location) -> glimpse``.

        Returns:
            ``(loc_ts, mean_ts, h_ts)``: the locations and means collected by
            the loop function, and the outputs returned by ``rnn_decoder``.
        """
        lstm = BasicLSTMCell(self.hidden_size)

        # Filled in as a side effect each time the loop function is invoked.
        sampled_locs = []
        loc_means = []

        def next_glimpse(h_prev, _):
            """Turn the previous output into the next glimpse input."""
            location, mean = location_network(h_prev)
            sampled_locs.append(location)
            loc_means.append(mean)
            patches = retina_sensor(img_ph, location)
            return glimpse_network(patches, location)

        start_state = lstm.zero_state(self.batch_size, tf.float32)

        # Initial glimpse at a random location.
        start_loc = tf.random_uniform((self.batch_size, self.loc_dim),
                                      minval=-1,
                                      maxval=1)
        start_patches = retina_sensor(img_ph, start_loc)
        start_glimpse = glimpse_network(start_patches, start_loc)

        # The zeros are placeholders for the later steps; presumably the loop
        # function's result is used in their place -- confirm in rnn_decoder.
        step_inputs = [start_glimpse] + [0] * self.num_glimpses

        hidden_outputs, _ = rnn_decoder(step_inputs,
                                        start_state,
                                        lstm,
                                        loop_function=next_glimpse)

        return sampled_locs, loc_means, hidden_outputs
示例#10
0
    def __init__(self, args, infer=False):
        """Assemble the RNN language-model graph, its loss and its train op.

        Args:
            args: configuration object; ``model``, ``rnn_size``,
                ``num_layers``, ``batch_size``, ``seq_length``,
                ``vocab_size`` and ``grad_clip`` are read.  With ``infer``
                set, ``batch_size`` and ``seq_length`` are overwritten with 1
                on this object.
            infer: when true, the decoder gets a loop function that feeds the
                embedding of the arg-max prediction back in as the next input.

        Raises:
            Exception: if ``args.model`` is not one of 'rnn', 'gru', 'lstm',
                'gridlstm' or 'gridgru'.
        """
        self.args = args
        if infer:
            args.batch_size = 1
            args.seq_length = 1

        # Choose the cell constructor and any extra keyword arguments.
        cell_kwargs = {}
        if args.model == 'rnn':
            make_cell = rnn_cell.BasicRNNCell
        elif args.model == 'gru':
            make_cell = rnn_cell.GRUCell
        elif args.model == 'lstm':
            make_cell = rnn_cell.BasicLSTMCell
        elif args.model == 'gridlstm':
            make_cell = grid_rnn.Grid2LSTMCell
            cell_kwargs['use_peepholes'] = True
            cell_kwargs['forget_bias'] = 1.0
        elif args.model == 'gridgru':
            make_cell = grid_rnn.Grid2GRUCell
        else:
            raise Exception("model type not supported: {}".format(args.model))

        base_cell = make_cell(args.rnn_size, **cell_kwargs)
        cell = rnn_cell.MultiRNNCell([base_cell] * args.num_layers)
        self.cell = cell

        self.input_data = tf.placeholder(
            tf.int32, [args.batch_size, args.seq_length])
        self.targets = tf.placeholder(
            tf.int32, [args.batch_size, args.seq_length])
        self.initial_state = cell.zero_state(args.batch_size, tf.float32)

        with tf.variable_scope('rnnlm'):
            softmax_w = tf.get_variable(
                "softmax_w", [args.rnn_size, args.vocab_size])
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
            with tf.device("/cpu:0"):
                embedding = tf.get_variable(
                    "embedding", [args.vocab_size, args.rnn_size])
                embedded = tf.nn.embedding_lookup(embedding, self.input_data)
                # One tensor per time step, with the length-1 axis removed.
                per_step = tf.split(axis=1,
                                    num_or_size_splits=args.seq_length,
                                    value=embedded)
                inputs = [tf.squeeze(step, [1]) for step in per_step]

        def feed_previous(prev, _):
            """Embed the arg-max symbol predicted from the previous output."""
            step_logits = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
            best_symbol = tf.stop_gradient(tf.argmax(step_logits, 1))
            return tf.nn.embedding_lookup(embedding, best_symbol)

        loop_function = feed_previous if infer else None
        outputs, last_state = seq2seq.rnn_decoder(inputs,
                                                  self.initial_state,
                                                  cell,
                                                  loop_function=loop_function,
                                                  scope='rnnlm')

        # Join the steps along axis 1, then flatten to rows of size rnn_size.
        joined = tf.concat(axis=1, values=outputs)
        output = tf.reshape(joined, [-1, args.rnn_size])
        self.logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
        self.probs = tf.nn.softmax(self.logits)

        flat_targets = tf.reshape(self.targets, [-1])
        unit_weights = tf.ones([args.batch_size * args.seq_length])
        loss = seq2seq.sequence_loss_by_example(
            [self.logits], [flat_targets], [unit_weights], args.vocab_size)
        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
        self.final_state = last_state

        # Non-trainable learning-rate variable, starting at 0.0.
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        raw_grads = tf.gradients(self.cost, tvars)
        grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
示例#11
0
    def r2rtdecoder(self):
        """
        Build an RNN encoder/decoder graph, train it, and dump the encoder's
        final hidden states for the test set to a CSV file.

        Graph: a ``dynamic_rnn`` encoder over ``self.x_embedding``; a decoder
        that is unrolled ``self._max_length`` times, one ``rnn_decoder`` step
        at a time, each step fed the previous step's output (the first step
        is fed the encoder output at the last valid time step); and a dense
        layer that maps every decoder output to ``self._class_num`` logits.

        A mask is created for the cost function. It has shape
        [batch, self._max_length] and is equal to 1 for all non-PAD time
        steps (where a prediction is made), and 0 for all PAD time steps
        (no pred -> no loss). The lower-triangular matrix it is gathered from
        is sized by ``self._max_length``.

        Training runs for up to ``self._epoch_num`` epochs. In the last epoch,
        or as soon as the summed epoch loss drops below 0.5, the test batches
        are evaluated, the hidden codes are written to
        ``toydata/covmat_hiddencode_split<emb_dim>.csv`` and training stops.
        Finally ``self.plot`` is called with the per-epoch loss and accuracy.

        Returns None.
        """

        # Row k of the lower-triangular matrix has k + 1 leading ones, so
        # gathering row (seqlen - 1) yields `seqlen` ones followed by zeros.
        lower_triangular_ones = tf.constant(np.tril(
            np.ones([self._max_length, self._max_length])),
                                            dtype=tf.float32)
        seqlen_mask = tf.slice(
            tf.gather(lower_triangular_ones, self.seqlen - 1), [0, 0],
            [self._batch_size2, self._max_length])

        # RNN
        state_size = self._emb_dim
        # NOTE(review): num_classes is assigned but never used below.
        num_classes = self._class_num

        cell = tf.contrib.rnn.BasicRNNCell(state_size)

        # A single learned initial state row, tiled across the batch.
        init_state = tf.get_variable('init_state', [1, state_size],
                                     initializer=tf.constant_initializer(0.0))
        init_state = tf.tile(init_state, [self._batch_size2, 1])

        rnn_outputs, final_state = tf.nn.dynamic_rnn(
            cell,
            self.x_embedding,
            sequence_length=self.seqlen,
            initial_state=init_state)

        y_reshaped = tf.reshape(self.y, [-1])
        """
        decoder

        use the last step output of encoder as the input
        """
        # en_last_output = self.last_relevant(rnn_outputs, self.seqlen)
        # Flat index, into the [batch * time, state_size] reshape below, of
        # the last valid time step of every sequence.
        idx = tf.range(self._batch_size2) * \
              tf.shape(rnn_outputs)[1] + (self.seqlen - 1)
        last_rnn_output = tf.gather(tf.reshape(rnn_outputs, [-1, state_size]),
                                    idx)

        # NOTE(review): the cell object is constructed under the 'decoder'
        # scope but only called under 'multi_decoder' below -- confirm which
        # scope its variables are expected to live in.
        with tf.variable_scope('decoder'):
            decoder_cell = tf.contrib.rnn.BasicRNNCell(self._emb_dim)
        dec_input = last_rnn_output
        dec_in_state = final_state
        dec_outputs = []
        # Unroll the decoder by hand: each iteration runs one rnn_decoder
        # step and feeds its output and state into the next iteration.
        # Variables are reused from the second iteration on.
        with tf.variable_scope('multi_decoder') as scope:
            for id in range(self._max_length):
                if id > 0:
                    scope.reuse_variables()
                dec_output, dec_out_state = seq2seq_lib.rnn_decoder(
                    [dec_input], dec_in_state, decoder_cell)
                # variable_scope.get_variable_scope().reuse_variables()
                dec_input = dec_output[0]
                dec_in_state = dec_out_state
                dec_outputs += dec_output

        # dec_outputs: a list of max_length per-step decoder outputs.
        # Concatenating along axis 0 stacks them step after step (time-major).
        dec_final_output = tf.concat(dec_outputs, axis=0)

        # Softmax layer
        # with tf.variable_scope('softmax'):
        # W = tf.get_variable('W', [state_size, num_classes])
        # b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))
        # weight = tf.Variable([self._emb_dim, self._class_num])

        W = tf.Variable(
            tf.truncated_normal([self._emb_dim, self._class_num], stddev=0.01))
        b = tf.Variable(tf.constant(0.1, shape=[
            self._class_num,
        ]))
        logits = tf.matmul(dec_final_output, W) + b

        # The rows of `logits` are time-major because of the concat above,
        # while y_reshaped is a plain flatten of self.y; reorder the rows to
        # [batch, time] order before flattening again.
        l1 = tf.reshape(logits, [self._max_length, -1, self._class_num])
        l2 = tf.transpose(l1, [1, 0, 2])
        logits = tf.reshape(l2, [-1, self._class_num])

        preds = tf.nn.softmax(logits)
        final_output = tf.argmax(preds, 1)
        """
        Accuracy
        """
        # To calculate the number of correctly predicted value(we want to count
        # padded steps as incorrect)
        correct = tf.cast(tf.equal(tf.cast(final_output, tf.int32), y_reshaped), tf.int32) * \
                  tf.cast(tf.reshape(seqlen_mask, [-1]), tf.int32)
        truevalue = y_reshaped
        # To calculate accuracy we want to divide by the number of non-padded time-steps,
        # rather than taking the mean
        accuracy = tf.reduce_sum(tf.cast(correct, tf.float32)) / tf.reduce_sum(
            tf.cast(self.seqlen, tf.float32))
        """
        Loss function
        """
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=y_reshaped, logits=logits)
        # Zero out the loss contributed by padded time steps.
        loss = loss * tf.reshape(seqlen_mask, [-1])

        # To calculate average loss, we need to divide by number of non-padded time-steps,
        # rather than taking the mean
        loss = tf.reduce_sum(loss) / tf.reduce_sum(seqlen_mask)
        optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(loss)

        # NOTE(review): the saver is created but the save call further down
        # is commented out, so no checkpoint is written.
        saver = tf.train.Saver()
        """
        Training
        """
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            # Per-epoch history handed to self.plot at the end.
            e_loss = []
            e_acc = []
            learning_rate = 2 * 1e-3
            for epoch in range(self._epoch_num):
                total_loss = []
                total_acc = []
                for batch in range(self._batch_num):
                    batch_X, batch_y, batch_len = self.getNextBatch(
                        self._batch_size, batch)
                    # The actual number of rows in this batch is fed to the
                    # graph alongside the data.
                    batch_size_2 = batch_X.shape[0]
                    feed = {
                        self.x_embedding: batch_X,
                        self.y: batch_y,
                        self.seqlen: batch_len,
                        self._batch_size2: batch_size_2,
                        self.learning_rate: learning_rate
                    }
                    # Only acc and cost are used afterwards; the remaining
                    # fetched values are not read.
                    cor, dec_out, y_re, log, acc, cost, _ = sess.run(
                        [
                            correct, dec_outputs, y_reshaped, logits, accuracy,
                            loss, optimizer
                        ],
                        feed_dict=feed)
                    total_loss.append(cost)
                    total_acc.append(acc)

                # Epoch summary: summed batch losses, mean batch accuracy.
                total_loss = np.sum(np.array(total_loss))
                total_acc = np.mean(np.array(total_acc))
                e_loss.append(total_loss)
                e_acc.append(total_acc)
                print("Epoch" + str(epoch) + ":")
                print("Loss: " + str(total_loss) + "  " + "Accuracy: " +
                      str(total_acc))

                # Step the learning rate down as the summed epoch loss falls.
                if total_loss < 30:
                    learning_rate = 1e-3
                if total_loss < 15:
                    learning_rate = 1e-4
                    # print("Learning rate changed.")

                # Evaluate on the test set in the final epoch, or earlier once
                # the summed loss is below 0.5; training stops afterwards.
                if epoch == self._epoch_num - 1 or total_loss < 0.5:  # or total_acc>0.985:
                    hidden_code = []
                    # NOTE(review): rnn_code is never used.
                    rnn_code = []
                    total_acc = []
                    for test_batch in range(self._batch_num_test):
                        # NOTE(review): this assignment has no effect; it
                        # looks like a leftover debugger breakpoint anchor.
                        if test_batch == self._batch_num_test - 1:
                            a = 1
                        batch_testX, batch_y, batch_testlen = self.getNextTestBatch(
                            self._batch_size, test_batch)
                        batch_testsize_2 = batch_testX.shape[0]

                        # The optimizer is not run here; the learning rate is
                        # still included in the feed.
                        feed = {
                            self.x_embedding: batch_testX,
                            self.y: batch_y,
                            self.seqlen: batch_testlen,
                            self._batch_size2: batch_testsize_2,
                            self.learning_rate: learning_rate
                        }
                        last_rnno, rnno, t, f, code, acc = sess.run(
                            [
                                last_rnn_output, rnn_outputs, truevalue,
                                final_output, final_state, accuracy
                            ],
                            feed_dict=feed)
                        # `code` is the encoder's final state for this batch.
                        code = code.reshape([-1, self._emb_dim])
                        hidden_code.extend(code)
                        total_acc.append(acc)

                        # print("Batch: "+str(test_batch))
                        # Show labels and predictions for the first sequence
                        # of the batch.
                        print("True" + str(t[0:self._max_length]))
                        print("Pred" + str(f[0:self._max_length]))
                    total_acc = np.mean(np.array(total_acc))
                    print("Accuracy:" + str(total_acc))
                    # Keep one row per test example and write them to CSV.
                    codes = np.array(hidden_code).reshape(-1, self._emb_dim)
                    df = pd.DataFrame(codes[0:len(self.testdata), :])
                    file_hidden = "toydata/covmat_hiddencode_split" + \
                                  str(self._emb_dim) + ".csv"
                    df.to_csv(file_hidden, float_format='%.5f')
                    break
                    # Save the variables to disk.
            # save_path = saver.save(sess, "savemodel/twornn3.ckpt")
            # print("Model saved in file: " + save_path)

            self.plot(np.array(e_loss), np.array(e_acc))

        return
示例#12
0
文件: put_all.py 项目: LiamLYJ/FWB
    def __init__(self,
                 img_channel,
                 img_size,
                 pth_size,
                 g_size,
                 l_size,
                 glimpse_output_size,
                 loc_dim,
                 variance,
                 cell_size,
                 num_glimpses,
                 num_classes,
                 learning_rate,
                 learning_rate_decay_factor,
                 min_learning_rate,
                 training_steps_per_epoch,
                 max_gradient_norm,
                 fc1_size,
                 base_channels,
                 output_dim,
                 is_training=False):

        self.img_ph = tf.placeholder(tf.float32,
                                     [None, img_size * img_size * img_channel])
        self.lbl_ph = tf.placeholder(tf.float32, [None, output_dim])

        self.global_step = tf.Variable(0, trainable=False)

        self.learning_rate = tf.maximum(
            tf.train.exponential_decay(learning_rate,
                                       self.global_step,
                                       training_steps_per_epoch,
                                       learning_rate_decay_factor,
                                       staircase=True), min_learning_rate)

        cell = BasicLSTMCell(cell_size)

        with tf.variable_scope('GlimpseNetwork'):
            glimpse_network = GlimpseNetwork(img_channel, img_size, pth_size,
                                             loc_dim, g_size, l_size,
                                             glimpse_output_size)
        with tf.variable_scope('Agent'):
            # the agent is resposibale for select a windows and est a gain
            with tf.variable_scope('LocationNetwork'):
                location_network = LocationNetwork(
                    loc_dim=loc_dim,
                    rnn_output_size=cell.output_size,
                    variance=variance,
                    is_sampling=is_training)
            with tf.variable_scope('WhiteBalanceNetwork'):
                wb_network = WhiteBalanceNetwork(
                    rnn_output_size=cell.output_size, output_dim=output_dim)
        if FLAGS.USE_CRITIC:
            with tf.variable_scope('Critic'):
                critic_network = CriticNetwork(fc1_size, base_channels)

        # Core Network
        batch_size = tf.shape(self.img_ph)[0]
        init_loc = tf.random_uniform((batch_size, loc_dim),
                                     minval=-1,
                                     maxval=1)
        init_state = cell.zero_state(batch_size, tf.float32)

        init_glimpse = glimpse_network(self.img_ph, init_loc)
        rnn_inputs = [init_glimpse]
        rnn_inputs.extend([0] * num_glimpses)

        locs, loc_means = [], []
        gains = []
        img_retouched = []

        def _apply_gain(ill, loc, img, patch_wise=False):
            if patch_wise:
                retina = RetinaSensor(img_channel, img_size, pth_size)
                pth = retina(img, loc, serial=False)
                img = tf.reshape(
                    img, [tf.shape(img)[0], img_size, img_size, img_channel])
                retouched_channel = []
                for i in range(3):
                    tmp = pth[:, :, :, i]
                    tmp = tf.reshape(tmp, [tf.shape(tmp)[0], -1])
                    tmp_ill = tf.reshape(ill[:, i] / ill[:, 1],
                                         [tf.shape(img)[0], 1])
                    tmp_ill = tf.tile(tmp_ill, [1, pth_size * pth_size])
                    tmp *= tmp_ill
                    retouched_channel.append(tmp)
                retouched = tf.concat(retouched_channel, -1)
                img[:,
                    round(img_size * loc[0]) -
                    pth_size:round(img_size * loc[0]) + pth_size,
                    round(img_size * loc[1]) -
                    pth_size:round(img_size * loc[1]) +
                    pth_size, :] = retouched
            else:
                img = tf.reshape(
                    img, [tf.shape(img)[0], img_size, img_size, img_channel])
                retouched_channel = []
                for i in range(3):
                    tmp = img[:, :, :, i]
                    tmp = tf.reshape(tmp, [tf.shape(tmp)[0], -1])
                    tmp_ill = tf.reshape(ill[:, i] / ill[:, 1],
                                         [tf.shape(img)[0], 1])
                    tmp_ill = tf.tile(tmp_ill, [1, img_size * img_size])
                    tmp *= tmp_ill
                    retouched_channel.append(tmp)
                img = tf.concat(retouched_channel, -1)
            return img

        def _loop_function(prev, _):
            loc, loc_mean = location_network(prev)
            locs.append(loc)
            loc_means.append(loc_mean)
            gain = wb_network(prev)
            gains.append(gain)
            if img_retouched:
                img_retouched.append(_apply_gain(gain, loc, img_retouched[-1]))
                glimpse = glimpse_network(img_retouched[-1], loc)
            else:
                img_retouched.append(_apply_gain(gain, loc, self.img_ph))
                glimpse = glimpse_network(self.img_ph, loc)
            return glimpse

        rnn_outputs, _ = rnn_decoder(rnn_inputs,
                                     init_state,
                                     cell,
                                     loop_function=_loop_function)

        assert len(gains) == len(locs)
        # Time independent baselines
        with tf.variable_scope('Baseline'):
            baseline_w = weight_variable((cell.output_size, 1))
            baseline_b = bias_variable((1, ))
        baselines = []
        for output in rnn_outputs[1:]:
            baseline = tf.nn.xw_plus_b(output, baseline_w, baseline_b)
            baseline = tf.squeeze(baseline)
            baselines.append(baseline)
        baselines = tf.stack(baselines)  # [timesteps, batch_sz]
        baselines = tf.transpose(baselines)  # [batch_sz, timesteps]

        # Classification. Take the last step only.
        rnn_last_output = rnn_outputs[-1]
        with tf.variable_scope('Classification'):
            logit_w = weight_variable((cell.output_size, num_classes))
            logit_b = bias_variable((num_classes, ))
        logits = tf.nn.xw_plus_b(rnn_last_output, logit_w, logit_b)
        # batch_size *3
        self.prediction = tf.nn.l2_normalize(logits, axis=1)
        self.locations = locs
        if is_training:
            # angular loss
            self.xent = get_angular_loss(self.prediction, self.lbl_ph)
            tf.summary.scalar('xent', self.xent)

            # RL reward
            # reward shape [batchsize, 1]
            if FLAGS.USE_CRITIC:

                img_critic = tf.reshape(self.img_ph, [
                    tf.shape(self.img_ph)[0], img_size, img_size, img_channel
                ])
                img_real = apply_gain(img_critic, self.lbl_ph)
                img_real = tf.reshape(
                    img_real,
                    [tf.shape(img_real)[0], img_size, img_size, img_channel])
                img_fake = apply_gain(img_critic, self.prediction)
                img_fake = tf.reshape(
                    img_fake,
                    [tf.shape(img_fake)[0], img_size, img_size, img_channel])

                real_logit = critic_network(img_real,
                                            is_train=is_training,
                                            reuse=False)
                fake_logit = critic_network(img_fake,
                                            is_train=is_training,
                                            reuse=True)
                rnn_fake_logits = []
                for index_sequence in range(len(img_retouched)):
                    rnn_img_fake = tf.reshape(img_retouched[index_sequence], [
                        tf.shape(img_retouched[index_sequence])[0], img_size,
                        img_size, img_channel
                    ])
                    rnn_fake_logit = critic_network(rnn_img_fake,
                                                    is_train=is_training,
                                                    reuse=True)
                    rnn_fake_logits.append(rnn_fake_logit)

                rewards = tf.stop_gradient(
                    tf.convert_to_tensor(
                        rnn_fake_logits))  # shape (timesteps, batch_sz, 1)
                rewards = tf.transpose(tf.squeeze(
                    rewards, 2))  # shape [batch_sz, timesteps]

                self.c_loss = tf.reduce_mean(fake_logit - real_logit)
                if FLAGS.grad_penalty < 0:
                    # use grad clip
                    gradients = tf.gradients(self.c_loss, theta_c)

                    clipped_gradients, norm = tf.clip_by_global_norm(
                        gradients, max_gradient_norm)
                    self.opt_c = tf.train.AdamOptimizer(
                        self.learning_rate).apply_gradients(
                            zip(clipped_gradients, params),
                            global_step=self.global_step)

                else:
                    # Critic gradient norm and penalty
                    alpha_dist = tf.contrib.distributions.Uniform(low=0.,
                                                                  high=1.)
                    alpha = alpha_dist.sample((batch_size, 1, 1, 1))
                    interpolated = img_real + alpha * (img_fake - img_real)

                    inte_logit = critic_network(images=interpolated,
                                                is_train=is_training,
                                                reuse=True)

                    gradients = tf.gradients(inte_logit, [
                        interpolated,
                    ])[0]

                    gradient_norm = tf.sqrt(
                        1e-6 + tf.reduce_sum(gradients**2, axis=[1, 2, 3]))
                    gradient_penalty = FLAGS.grad_penalty * tf.reduce_mean(
                        tf.maximum(gradient_norm - 1.0, 0.0)**2)

                    self.c_loss += gradient_penalty

                theta_c = tf.trainable_variables(scope='critic')

                gradients = tf.gradients(self.c_loss, theta_c)

                self.opt_c = tf.train.AdamOptimizer(
                    self.learning_rate).apply_gradients(
                        zip(gradients, theta_c), global_step=self.global_step)

            else:
                reward = tf.norm(self.prediction - self.lbl_ph, axis=1)
                rewards = tf.expand_dims(reward, 1)
                rewards = tf.tile(rewards,
                                  (1, num_glimpses))  # [batch_sz, timesteps]

            advantages = rewards - tf.stop_gradient(baselines)
            self.advantage = tf.reduce_mean(advantages)
            logll = log_likelihood(loc_means, locs, variance)
            logllratio = tf.reduce_mean(logll * advantages)
            self.reward = tf.reduce_mean(rewards)
            tf.summary.scalar('reward', self.reward)
            # baseline loss
            self.baselines_mse = tf.reduce_mean(
                tf.square((rewards - baselines)))
            # hybrid loss
            self.loss = -logllratio + self.xent + self.baselines_mse
            tf.summary.scalar('loss', self.loss)

            # exclude the variables in critic scope
            params_all = tf.trainable_variables()
            params = []
            for var in params_all:
                if not 'critic' in var.op.name:
                    params.append(var)
            gradients = tf.gradients(self.loss, params)

            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.train_op = tf.train.AdamOptimizer(
                self.learning_rate).apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step)

            img = tf.reshape(
                self.img_ph,
                [tf.shape(self.img_ph)[0], img_size, img_size, img_channel])
            tf.summary.image('input', img)
            tf.summary.image('gt', apply_gain(img, self.lbl_ph))
            tf.summary.image('est', apply_gain(img, self.prediction))
            self.sum_total = tf.summary.merge_all()

        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=99999999)
    def r2rtdecoder(self):
        """
        Build an RNN encoder/decoder graph, restore it from a checkpoint and
        evaluate it on the test batches.

        Graph:
          * encoder: a BasicRNNCell run with tf.nn.dynamic_rnn over
            self.x_one_hot, starting from a learned 'init_state' variable
            tiled over the batch;
          * decoder: a second BasicRNNCell unrolled for self._max_length
            steps, seeded with the encoder's last relevant output and final
            state, and fed its own previous output at every step;
          * a softmax layer mapping decoder outputs to self._class_num
            classes.

        Masking: seqlen_mask has shape [self._batch_size, self._max_length]
        and is 1 for all non-PAD time steps (where a prediction is made) and
        0 for all PAD time steps (no pred -> no loss). It is obtained by
        gathering row `seqlen - 1` of a lower-triangular matrix of ones of
        size self._max_length x self._max_length.

        Side effects: restores "savemodel/twornn.ckpt", prints the first
        self._max_length true/predicted labels of every test batch and the
        mean accuracy, and writes the encoder final states to "code2.csv".

        Returns:
            None.
        """

        lower_triangular_ones = tf.constant(np.tril(
            np.ones([self._max_length, self._max_length])),
                                            dtype=tf.float32)
        seqlen_mask = tf.slice(tf.gather(lower_triangular_ones, self.seqlen - 1), \
                               [0, 0], [self._batch_size, self._max_length])

        # Encoder RNN
        state_size = self._emb_dim
        num_classes = self._class_num

        cell = tf.contrib.rnn.BasicRNNCell(state_size)

        # A single learned initial state row, shared by (tiled over) the batch.
        init_state = tf.get_variable('init_state', [1, state_size],
                                     initializer=tf.constant_initializer(0.0))
        init_state = tf.tile(init_state, [self._batch_size, 1])
        rnn_outputs, final_state = tf.nn.dynamic_rnn(
            cell,
            self.x_one_hot,
            sequence_length=self.seqlen,
            initial_state=init_state)

        # Flattened labels, one entry per (sequence, time step).
        y_reshaped = tf.reshape(self.y, [-1])
        """
        decoder
        """
        #en_last_output = self.last_relevant(rnn_outputs, self.seqlen)
        # Index of the last non-PAD step of every sequence inside the
        # flattened [batch * time, state_size] view of rnn_outputs.
        idx = tf.range(self._batch_size) * tf.shape(rnn_outputs)[1] + (
            self.seqlen - 1)
        last_rnn_output = tf.gather(tf.reshape(rnn_outputs, [-1, state_size]),
                                    idx)

        # Only the cell object is constructed inside the 'decoder' scope.
        with tf.variable_scope('decoder'):
            decoder_cell = tf.contrib.rnn.BasicRNNCell(self._emb_dim)
        dec_input = last_rnn_output
        dec_in_state = final_state
        dec_outputs = []
        # Unroll the decoder one step at a time: each step's output and state
        # become the next step's input and initial state. Variables are
        # created on the first step and reused afterwards.
        with tf.variable_scope('multi_decoder') as scope:
            for id in range(self._max_length):
                if id > 0:
                    scope.reuse_variables()
                dec_output, dec_out_state = seq2seq_lib.rnn_decoder(
                    [dec_input], dec_in_state, decoder_cell)
                # variable_scope.get_variable_scope().reuse_variables()
                dec_input = dec_output[0]
                dec_in_state = dec_out_state
                dec_outputs += dec_output

        # Step-major concatenation: all batch rows of step 0, then step 1, ...
        dec_final_output = tf.concat(dec_outputs, axis=0)

        # Softmax layer
        with tf.variable_scope('softmax'):
            #W = tf.get_variable('W', [state_size, num_classes])
            #b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))
            W = tf.Variable(
                tf.truncated_normal([self._emb_dim, self._class_num],
                                    stddev=0.01))
            # weight = tf.Variable([self._emb_dim, self._class_num])
            b = tf.Variable(tf.constant(0.1, shape=[
                self._class_num,
            ]))

        logits = tf.matmul(dec_final_output, W) + b

        # The logits rows are step-major, which is not the order of
        # y_reshaped. View them as [max_length, batch, classes], swap the
        # first two axes and flatten again so the rows become batch-major.
        # (assumes self.y is [batch, max_length] -- TODO confirm)
        l1 = tf.reshape(logits, [self._max_length, -1, self._class_num])
        l2 = tf.transpose(l1, [1, 0, 2])
        logits = tf.reshape(l2, [-1, self._class_num])

        preds = tf.nn.softmax(logits)

        # To calculate the number correct, we want to count padded steps as incorrect
        correct = tf.cast(tf.equal(tf.cast(tf.argmax(preds, 1), tf.int32), y_reshaped), tf.int32) * \
                  tf.cast(tf.reshape(seqlen_mask, [-1]), tf.int32)

        final_output = tf.argmax(preds, 1)
        truevalue = y_reshaped

        # To calculate accuracy we want to divide by the number of non-padded time-steps,
        # rather than taking the mean
        accuracy = tf.reduce_sum(tf.cast(correct, tf.float32)) / tf.reduce_sum(
            tf.cast(self.seqlen, tf.float32))

        # Per-step cross entropy, zeroed on PAD steps by the mask.
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=y_reshaped, logits=logits)
        loss = loss * tf.reshape(seqlen_mask, [-1])

        # To calculate average loss, we need to divide by number of non-padded time-steps,
        # rather than taking the mean
        loss = tf.reduce_sum(loss) / tf.reduce_sum(seqlen_mask)

        # The training op is built but never run in this method.
        # NOTE(review): building it presumably still adds optimizer variables
        # to the graph that the Saver below will expect in the checkpoint --
        # confirm before removing.
        optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(loss)

        saver = tf.train.Saver()

        with tf.Session() as sess:
            # Variables are not initialised here; they all come from the
            # checkpoint.
            #sess.run(tf.global_variables_initializer())
            saver.restore(sess, "savemodel/twornn.ckpt")
            print("Model restored.")
            # Fed for self.learning_rate; none of the fetched tensors below
            # is the training op.
            learning_rate = 5 * 1e-3
            hidden_code = []
            rnn_code = []
            total_acc = []
            for test_batch in range(self._batch_num_test + 1):
                batch_testX, batch_y, batch_testlen = self.getNextTestBatch(
                    self._batch_size, test_batch)
                feed = {
                    self.x: batch_testX,
                    self.y: batch_y,
                    self.seqlen: batch_testlen,
                    self.learning_rate: learning_rate
                }
                t, f, code, lro, acc = sess.run([
                    truevalue, final_output, final_state, last_rnn_output,
                    accuracy
                ],
                                                feed_dict=feed)
                # Collect the encoder final state and last relevant output of
                # every batch as [rows, emb_dim] arrays.
                code = code.reshape([-1, self._emb_dim])
                hidden_code.append(code)
                lro = lro.reshape([-1, self._emb_dim])
                rnn_code.append(lro)
                total_acc.append(acc)
                #print("Batch: "+str(test_batch))
                # Show the first self._max_length flattened labels/predictions.
                print("True" + str(t[0:self._max_length]))
                print("Pred" + str(f[0:self._max_length]))

            # Unweighted mean of the per-batch accuracies.
            total_acc = np.mean(np.array(total_acc))
            print("Accuracy:" + str(total_acc))
            codes = np.array(hidden_code).reshape(-1, self._emb_dim)
            # Keep only the first len(self.testdata) rows; presumably the
            # rest are padding from the last batch -- TODO confirm.
            df = pd.DataFrame(codes[0:len(self.testdata), :])
            #file_hidden="twornn_hidden"+train_filename[4:len(train_filename)-4]+"_"+str(self._emb_dim)+".csv"
            file_hidden = "code2.csv"
            df.to_csv(file_hidden, float_format='%.5f')
            #df = pd.DataFrame(np.array(rnn_code).reshape(-1, self._emb_dim))
            #df.to_csv("twornn_output_airline12.csv", float_format='%.5f')

        return
    def __init__(self,
                 img_width,
                 img_height,
                 nb_locations,
                 glimpse_width,
                 glimpse_height,
                 g_size,
                 l_size,
                 glimpse_output_size,
                 loc_dim,
                 time_dim,
                 variance,
                 cell_size,
                 nb_glimpses,
                 nb_classes,
                 learning_rate,
                 learning_rate_decay_factor,
                 min_learning_rate,
                 nb_training_batch,
                 max_gradient_norm,
                 is_training=False):
        """Build the model graph: glimpse/location networks, LSTM core, heads.

        The LSTM core is unrolled with ``rnn_decoder``; after the initial
        glimpse, every further input is produced by ``loop_function`` from
        the previous cell output. A baseline head and a classification head
        sit on top of the cell outputs. When ``is_training`` is true the
        hybrid loss and the training op are built as well.

        Args:
            img_width, img_height: spatial size of ``self.img_ph``
                (placeholder shape ``[None, img_height, img_width]``).
            nb_locations: forwarded to ``GlimpseNetwork``; also multiplies
                ``loc_dim`` for the ``LocationNetwork`` output size.
            glimpse_width, glimpse_height, g_size, l_size,
            glimpse_output_size: forwarded to ``GlimpseNetwork``.
            loc_dim, time_dim: sizes of one location and of the time
                component.
            variance: forwarded to ``LocationNetwork`` and
                ``_log_likelihood``.
            cell_size: size of the ``BasicLSTMCell``.
            nb_glimpses: number of loop-generated RNN steps after the
                initial glimpse.
            nb_classes: width of the classification logits.
            learning_rate, learning_rate_decay_factor, nb_training_batch,
            min_learning_rate: staircase exponential decay schedule
                (``nb_training_batch`` is the decay step count), floored at
                ``min_learning_rate``.
            max_gradient_norm: global-norm clipping threshold.
            is_training: enables sampling in ``LocationNetwork`` and the
                construction of the loss / train op.
        """
        self.img_ph = tf.placeholder(tf.float32, [None, img_height, img_width])
        self.lbl_ph = tf.placeholder(tf.int64, [None])

        self.global_step = tf.Variable(0, trainable=False)
        # decayed = learning_rate * decay_factor ^ (global_step / nb_training_batch)
        # With staircase=True the exponent is an integer division, so the
        # rate drops in discrete steps.
        decayed_lr = tf.train.exponential_decay(learning_rate,
                                                self.global_step,
                                                nb_training_batch,
                                                learning_rate_decay_factor,
                                                staircase=True)
        self.learning_rate = tf.maximum(decayed_lr, min_learning_rate)

        cell = BasicLSTMCell(cell_size)

        with tf.variable_scope('GlimpseNetwork'):
            glimpse_network = GlimpseNetwork(
                img_width, img_height, glimpse_width, glimpse_height,
                loc_dim + time_dim, g_size, l_size, glimpse_output_size,
                nb_locations)
        with tf.variable_scope('LocationNetwork'):
            location_network = LocationNetwork(
                loc_dim=loc_dim * nb_locations + time_dim,
                rnn_output_size=cell.output_size,  # cell_size
                variance=variance,
                is_sampling=is_training)

        # Core network.
        batch_size = tf.shape(self.img_ph)[0]
        # Three random initial locations plus a random initial time value,
        # each of shape (batch_size, loc_dim) and drawn from [-1, 1).
        # NOTE(review): the time component is sized with loc_dim, not
        # time_dim -- confirm this is intended.
        init_loc_1, init_loc_2, init_loc_3, init_t = [
            tf.random_uniform((batch_size, loc_dim), minval=-1, maxval=1)
            for _ in range(4)
        ]
        init_state = cell.zero_state(batch_size, tf.float32)

        self.init_glimpse = glimpse_network(self.img_ph, init_loc_1,
                                            init_loc_2, init_loc_3, init_t)

        # Only the first input is real; the remaining entries are
        # placeholders that rnn_decoder replaces via loop_function.
        rnn_inputs = [self.init_glimpse] + [0] * nb_glimpses

        locs = []
        loc_means = []

        def loop_function(prev, _):
            """Produce the next glimpse from the previous cell output."""
            loc, loc_mean = location_network(prev)
            locs.append(loc)
            loc_means.append(loc_mean)
            # Split the first four columns of loc into (batch, 1) tensors.
            columns = [tf.reshape(loc[:, k], [-1, 1]) for k in range(4)]
            return glimpse_network(self.img_ph, *columns)

        rnn_outputs, _ = rnn_decoder(rnn_inputs,
                                     init_state,
                                     cell,
                                     loop_function=loop_function)

        # Time-independent baselines: one shared linear layer applied to
        # every output except the first.
        with tf.variable_scope('Baseline'):
            baseline_w = _weight_variable((cell.output_size, 1))
            baseline_b = _bias_variable((1,))
        step_baselines = [
            tf.squeeze(tf.nn.xw_plus_b(step_out, baseline_w, baseline_b))
            for step_out in rnn_outputs[1:]
        ]
        # stack -> [timesteps, batch_sz]; transpose -> [batch_sz, timesteps]
        baselines = tf.transpose(tf.stack(step_baselines))

        # Classification uses the last step only.
        rnn_last_output = rnn_outputs[-1]
        with tf.variable_scope('Classification'):
            logit_w = _weight_variable((cell.output_size, nb_classes))
            logit_b = _bias_variable((nb_classes,))
        logits = tf.nn.xw_plus_b(rnn_last_output, logit_w, logit_b)
        self.softmax = tf.nn.softmax(logits)

        self.pred = tf.argmax(self.softmax, 1)
        is_correct = tf.equal(self.pred, self.lbl_ph)
        self.accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

        if is_training:
            # Classification loss.
            per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.lbl_ph, logits=logits)
            self.cross_entropy = tf.reduce_mean(per_example_xent)

            # RL reward: 1 for a correct prediction, 0 otherwise, repeated
            # for every glimpse -> [batch_sz, timesteps].
            reward = tf.cast(tf.equal(self.pred, self.lbl_ph), tf.float32)
            rewards = tf.tile(tf.expand_dims(reward, 1), (1, nb_glimpses))

            # The baseline is treated as a constant in the policy term.
            advantages = rewards - tf.stop_gradient(baselines)
            self.advantage = tf.reduce_mean(advantages)
            logll = _log_likelihood(loc_means, locs, variance)
            logllratio = tf.reduce_mean(logll * advantages)
            self.reward = tf.reduce_mean(reward)

            # Baseline regression loss.
            self.baselines_mse = tf.reduce_mean(
                tf.square((rewards - baselines)))

            # Hybrid loss.
            self.loss = -logllratio + self.cross_entropy + self.baselines_mse

            train_vars = tf.trainable_variables()
            raw_grads = tf.gradients(self.loss, train_vars)
            clipped_grads, _ = tf.clip_by_global_norm(raw_grads,
                                                      max_gradient_norm)
            adam = tf.train.AdamOptimizer(self.learning_rate)
            self.train_op = adam.apply_gradients(
                zip(clipped_grads, train_vars), global_step=self.global_step)

        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=99999999)
示例#15
0
    def __init__(self, img_size_width, img_size_height, CNN_patch_width,
                 CNN_patch_height, CNN_patch_number, patch_window_width,
                 patch_window_height, g_size, l_size, glimpse_output_size,
                 loc_dim, variance, cell_size, num_glimpses, num_classes,
                 learning_rate, learning_rate_decay_factor, min_learning_rate,
                 training_batch_num, max_gradient_norm, last_lstm_size,
                 n_time_window, is_training=False):
        """Build the model graph: CNN front end, glimpse RNN and a second LSTM.

        The flattened input is passed through ``CNN``; an LSTM core is then
        unrolled with ``rnn_decoder`` over glimpses of the CNN output. Two
        classifiers are attached: a linear one on the last core output, and
        one that runs a second LSTM over groups of ``n_time_window``
        consecutive rows of the last core output. When ``is_training`` is
        true the hybrid loss and the training op are built as well.

        Args:
            img_size_width, img_size_height: the input placeholder has shape
                ``[None, img_size_width * img_size_height]``.
            CNN_patch_width, CNN_patch_height, CNN_patch_number: forwarded
                to ``CNN``.
            patch_window_width, patch_window_height, g_size, l_size,
            glimpse_output_size: forwarded to ``GlimpseNetwork``.
            loc_dim: size of a location vector.
            variance: forwarded to ``LocationNetwork`` and
                ``_log_likelihood``.
            cell_size: size of the core ``BasicLSTMCell``.
            num_glimpses: number of loop-generated RNN steps after the
                initial glimpse.
            num_classes: width of both logits tensors.
            learning_rate, learning_rate_decay_factor, training_batch_num,
            min_learning_rate: staircase exponential decay schedule
                (``training_batch_num`` is the decay step count), floored
                at ``min_learning_rate``.
            max_gradient_norm: global-norm clipping threshold.
            last_lstm_size: size of the second LSTM.
            n_time_window: number of consecutive rows grouped into one
                sequence for the second LSTM.
            is_training: enables sampling in ``LocationNetwork`` and the
                construction of the loss / train op.
        """
        flat_img_len = img_size_width * img_size_height
        self.img_ph = tf.placeholder(tf.float32, [None, flat_img_len])
        self.lbl_ph = tf.placeholder(tf.int64, [None])

        self.global_step = tf.Variable(0, trainable=False)
        # decayed = learning_rate * decay_factor ^ (global_step / training_batch_num)
        # With staircase=True the exponent is an integer division, so the
        # rate drops in discrete steps.
        lr_schedule = tf.train.exponential_decay(learning_rate,
                                                 self.global_step,
                                                 training_batch_num,
                                                 learning_rate_decay_factor,
                                                 staircase=True)
        self.learning_rate = tf.maximum(lr_schedule, min_learning_rate)

        cell = BasicLSTMCell(cell_size)

        with tf.variable_scope('CNN'):
            cnn_network = CNN(img_size_width, img_size_height,
                              CNN_patch_width, CNN_patch_height,
                              CNN_patch_number)

        with tf.variable_scope('GlimpseNetwork'):
            glimpse_network = GlimpseNetwork(
                img_size_width, img_size_height, patch_window_width,
                patch_window_height, loc_dim, g_size, l_size,
                glimpse_output_size)
        with tf.variable_scope('LocationNetwork'):
            location_network = LocationNetwork(
                loc_dim=loc_dim,
                rnn_output_size=cell.output_size,  # cell_size
                variance=variance,
                is_sampling=is_training)

        # Core network. From here on self.img_ph refers to the CNN output,
        # no longer to the raw placeholder.
        self.img_ph = cnn_network(self.img_ph)
        # Per the original author's note: training_batch_size * M.
        batch_size = tf.shape(self.img_ph)[0]
        # Random initial location, shape (batch_size, loc_dim), in [-1, 1).
        init_loc = tf.random_uniform((batch_size, loc_dim),
                                     minval=-1,
                                     maxval=1)
        init_state = cell.zero_state(batch_size, tf.float32)

        # Only the first input is real; the remaining entries are
        # placeholders that rnn_decoder replaces via loop_function.
        first_glimpse = glimpse_network(self.img_ph, init_loc)
        rnn_inputs = [first_glimpse] + [0] * num_glimpses

        self.locs = []
        loc_means = []

        def loop_function(prev, _):
            """Produce the next glimpse from the previous cell output."""
            loc, loc_mean = location_network(prev)
            self.locs.append(loc)
            loc_means.append(loc_mean)
            return glimpse_network(self.img_ph, loc)

        rnn_outputs, _ = rnn_decoder(rnn_inputs,
                                     init_state,
                                     cell,
                                     loop_function=loop_function)

        # Time-independent baselines: one shared linear layer applied to
        # every output except the first.
        with tf.variable_scope('Baseline'):
            baseline_w = _weight_variable((cell.output_size, 1))
            baseline_b = _bias_variable((1, ))
        step_baselines = [
            tf.squeeze(tf.nn.xw_plus_b(step_out, baseline_w, baseline_b))
            for step_out in rnn_outputs[1:]
        ]
        # stack -> [timesteps, batch_sz]; transpose -> [batch_sz, timesteps]
        baselines = tf.transpose(tf.stack(step_baselines))

        # Classification uses the last step only.
        rnn_last_output = rnn_outputs[-1]
        with tf.variable_scope('Classification'):
            logit_w = _weight_variable((cell.output_size, num_classes))
            logit_b = _bias_variable((num_classes, ))
        logits = tf.nn.xw_plus_b(rnn_last_output, logit_w, logit_b)
        self.prediction = tf.argmax(logits, 1)
        self.softmax = tf.nn.softmax(logits)

        with tf.variable_scope('LSTM_Classification'):
            # Project the last core output and regroup it into sequences of
            # n_time_window steps.
            last_lstm_w_in = _weight_variable(
                (cell.output_size, last_lstm_size))
            last_lstm_b_in = _bias_variable((last_lstm_size, ))
            projected = tf.matmul(rnn_last_output,
                                  last_lstm_w_in) + last_lstm_b_in
            last_lstm_in = tf.reshape(projected,
                                      [-1, n_time_window, last_lstm_size])

            # TensorFlow releases older than 0.12 expose the cell and the
            # unstack op under different names.
            version_fields = (tf.__version__).split('.')
            is_pre_0_12 = (int(version_fields[1]) < 12
                           and int(version_fields[0]) < 1)

            if is_pre_0_12:
                top_cell = tf.nn.rnn_cell.BasicLSTMCell(last_lstm_size,
                                                        forget_bias=1.0,
                                                        state_is_tuple=True)
            else:
                top_cell = tf.contrib.rnn.BasicLSTMCell(last_lstm_size)
            # The LSTM state is a (c_state, h_state) pair.
            init_state_last_lstm = top_cell.zero_state(
                batch_size // n_time_window, dtype=tf.float32)
            lstm_outputs, final_state = tf.nn.dynamic_rnn(
                top_cell,
                last_lstm_in,
                initial_state=init_state_last_lstm,
                time_major=False)
            last_lstm_w_out = _weight_variable(
                (top_cell.output_size, num_classes))
            last_lstm_b_out = _bias_variable((num_classes, ))

            # Make the outputs time-major and split them per step so that
            # the last step can be picked with [-1].
            unstack_fn = tf.unpack if is_pre_0_12 else tf.unstack
            lstm_outputs = unstack_fn(tf.transpose(lstm_outputs, [1, 0, 2]))
            window_logits = tf.matmul(lstm_outputs[-1],
                                      last_lstm_w_out) + last_lstm_b_out
            # Repeat each sequence's logits n_time_window times so there is
            # one row of logits per input row again.
            lstm_logits = tf.reshape(
                tf.tile(window_logits, (1, n_time_window)),
                [-1, num_classes])
            self.lstm_prediction = tf.argmax(lstm_logits, 1)
            self.lstm_softmax = tf.nn.softmax(lstm_logits)

        if is_training:
            # Classification losses of both heads.
            core_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.lbl_ph, logits=logits)
            self.cross_entropy = tf.reduce_mean(core_xent)
            lstm_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.lbl_ph, logits=lstm_logits)
            self.lstm_cross_entropy = tf.reduce_mean(lstm_xent)

            # RL reward: 1 for a correct prediction of the core classifier,
            # 0 otherwise, repeated for every glimpse
            # -> [batch_sz, timesteps].
            reward = tf.cast(tf.equal(self.prediction, self.lbl_ph),
                             tf.float32)
            rewards = tf.tile(tf.expand_dims(reward, 1), (1, num_glimpses))

            # The baseline is treated as a constant in the policy term.
            advantages = rewards - tf.stop_gradient(baselines)
            self.advantage = tf.reduce_mean(advantages)
            logll = _log_likelihood(loc_means, self.locs, variance)
            logllratio = tf.reduce_mean(logll * advantages)
            self.reward = tf.reduce_mean(reward)

            # Baseline regression loss.
            self.baselines_mse = tf.reduce_mean(
                tf.square((rewards - baselines)))

            # Hybrid loss.
            self.loss = -logllratio + self.cross_entropy + self.baselines_mse + self.lstm_cross_entropy

            train_vars = tf.trainable_variables()
            raw_grads = tf.gradients(self.loss, train_vars)
            clipped_grads, _ = tf.clip_by_global_norm(raw_grads,
                                                      max_gradient_norm)
            adam = tf.train.AdamOptimizer(self.learning_rate)
            self.train_op = adam.apply_gradients(
                zip(clipped_grads, train_vars), global_step=self.global_step)

        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=99999999)
示例#16
0
    def __init__(self,
                 args,
                 infer=False):  # infer is set to true during sampling.
        self.args = args
        if infer:
            # Worry about one character at a time during sampling; no batching or BPTT.
            args.batch_size = 1
            args.seq_length = 1

        # Set cell_fn to the type of network cell we're creating -- RNN, GRU or LSTM.
        if args.model == 'rnn':
            cell_fn = rnn_cell.BasicRNNCell
        elif args.model == 'gru':
            cell_fn = rnn_cell.GRUCell
        elif args.model == 'lstm':
            cell_fn = rnn_cell.BasicLSTMCell
        else:
            raise Exception("model type not supported: {}".format(args.model))

        # Call tensorflow library tensorflow-master/tensorflow/python/ops/rnn_cell
        # to create a layer of rnn_size cells of the specified basic type (RNN/GRU/LSTM).
        if args.model == "gru":
            cell = cell_fn(args.rnn_size)
        else:
            cell = cell_fn(args.rnn_size, state_is_tuple=True)

        # Use the same rnn_cell library to create a stack of these cells
        # of num_layers layers. Pass in a python list of these cells.
        # (The [cell] * arg.num_layers syntax literally duplicates cell multiple times in
        # a list. The syntax is such that [5, 6] * 3 would return [5, 6, 5, 6, 5, 6].)
        self.cell = cell = rnn_cell.MultiRNNCell([cell] * args.num_layers,
                                                 state_is_tuple=True)

        # Create two TF placeholder nodes of 32-bit ints (NOT floats!),
        # each of shape batch_size x seq_length. This shape matches the batches
        # (listed in x_batches and y_batches) constructed in create_batches in utils.py.
        # input_data will receive input batches, and targets will be what it compares against
        # to calculate loss.
        self.input_data = tf.placeholder(tf.int32,
                                         [args.batch_size, args.seq_length])
        self.targets = tf.placeholder(tf.int32,
                                      [args.batch_size, args.seq_length])

        # Using the zero_state function in the RNNCell master class in rnn_cell library,
        # create a tensor of zeros such that we can swap it in for the network state at any time
        # to zero out the network's state.
        # State dimensions are: cell_fn state size (2 for LSTM) x rnn_size x num_layers.
        # So an LSTM network with 100 cells per layer and 3 layers would have a state size of 600,
        # and initial_state would have a dimension of none x 600.
        self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

        # Scope our new variables to the scope identifier string "rnnlm".
        with tf.variable_scope('rnnlm'):
            # Create new variable softmax_w and softmax_b for output.
            # softmax_w is a weights matrix from the top layer of the model (of size rnn_size)
            # to the vocabulary output (of size vocab_size).
            softmax_w = tf.get_variable("softmax_w",
                                        [args.rnn_size, args.vocab_size])
            # softmax_b is a bias vector of the ouput characters (of size vocab_size).
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
            # [TODO: Why specify CPU? Same as the TF translation tutorial, but don't know why.]
            with tf.device("/cpu:0"):
                # Create new variable named 'embedding' to connect the character input to the base layer
                # of the RNN. Its role is the conceptual inverse of softmax_w.
                # It contains the trainable weights from the one-hot input vector to the lowest layer of RNN.
                embedding = tf.get_variable("embedding",
                                            [args.vocab_size, args.rnn_size])
                # Create an embedding tensor with tf.nn.embedding_lookup(embedding, self.input_data).
                # This tensor has dimensions batch_size x seq_length x rnn_size.
                # tf.split splits that embedding lookup tensor into seq_length tensors (along dimension 1).
                # Thus inputs is a list of seq_length different tensors,
                # each of dimension batch_size x 1 x rnn_size.
                inputs = tf.split(tf.nn.embedding_lookup(
                    embedding, self.input_data),
                                  args.seq_length,
                                  axis=1)
                # Iterate through these resulting tensors and eliminate that degenerate second dimension of 1,
                # i.e. squeeze each from batch_size x 1 x rnn_size down to batch_size x rnn_size.
                # Thus we now have a list of seq_length tensors, each with dimension batch_size x rnn_size.
                inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

        # THIS LOOP FUNCTION IS NEVER ACTUALLY USED.
        # IT IS EXPLICITLY NOT USED DURING TRAINING.
        # DURING INFERENCE, SEQ_LENGTH == 1, SO SEQ2SEQ.RNN_DECODER() ONLY USES THE LOOP ARGUMENT
        # ON SEQUENCE LENGTH ITEMS SUBSEQUENT TO THE FIRST.
        # This looping function is used as part of seq2seq.rnn_decoder only during sampling -- not training.
        # prev is a 2D Tensor of shape [batch_size x cell.output_size].
        # returns a 2D Tensor of shape [batch_size x cell.input_size].
        def loop(prev, _):
            # prev is initially the top cell state.
            # Convert the top cell state into character logits.
            prev = tf.matmul(prev, softmax_w) + softmax_b
            # Pull the character with the greatest logit (no sampling, just argmaxing).
            # WHY IS THIS ARGMAXING WHEN ACTUAL SAMPLING IS DONE PROBABILISTICALLY?
            # DOESN'T THIS CAUSE OUTPUTS NOT TO MATCH INPUTS DURING SEQUENCE GENERATION?
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            # Re-embed that symbol as the next step's input, and return that.
            return tf.nn.embedding_lookup(embedding, prev_symbol)

        # Set up a seq2seq decoder from the seq2seq.py library.
        # This constructs the outputs and states nodes of the network.
        # Outputs is a list (of len seq_length, same as inputs) of tensors of shape [batch_size x rnn_size].
        # These are the raw output values of the top layer of the network at each time step.
        # They have NOT been fed through the decoder projection; they are still in network space,
        # not character space.
        # State is a tensor of shape [batch_size x cell.state_size].
        # This is also the step where all of the trainable parameters for the LSTM (weights and biases) are defined.
        outputs, self.final_state = seq2seq.rnn_decoder(
            inputs,
            self.initial_state,
            cell,
            loop_function=loop if infer else None,
            scope='rnnlm')
        # tf.concat concatenates the output tensors along the rnn_size dimension,
        # to make a single tensor of shape [batch_size x (seq_length * rnn_size)].
        # This gives the following 2D outputs matrix:
        #   [(rnn output: batch 0, seq 0) (rnn output: batch 0, seq 1) ... (rnn output: batch 0, seq seq_len-1)]
        #   [(rnn output: batch 1, seq 0) (rnn output: batch 1, seq 1) ... (rnn output: batch 1, seq seq_len-1)]
        #   ...
        #   [(rnn output: batch batch_size-1, seq 0) (rnn output: batch batch_size-1, seq 1) ... (rnn output: batch batch_size-1, seq seq_len-1)]
        # tf.reshape then reshapes it to a tensor of shape [(batch_size * seq_length) x rnn_size].
        # Output will now be the following matrix:
        #   [rnn output: batch 0, seq 0]
        #   [rnn output: batch 0, seq 1]
        #   ...
        #   [rnn output: batch 0, seq seq_len-1]
        #   [rnn output: batch 1, seq 0]
        #   [rnn output: batch 1, seq 1]
        #   ...
        #   [rnn output: batch 1, seq seq_len-1]
        #   ...
        #   ...
        #   [rnn output: batch batch_size-1, seq seq_len-1]
        # Note the following comment in rnn_cell.py:
        #   Note: in many cases it may be more efficient to not use this wrapper,
        #   but instead concatenate the whole sequence of your outputs in time,
        #   do the projection on this batch-concatenated sequence, then split it
        #   if needed or directly feed into a softmax.
        output = tf.reshape(tf.concat(outputs, axis=1), [-1, args.rnn_size])
        # Obtain logits node by applying output weights and biases to the output tensor.
        # Logits is a tensor of shape [(batch_size * seq_length) x vocab_size].
        # Recall that outputs is a 2D tensor of shape [(batch_size * seq_length) x rnn_size],
        # and softmax_w is a 2D tensor of shape [rnn_size x vocab_size].
        # The matrix product is therefore a new 2D tensor of [(batch_size * seq_length) x vocab_size].
        # In other words, that multiplication converts a loooong list of rnn_size vectors
        # to a loooong list of vocab_size vectors.
        # Then add softmax_b (a single vocab-sized vector) to every row of that list.
        # That gives you the logits!
        self.logits = tf.matmul(output, softmax_w) + softmax_b
        # Convert logits to probabilities. Probs isn't used during training! That node is never calculated.
        # Like logits, probs is a tensor of shape [(batch_size * seq_length) x vocab_size].
        # During sampling, this means it is of shape [1 x vocab_size].
        self.probs = tf.nn.softmax(self.logits)
        # seq2seq.sequence_loss_by_example returns 1D float Tensor containing the log-perplexity
        # for each sequence. (Size is batch_size * seq_length.)
        # Targets are reshaped from a [batch_size x seq_length] tensor to a 1D tensor, of the following layout:
        #   target character (batch 0, seq 0)
        #   target character (batch 0, seq 1)
        #   ...
        #   target character (batch 0, seq seq_len-1)
        #   target character (batch 1, seq 0)
        #   ...
        # These targets are compared to the logits to generate loss.
        # Logits: instead of a list of character indices, it's a list of character index probability vectors.
        # seq2seq.sequence_loss_by_example will do the work of generating losses by comparing the one-hot vectors
        # implicitly represented by the target characters against the probability distributions in logits.
        # It returns a 1D float tensor (a vector) where item i is the log-perplexity of
        # the comparison of the ith logit distribution to the ith one-hot target vector.
        loss = seq2seq.sequence_loss_by_example(
            [self.logits],
            # logits: 1-item list of 2D Tensors of shape [batch_size x vocab_size]
            [tf.reshape(self.targets, [-1])],
            # targets: 1-item list of 1D batch-sized int32 Tensors of the same length as logits
            [tf.ones([args.batch_size * args.seq_length])],
            # weights: 1-item list of 1D batch-sized float-Tensors of the same length as logits
            args.vocab_size
        )  # num_decoder_symbols: integer, number of decoder symbols (output classes)
        # Cost is the arithmetic mean of the values of the loss tensor
        # (the sum divided by the total number of elements).
        # It is a single-element floating point tensor. This is what the optimizer seeks to minimize.
        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
        # Create a summary for our cost.
        tf.summary.scalar("cost", self.cost)
        # Create a node to track the learning rate as it decays through the epochs.
        self.lr = tf.Variable(args.learning_rate, trainable=False)
        self.global_epoch_fraction = tf.Variable(0.0, trainable=False)
        self.global_seconds_elapsed = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables(
        )  # tvars is a python list of all trainable TF Variable objects.

        # tf.gradients returns a list of tensors of length len(tvars) where each tensor is sum(dy/dx).
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                          args.grad_clip)
        optimizer = tf.train.AdamOptimizer(
            self.lr)  # Use ADAM optimizer with the current learning rate.
        # Zip creates a list of tuples, where each tuple is (variable tensor, gradient tensor).
        # Training op nudges the variables along the gradient, with the given learning rate, using the ADAM optimizer.
        # This is the op that a training session should be instructed to perform.
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
        self.summary_op = tf.summary.merge_all()
示例#17
0
    def __init__(self,
                 img_shape,
                 pth_size,
                 g_size,
                 l_size,
                 glimpse_output_size,
                 loc_dim,
                 variance,
                 cell_size,
                 num_glimpses,
                 num_classes,
                 learning_rate,
                 learning_rate_decay_factor,
                 min_learning_rate,
                 training_steps_per_epoch,
                 max_gradient_norm,
                 is_training=False):
        """Build the glimpse-driven recurrent classification graph.

        An LSTM core is fed one glimpse taken at a uniformly random location
        and then ``num_glimpses`` further glimpses whose locations are
        produced by the location network from the previous core output.
        The last core output is classified; when ``is_training`` is true a
        hybrid loss (policy-gradient term + cross-entropy + baseline MSE)
        and its Adam training op are added as well.

        Args:
            img_shape: Indexable whose first three entries size the image
                placeholder after the (dynamic) batch axis.
            pth_size, g_size, l_size, glimpse_output_size: Forwarded
                unchanged to ``GlimpseNetwork``.
            loc_dim: Width of a location vector; also forwarded to both
                sub-networks.
            variance: Forwarded to ``LocationNetwork`` and to the location
                log-likelihood term.
            cell_size: Unit count of the ``BasicLSTMCell`` core.
            num_glimpses: Number of loop-generated glimpses that follow the
                initial one.
            num_classes: Width of the classification logits.
            learning_rate: Initial rate of the staircase exponential decay.
            learning_rate_decay_factor: Decay rate of that schedule.
            min_learning_rate: Lower bound applied to the decayed rate.
            training_steps_per_epoch: Decay period, in global steps.
            max_gradient_norm: Global-norm threshold for gradient clipping.
            is_training: Build the losses and ``train_op`` when true; the
                flag is also handed to the location network.

        Attributes created: ``img_ph``, ``lbl_ph``, ``global_step``,
        ``learning_rate``, ``locs``, ``prediction``, ``softmax``, ``saver``
        and, in training mode, ``xent``, ``advantage``, ``reward``,
        ``baselines_mse``, ``loss`` and ``train_op``.
        """
        self.is_training = is_training
        self.img_ph = tf.placeholder(
            tf.float32, [None, img_shape[0], img_shape[1], img_shape[2]])
        self.lbl_ph = tf.placeholder(tf.int64, [None])

        self.global_step = tf.Variable(0, trainable=False)

        # Staircase decay once per epoch, never dropping below the floor.
        decayed_rate = tf.train.exponential_decay(learning_rate,
                                                  self.global_step,
                                                  training_steps_per_epoch,
                                                  learning_rate_decay_factor,
                                                  staircase=True)
        self.learning_rate = tf.maximum(decayed_rate, min_learning_rate)

        cell = BasicLSTMCell(cell_size)

        with tf.variable_scope('GlimpseNetwork'):
            glimpse_network = GlimpseNetwork(img_shape, pth_size, loc_dim,
                                             g_size, l_size,
                                             glimpse_output_size)
        with tf.variable_scope('LocationNetwork'):
            location_network = LocationNetwork(
                loc_dim=loc_dim,
                rnn_output_size=cell.output_size,
                variance=variance,
                is_sampling=self.is_training)

        # Core network
        batch_size = tf.shape(self.img_ph)[0]
        init_loc = tf.random_uniform((batch_size, loc_dim),
                                     minval=-1,
                                     maxval=1)
        init_state = cell.zero_state(batch_size, tf.float32)

        init_glimpse = glimpse_network(self.img_ph, init_loc)
        # Only the first input is real; the remaining entries are dummies
        # that merely set the number of decoder steps, because every later
        # input is supplied by loop_function.
        rnn_inputs = [init_glimpse]
        rnn_inputs.extend([0] * num_glimpses)

        locs, loc_means = [], []

        def loop_function(prev, _):
            # Pick the next location from the previous core output, record
            # it, and return the glimpse extracted there as the next input.
            loc, loc_mean = location_network(prev, self.is_training)
            locs.append(loc)
            loc_means.append(loc_mean)
            return glimpse_network(self.img_ph, loc)

        rnn_outputs, _ = rnn_decoder(rnn_inputs,
                                     init_state,
                                     cell,
                                     loop_function=loop_function)
        # Kept so callers can display the visited locations.
        self.locs = locs

        # Time-independent baselines: one shared linear map for all steps.
        with tf.variable_scope('Baseline'):
            baseline_w = _weight_variable((cell.output_size, 1))
            baseline_b = _bias_variable((1, ))
        baselines = []
        for output in rnn_outputs[1:]:
            baseline = tf.nn.xw_plus_b(output, baseline_w, baseline_b)
            # Squeeze only the trailing unit axis. An axis-less squeeze
            # would also drop the batch axis when a single example is fed,
            # which breaks the [timesteps, batch] stacking below.
            baselines.append(tf.squeeze(baseline, axis=[1]))
        baselines = tf.stack(baselines)  # [timesteps, batch_sz]
        baselines = tf.transpose(baselines)  # [batch_sz, timesteps]

        # Classification uses the last step only.
        rnn_last_output = rnn_outputs[-1]
        with tf.variable_scope('Classification'):
            logit_w = _weight_variable((cell.output_size, num_classes))
            logit_b = _bias_variable((num_classes, ))
        logits = tf.nn.xw_plus_b(rnn_last_output, logit_w, logit_b)
        self.prediction = tf.argmax(logits, 1)
        self.softmax = tf.nn.softmax(logits)

        if self.is_training:
            # Classification loss.
            self.xent = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.lbl_ph, logits=logits))
            # RL reward: 1 for a correct prediction, 0 otherwise.
            reward = tf.cast(tf.equal(self.prediction, self.lbl_ph),
                             tf.float32)
            rewards = tf.expand_dims(reward, 1)  # [batch_sz, 1]
            rewards = tf.tile(rewards,
                              (1, num_glimpses))  # [batch_sz, timesteps]
            # The baseline is trained through its own MSE term only, so it
            # is detached from the policy-gradient term.
            advantages = rewards - tf.stop_gradient(baselines)
            self.advantage = tf.reduce_mean(advantages)
            logll = _log_likelihood(loc_means, locs, variance)
            logllratio = tf.reduce_mean(logll * advantages)
            self.reward = tf.reduce_mean(reward)
            # Baseline loss.
            self.baselines_mse = tf.reduce_mean(
                tf.square((rewards - baselines)))
            # Hybrid loss.
            self.loss = -logllratio + self.xent + self.baselines_mse
            params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, params)
            clipped_gradients, _ = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.train_op = tf.train.AdamOptimizer(
                self.learning_rate).apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step)
        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=99999999)
示例#18
0
    def __init__(self,
                 imgSize,
                 vocabSize,
                 embedSize,
                 use_lstm,
                 rnnHiddenSize,
                 rnnLayers,
                 start,
                 end,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 min_learning_rate,
                 training_steps_per_epoch,
                 keep_prob=0.5,
                 max_gradient_norm=5.0,
                 is_training=True):
        """Build a ten-round dialog encoder/decoder graph.

        For each round the question is encoded, fused with the image
        feature and the running history state through a tanh layer, and the
        fused vector initialises an answer decoder. The history state starts
        as the encoded caption and is advanced each round by the question
        and then the answer (ground truth when training, the decoder's own
        greedy output otherwise).

        Args:
            imgSize: Width of the image-feature placeholder.
            vocabSize: Row count of the shared embedding and width of the
                decoder's output projection.
            embedSize: Column count of the shared embedding.
            use_lstm: Use ``BasicLSTMCell`` when true, ``GRUCell`` otherwise.
            rnnHiddenSize: Unit count of each recurrent cell.
            rnnLayers: Cells are wrapped in ``MultiRNNCell`` when > 1.
            start: Token id fed as the first decoder input.
            end: Token id used to locate the end of a generated answer.
            batch_size: Static batch size baked into every placeholder.
            learning_rate: Initial rate of the staircase exponential decay.
            learning_rate_decay_factor: Decay rate of that schedule.
            min_learning_rate: Lower bound applied to the decayed rate.
            training_steps_per_epoch: Decay period, in global steps.
            keep_prob: Dropout keep probability on the fused vector
                (training only).
            max_gradient_norm: Global-norm threshold for gradient clipping.
            is_training: Build the loss and ``opt_op`` when true; otherwise
                build ``ans_word_probs`` from greedy decoding.

        Attributes created: ``image_feature_ph``, ``caption_ph``,
        ``caption_length_ph``, ``questions_ph``, ``question_lengths_ph``,
        ``saver``; in training mode ``global_step``, ``learning_rate``,
        ``answers_ph``, ``answer_lengths_ph``, ``targets_ph``, ``loss`` and
        ``opt_op``; otherwise ``ans_word_probs`` (per-word log-probabilities).
        """
        # Fixed dialog geometry shared by the placeholders and loops below.
        num_rounds = 10
        max_question_len = 20
        max_answer_len = 20
        max_caption_len = 40
        target_len = max_answer_len + 1  # answer tokens plus one extra step

        if is_training:
            self.global_step = tf.Variable(0, trainable=False)
            self.learning_rate = tf.maximum(
                tf.train.exponential_decay(learning_rate,
                                           self.global_step,
                                           training_steps_per_epoch,
                                           learning_rate_decay_factor,
                                           staircase=True), min_learning_rate)
            self.answers_ph = tf.placeholder(
                tf.int32,
                shape=[batch_size, num_rounds, max_answer_len],
                name="answers")
            self.answer_lengths_ph = tf.placeholder(
                tf.int32,
                shape=[batch_size, num_rounds],
                name="answer_lengths")
            self.targets_ph = tf.placeholder(
                tf.int32,
                shape=[batch_size, num_rounds, target_len],
                name="targets")

        self.image_feature_ph = tf.placeholder(tf.float32,
                                               shape=[batch_size, imgSize],
                                               name="image_feature")

        self.caption_ph = tf.placeholder(tf.int32,
                                         shape=[batch_size, max_caption_len],
                                         name="caption")
        self.caption_length_ph = tf.placeholder(tf.int32,
                                                shape=[batch_size],
                                                name="caption_length")

        self.questions_ph = tf.placeholder(
            tf.int32,
            shape=[batch_size, num_rounds, max_question_len],
            name="questions")
        self.question_lengths_ph = tf.placeholder(
            tf.int32,
            shape=[batch_size, num_rounds],
            name="question_lengths")

        START = tf.constant(value=[start] * batch_size)
        END = tf.constant(value=[end] * batch_size)

        # Embedding (shared by caption, questions, answers and decoder).
        with ops.device("/cpu:0"):
            if vs.get_variable_scope().initializer:
                initializer = vs.get_variable_scope().initializer
            else:
                # Default initializer for embeddings should have variance=1.
                sqrt3 = math.sqrt(
                    3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
                initializer = init_ops.random_uniform_initializer(
                    -sqrt3, sqrt3)
            # Created on both branches: previously this sat inside the
            # ``else`` and left ``embedding`` undefined whenever the
            # enclosing variable scope supplied its own initializer.
            embedding = vs.get_variable("embedding",
                                        [vocabSize, embedSize],
                                        initializer=initializer,
                                        dtype=tf.float32)

        START_EMB = embedding_ops.embedding_lookup(embedding, START)
        # END_EMB is not referenced again in this constructor; it is kept so
        # the set of graph ops stays as before.
        END_EMB = embedding_ops.embedding_lookup(embedding, END)

        # Split the per-round placeholders and embed them.
        questions = tf.split(
            value=self.questions_ph,
            num_or_size_splits=num_rounds,
            axis=1)  # num_rounds x [batch_size, 1, max_question_len]
        questions = [
            tf.squeeze(input=question, axis=1) for question in questions
        ]  # num_rounds x [batch_size, max_question_len]
        questions = [
            embedding_ops.embedding_lookup(embedding, question)
            for question in questions
        ]  # num_rounds x [batch_size, max_question_len, embedSize]

        question_lengths = tf.split(value=self.question_lengths_ph,
                                    num_or_size_splits=num_rounds,
                                    axis=1)
        # Squeeze axis 1 explicitly so a batch of one stays a rank-1 vector
        # (dynamic_rnn requires a vector for sequence_length).
        question_lengths = [
            tf.squeeze(question_length, axis=1)
            for question_length in question_lengths
        ]

        if is_training:
            answers = tf.split(value=self.answers_ph,
                               num_or_size_splits=num_rounds,
                               axis=1)
            answers = [tf.squeeze(input=answer, axis=1) for answer in answers]
            answers = [
                embedding_ops.embedding_lookup(embedding, answer)
                for answer in answers
            ]

            answer_lengths = tf.split(value=self.answer_lengths_ph,
                                      num_or_size_splits=num_rounds,
                                      axis=1)
            answer_lengths = [
                tf.squeeze(answer_length, axis=1)
                for answer_length in answer_lengths
            ]

            targets = tf.split(value=self.targets_ph,
                               num_or_size_splits=num_rounds,
                               axis=1)
            targets = [tf.squeeze(input=target, axis=1) for target in targets]

            # Loss mask: step i counts while i <= answer length, so the step
            # right after the last answer token is included as well.
            weights = []
            for r in range(num_rounds):
                answer_length = answer_lengths[r]
                step_mask = [
                    tf.greater_equal(x=answer_length, y=i)
                    for i in range(target_len)
                ]
                weights.append(
                    tf.cast(x=tf.stack(values=step_mask, axis=1),
                            dtype=tf.float32))  # [batch_size, target_len]

        # RNN cell factories.
        def single_cell():
            if use_lstm:
                return BasicLSTMCell(rnnHiddenSize, state_is_tuple=False)
            return GRUCell(rnnHiddenSize)

        def make_cell():
            if rnnLayers > 1:
                return MultiRNNCell([single_cell() for _ in range(rnnLayers)],
                                    state_is_tuple=False)
            return single_cell()

        encoder_cell = make_cell()
        decoder_cell = OutputProjectionWrapper(cell=make_cell(),
                                               output_size=vocabSize,
                                               activation=None)

        # Caption feature.
        caption = embedding_ops.embedding_lookup(
            embedding,
            self.caption_ph)  # [batch_size, max_caption_len, embedSize]
        # The placeholder is already rank 1; an axis-less squeeze would turn
        # it into a scalar for batch_size == 1.
        caption_length = self.caption_length_ph
        with tf.variable_scope('EncoderRNN') as varscope:
            _, captionState = dynamic_rnn(
                cell=encoder_cell,
                inputs=caption,
                sequence_length=caption_length,
                dtype=tf.float32,
                scope=varscope)  # [batch_size, encoder_cell.state_size]

        if is_training:
            losses = []
        else:
            ans_word_probs = []

        for r in range(num_rounds):
            # 1. question
            with tf.variable_scope('EncoderRNN', reuse=True) as varscope:
                _, questionState = dynamic_rnn(
                    cell=encoder_cell,
                    inputs=questions[r],
                    sequence_length=question_lengths[r],
                    dtype=tf.float32,
                    scope=varscope)

            # 2. history (seeded by the caption in the first round)
            if r == 0:
                historyState = captionState

            # 3. fusion
            concat = tf.concat(
                values=[self.image_feature_ph, questionState, historyState],
                axis=1)
            if is_training:
                concat = tf.nn.dropout(x=concat, keep_prob=keep_prob)
            with tf.variable_scope('Fusion', reuse=(r > 0)) as varscope:
                encoder_state = tf.contrib.layers.fully_connected(
                    inputs=concat,
                    num_outputs=decoder_cell.state_size,
                    activation_fn=tf.nn.tanh,
                    scope=varscope)

            # 4. decoder
            with tf.variable_scope('DecoderRNN', reuse=(r > 0)) as varscope:
                if is_training:
                    # Teacher forcing: START followed by the true answer.
                    answer = [
                        tf.squeeze(input=word, axis=1) for word in tf.split(
                            value=answers[r],
                            num_or_size_splits=max_answer_len,
                            axis=1)
                    ]
                    decoder_outputs, _ = rnn_decoder(
                        decoder_inputs=[START_EMB] + answer,
                        initial_state=encoder_state,
                        cell=decoder_cell,
                        loop_function=None,
                        scope=varscope)
                else:
                    self_answer = []
                    self_answer_emb = []

                    def loop_function(prev, _):
                        # Greedy decoding: feed back the arg-max word and
                        # record both its id and its embedding.
                        prev_symbol = math_ops.argmax(prev, 1)
                        self_answer.append(
                            tf.cast(x=prev_symbol, dtype=tf.int32))
                        emb_prev = embedding_ops.embedding_lookup(
                            embedding, prev_symbol)
                        self_answer_emb.append(emb_prev)
                        return emb_prev

                    # Only the first input is used as-is; the rest are
                    # replaced by loop_function.
                    decoder_outputs, _ = rnn_decoder(
                        decoder_inputs=[START_EMB] * target_len,
                        initial_state=encoder_state,
                        cell=decoder_cell,
                        loop_function=loop_function,
                        scope=varscope)

            # 5. update history with this round's question and answer
            with tf.variable_scope('EncoderRNN', reuse=True) as varscope:
                _, historyState = dynamic_rnn(
                    cell=encoder_cell,
                    inputs=questions[r],
                    sequence_length=question_lengths[r],
                    initial_state=historyState,
                    scope=varscope)
                if is_training:
                    _, historyState = dynamic_rnn(
                        cell=encoder_cell,
                        inputs=answers[r],
                        sequence_length=answer_lengths[r],
                        initial_state=historyState,
                        scope=varscope)
                else:
                    # Appending END guarantees an END token exists, so the
                    # arg-max below yields the index of the first END.
                    self_answer = tf.stack(
                        values=self_answer + [END],
                        axis=1)  # [batch_size, target_len]
                    self_answer_length = tf.argmax(input=tf.cast(
                        x=tf.equal(x=self_answer, y=end), dtype=tf.float32),
                                                   axis=1)
                    self_answer_emb = tf.stack(
                        values=self_answer_emb,
                        axis=1)  # [batch_size, max_answer_len, embedSize]
                    _, historyState = dynamic_rnn(
                        cell=encoder_cell,
                        inputs=self_answer_emb,
                        sequence_length=self_answer_length,
                        initial_state=historyState,
                        scope=varscope)

            if is_training:
                decoder_outputs = tf.stack(
                    values=decoder_outputs,
                    axis=1)  # [batch_size, target_len, vocabSize]
                loss = tf.contrib.seq2seq.sequence_loss(
                    logits=decoder_outputs,
                    targets=targets[r],
                    weights=weights[r],
                    average_across_batch=False)  # [batch_size]
                losses.append(loss)
            else:
                # log_softmax is the numerically stable form of
                # log(softmax(x)) and avoids -inf for tiny probabilities.
                decoder_outputs = [
                    tf.nn.log_softmax(decoder_output)
                    for decoder_output in decoder_outputs
                ]
                ans_word_probs.append(
                    tf.stack(values=decoder_outputs,
                             axis=1))  # [batch_size, target_len, vocabSize]
        if is_training:
            losses = tf.stack(values=losses,
                              axis=1)  # [batch_size, num_rounds]
            self.loss = tf.reduce_mean(losses)
            params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, params)
            clipped_gradients, _ = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.opt_op = tf.train.AdamOptimizer(
                self.learning_rate).apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step)
        else:
            self.ans_word_probs = tf.stack(
                values=ans_word_probs,
                axis=1)  # [batch_size, num_rounds, target_len, vocabSize]

        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=99999999)
示例#19
0
    def build(self):
        """Build the question-answering graph and publish its endpoints.

        The input text and the question are run through one shared GRU;
        the input states selected by ``x_mask`` become the facts, an
        ``EpisodeModule`` plus the same GRU refine a memory vector for
        ``params.memory_step`` passes, and a single linear layer maps the
        memory to one-word answer logits.

        Reads ``self.params``, ``self.words.vocab_size`` and
        ``self.global_step``. Sets the placeholders ``self.x``, ``self.q``,
        ``self.y``, ``self.mask``, ``self.is_training`` and the tensors/ops
        ``self.total_loss``, ``self.num_corrects``, ``self.accuracy`` and
        ``self.opt_op``.
        """
        params = self.params
        N, L, Q, F = params.batch_size, params.max_sent_size, params.max_ques_size, params.max_fact_count
        V, d, A = params.glove_size, params.hidden_size, self.words.vocab_size

        # Placeholders. The local is ``input_`` so the ``input`` builtin is
        # not shadowed; the graph name 'x' is unchanged.
        input_ = tf.placeholder(
            tf.float32, shape=[N, L, V],
            name='x')  # [num_batch, sentence_len, glove_dim]
        question = tf.placeholder(
            tf.float32, shape=[N, Q, V],
            name='q')  # [num_batch, question_len, glove_dim]
        answer = tf.placeholder(tf.int64, shape=[N],
                                name='y')  # [num_batch] - one word answer
        input_mask = tf.placeholder(tf.bool, shape=[N, L],
                                    name='x_mask')  # [num_batch, sentence_len]
        is_training = tf.placeholder(tf.bool)

        # One GRU cell is shared by the input, question and memory updates.
        gru = rnn_cell.GRUCell(d)

        # Input module
        with tf.variable_scope('input') as scope:
            # NOTE(review): make_decoder_batch_input presumably turns the
            # [N, T, V] tensor into a per-timestep list - confirm.
            input_list = self.make_decoder_batch_input(input_)
            input_states, _ = seq2seq.rnn_decoder(
                input_list, gru.zero_state(N, tf.float32), gru)

            # Question module: reuse the weights created just above.
            scope.reuse_variables()

            ques_list = self.make_decoder_batch_input(question)
            questions, _ = seq2seq.rnn_decoder(ques_list,
                                               gru.zero_state(N, tf.float32),
                                               gru)
            question_vec = questions[-1]  # use final state

        # Masking: extract fact vectors at end of sentence (details in paper).
        input_states = tf.transpose(tf.stack(input_states),
                                    [1, 0, 2])  # [N, L, D]
        per_example_facts = []
        for n in range(N):
            filtered = tf.boolean_mask(input_states[n, :, :],
                                       input_mask[n, :])  # [?, D]
            # Zero-pad every example up to F facts so they can be stacked.
            padding = tf.zeros(tf.stack([F - tf.shape(filtered)[0], d]))
            # TF >= 1.0 argument order (values first, then axis), matching
            # the tf.stack / tf.unstack calls used in this method.
            per_example_facts.append(
                tf.concat([filtered, padding], axis=0))  # [F, D]

        packed_facts = tf.stack(per_example_facts)  # [N, F, D]
        facts = tf.unstack(tf.transpose(packed_facts, [1, 0, 2]),
                           num=F)  # F x [N, D]

        # Episodic Memory
        with tf.variable_scope('episodic') as scope:
            episode = EpisodeModule(d, question_vec, facts)

            memory = tf.identity(question_vec)
            for _ in range(params.memory_step):
                memory = gru(episode.new(memory), memory)[0]
                # Later passes share the variables made by the first one.
                scope.reuse_variables()

        # Regularizations
        if params.batch_norm:
            memory = batch_norm(memory, is_training=is_training)
        memory = dropout(memory, params.keep_prob, is_training)

        with tf.name_scope('Answer'):
            # Answer module: feed-forward version (one-word answer).
            w_a = weight('w_a', [d, A])
            logits = tf.matmul(memory, w_a)  # [N, A]

        with tf.name_scope('Loss'):
            # Cross-entropy loss. Keyword arguments are mandatory for this
            # op in TF >= 1.0 and also remove any argument-order ambiguity.
            cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=answer, logits=logits)
            loss = tf.reduce_mean(cross_entropy)
            # Weight decay over whatever was registered in the 'l2'
            # collection while the graph was built.
            total_loss = loss + params.weight_decay * tf.add_n(
                tf.get_collection('l2'))

        with tf.variable_scope('Accuracy'):
            # Cast predictions to the label dtype: tf.equal requires both
            # operands to share a dtype, and ``answer`` is int64.
            predicts = tf.cast(tf.argmax(logits, 1), answer.dtype)
            corrects = tf.equal(predicts, answer)
            num_corrects = tf.reduce_sum(tf.cast(corrects, tf.float32))
            accuracy = tf.reduce_mean(tf.cast(corrects, tf.float32))

        # Training
        optimizer = tf.train.AdadeltaOptimizer(params.learning_rate)
        opt_op = optimizer.minimize(total_loss, global_step=self.global_step)

        # placeholders
        self.x = input_
        self.q = question
        self.y = answer
        self.mask = input_mask
        self.is_training = is_training

        # tensors
        self.total_loss = total_loss
        self.num_corrects = num_corrects
        self.accuracy = accuracy
        self.opt_op = opt_op
示例#20
0
    def __init__(self, args, infer=False):
        """Build the single-cell RNN language-model graph.

        Creates the input/target placeholders, the bookkeeping variables,
        the softmax projection and embedding under the ``rnnlm`` scope, the
        decoder, the loss and a gradient-clipped Adam training op.

        Args:
            args: configuration object. This method reads ``model``,
                ``rnn_size``, ``batch_size``, ``seq_length``,
                ``vocab_size`` and ``grad_clip``.
            infer: when true, ``args.batch_size`` and ``args.seq_length``
                are overwritten with 1 on the caller's object and ``loop``
                is handed to the decoder as its ``loop_function``.

        Raises:
            Exception: if ``args.model`` is not 'rnn', 'gru' or 'lstm'.
        """
        self.args = args
        if infer:
            # Inference works on one sequence of one symbol at a time.
            args.batch_size = 1
            args.seq_length = 1

        if args.model == 'rnn':
            make_cell = rnn_cell.BasicRNNCell
        elif args.model == 'gru':
            make_cell = rnn_cell.GRUCell
        elif args.model == 'lstm':
            make_cell = rnn_cell.LayerNormBasicLSTMCell
        else:
            raise Exception("model type not supported: {}".format(args.model))

        # A single cell is used as-is; args.num_layers is not read here.
        cell = make_cell(args.rnn_size)
        self.cell = cell

        batch_shape = [args.batch_size, args.seq_length]
        self.input_data = tf.placeholder(tf.int32, batch_shape)
        self.targets = tf.placeholder(tf.int32, batch_shape)
        self.initial_state = cell.zero_state(args.batch_size, tf.float32)

        # Non-trainable bookkeeping variables.
        self.batch_pointer = tf.Variable(0,
                                         name="batch_pointer",
                                         trainable=False,
                                         dtype=tf.int32)
        next_batch = self.batch_pointer + 1
        self.inc_batch_pointer_op = tf.assign(self.batch_pointer, next_batch)
        self.epoch_pointer = tf.Variable(0,
                                         name="epoch_pointer",
                                         trainable=False)
        self.batch_time = tf.Variable(0.0, name="batch_time", trainable=False)
        tf.summary.scalar("time_batch", self.batch_time)

        def variable_summaries(tensor):
            """Attach mean, max and min scalar summaries to a tensor."""
            with tf.name_scope('summaries'):
                tf.summary.scalar('mean', tf.reduce_mean(tensor))
                tf.summary.scalar('max', tf.reduce_max(tensor))
                tf.summary.scalar('min', tf.reduce_min(tensor))

        with tf.variable_scope('rnnlm'):
            softmax_w = tf.get_variable("softmax_w",
                                        [args.rnn_size, args.vocab_size])
            variable_summaries(softmax_w)
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
            variable_summaries(softmax_b)
            with tf.device("/cpu:0"):
                embedding = tf.get_variable("embedding",
                                            [args.vocab_size, args.rnn_size])
                embedded = tf.nn.embedding_lookup(embedding, self.input_data)
                # One tensor per time step, with the length-1 axis removed.
                per_step = tf.split(axis=1,
                                    num_or_size_splits=args.seq_length,
                                    value=embedded)
                inputs = [tf.squeeze(step, [1]) for step in per_step]

        def loop(prev_output, _):
            """Project an output to logits and embed its argmax symbol."""
            step_logits = tf.matmul(prev_output, softmax_w) + softmax_b
            best_symbol = tf.stop_gradient(tf.argmax(step_logits, 1))
            return tf.nn.embedding_lookup(embedding, best_symbol)

        loop_fn = loop if infer else None
        outputs, last_state = seq2seq.rnn_decoder(inputs,
                                                  self.initial_state,
                                                  cell,
                                                  loop_function=loop_fn,
                                                  scope='rnnlm')

        joined = tf.concat(axis=1, values=outputs)
        flat_outputs = tf.reshape(joined, [-1, args.rnn_size])
        self.logits = tf.matmul(flat_outputs, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)

        flat_targets = tf.reshape(self.targets, [-1])
        unit_weights = tf.ones([args.batch_size * args.seq_length])
        # NOTE(review): the meaning of the fourth positional argument depends
        # on which seq2seq module is imported -- confirm it is still expected.
        loss = seq2seq.sequence_loss_by_example([self.logits],
                                                [flat_targets],
                                                [unit_weights],
                                                args.vocab_size)
        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
        tf.summary.scalar("cost", self.cost)
        self.final_state = last_state

        # Learning rate starts at 0.0 and is a non-trainable variable.
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        raw_grads = tf.gradients(self.cost, tvars)
        grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
示例#21
0
    def __init__(self, args, embedding):
        """Build the 'STAND' language-model graph on a supplied embedding.

        Two decoders are built over the same inputs inside the ``STAND``
        variable scope (the second one with ``reuse=True``): one without a
        loop function, which feeds ``self.logits``, ``self.probs`` and the
        training loss, and one with ``loop`` as its loop function, which
        feeds ``self.self_feed_probs``.

        The name of every trainable variable is printed once while the
        graph is built.

        Args:
            args: configuration object. This method reads ``model``,
                ``rnn_size``, ``num_layers``, ``batch_size``,
                ``seq_length``, ``vocab_size`` and ``grad_clip``.
            embedding: embedding matrix used for all token lookups; stored
                as ``self.embedding``.

        Raises:
            Exception: if ``args.model`` is not 'rnn', 'gru' or 'lstm'.
        """
        self.args = args

        if args.model == 'rnn':
            cell_fn = rnn_cell.BasicRNNCell
        elif args.model == 'gru':
            cell_fn = rnn_cell.GRUCell
        elif args.model == 'lstm':
            cell_fn = rnn_cell.BasicLSTMCell
        else:
            raise Exception("model type not supported: {}".format(args.model))

        cell = cell_fn(args.rnn_size)

        self.cell = cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)

        self.input_data = tf.placeholder(tf.int32,
                                         [args.batch_size, args.seq_length],
                                         name='STAND_input')
        self.targets = tf.placeholder(tf.int32,
                                      [args.batch_size, args.seq_length],
                                      name='STAND_targets')
        self.initial_state = cell.zero_state(args.batch_size, tf.float32)
        self.embedding = embedding
        with tf.variable_scope('STAND'):
            softmax_w = tf.get_variable("softmax_w",
                                        [args.rnn_size, args.vocab_size])
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
            # tf.split / tf.concat keep the argument order this block was
            # written with (dimension first).
            splits = tf.split(
                1, args.seq_length,
                tf.nn.embedding_lookup(self.embedding, self.input_data))
            squeezed = [tf.squeeze(input_, [1]) for input_ in splits]
            # Must be a real list: `inputs` is consumed by two decoders
            # below, and a Python 3 `map` object would be exhausted after
            # the first one.
            inputs = [tf.nn.l2_normalize(step, 1) for step in squeezed]

        def loop(prev, i):
            """Embed (and L2-normalise) the argmax symbol of an output."""
            prev = tf.matmul(prev, softmax_w) + softmax_b
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.l2_normalize(
                tf.nn.embedding_lookup(embedding, prev_symbol), 1)

        # Decoder driven by the provided inputs.
        o, _ = seq2seq.rnn_decoder(inputs,
                                   self.initial_state,
                                   cell,
                                   loop_function=None,
                                   scope='STAND')
        # Second decoder over the same variables, this time with `loop`.
        with tf.variable_scope('STAND', reuse=True) as scope:
            sf_o, _ = seq2seq.rnn_decoder(inputs,
                                          self.initial_state,
                                          cell,
                                          loop_function=loop,
                                          scope=scope)
        output = tf.reshape(tf.concat(1, o), [-1, args.rnn_size])
        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)

        sf_output = tf.reshape(tf.concat(1, sf_o), [-1, args.rnn_size])
        self_feed_logits = tf.matmul(sf_output, softmax_w) + softmax_b
        self.self_feed_probs = tf.nn.softmax(self_feed_logits)

        # NOTE(review): the meaning of the fourth positional argument depends
        # on which seq2seq module is imported -- confirm it is still expected.
        loss = seq2seq.sequence_loss_by_example(
            [self.logits], [tf.reshape(self.targets, [-1])],
            [tf.ones([args.batch_size * args.seq_length])], args.vocab_size)
        self.loss = tf.reduce_sum(loss) / args.batch_size / args.seq_length
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                          args.grad_clip)
        # Parenthesised single-argument print: same output on Python 2,
        # valid syntax on Python 3.
        for v in tvars:
            print(v.name)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
示例#22
0
    def __init__(self, args, infer=False):
        """Build the (optionally attention-based) RNN language-model graph.

        Creates placeholders, the ``rnnlm`` softmax/embedding weights, a
        plain or attention decoder, the loss, a gradient-clipped Adam
        training op, a variable initialiser, a log-file name and a saver.

        Args:
            args: configuration object. This method reads ``rnncell``,
                ``rnn_size``, ``num_layers``, ``batch_size``,
                ``seq_length``, ``vocab_size``, ``embedding_size``,
                ``attention``, ``grad_clip`` and ``log_dir``.
            infer: when true, ``args.batch_size`` and ``args.seq_length``
                are overwritten with 1 on the caller's object and ``loop``
                is handed to the decoder as its ``loop_function``.

        Raises:
            Exception: if ``args.rnncell`` is not 'rnn', 'gru' or 'lstm'.
        """
        self.args = args
        if infer:
            args.batch_size = 1
            args.seq_length = 1

        if args.rnncell == 'rnn':
            cell_fn = rnn_cell.BasicRNNCell
        elif args.rnncell == 'gru':
            cell_fn = GRUCell
        elif args.rnncell == 'lstm':
            cell_fn = core_rnn_cell_impl.BasicLSTMCell
        else:
            raise Exception("rnncell type not supported: {}".format(
                args.rnncell))

        # Build a separate cell object per layer. Repeating one instance
        # (`[cell] * n`) makes every layer refer to the same object, which on
        # TensorFlow releases where a cell caches its variable scope ties the
        # layers' weights together (or fails outright) instead of giving
        # each layer its own parameters.
        layer_cells = [cell_fn(args.rnn_size) for _ in range(args.num_layers)]
        self.cell = MultiRNNCell(layer_cells)
        self.input_data = tf.placeholder(tf.int32,
                                         [args.batch_size, args.seq_length])
        self.targets = tf.placeholder(tf.int32,
                                      [args.batch_size, args.seq_length])
        self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)
        # Fixed dimensions of the attention-state placeholder.
        self.attn_length = 5
        self.attn_size = 32
        self.attention_states = tf.placeholder(
            tf.float32, [args.batch_size, self.attn_length, self.attn_size])
        with tf.variable_scope('rnnlm'):
            softmax_w = build_weight([args.rnn_size, args.vocab_size],
                                     name='soft_w')
            softmax_b = build_weight([args.vocab_size], name='soft_b')
            self.word_embedding = build_weight(
                [args.vocab_size, args.embedding_size], name='word_embedding')
            # One tensor per time step, with the length-1 axis removed.
            inputs_list = tf.split(
                tf.nn.embedding_lookup(self.word_embedding, self.input_data),
                args.seq_length, 1)
            inputs_list = [tf.squeeze(input_, [1]) for input_ in inputs_list]

        def loop(prev, _):
            """Project an output to logits and embed its argmax symbol."""
            prev = tf.matmul(prev, softmax_w) + softmax_b
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(self.word_embedding, prev_symbol)

        if not args.attention:
            outputs, last_state = seq2seq.rnn_decoder(
                inputs_list,
                self.initial_state,
                self.cell,
                loop_function=loop if infer else None,
                scope='rnnlm')
        else:
            outputs, last_state = attention_decoder(
                inputs_list,
                self.initial_state,
                self.attention_states,
                self.cell,
                loop_function=loop if infer else None,
                scope='rnnlm')

        self.final_state = last_state
        output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])
        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)
        # NOTE(review): the meaning of the fourth positional argument depends
        # on which seq2seq module is imported -- confirm it is still expected.
        loss = seq2seq.sequence_loss_by_example(
            [self.logits], [tf.reshape(self.targets, [-1])],
            [tf.ones([args.batch_size * args.seq_length])], args.vocab_size)
        # average loss for each word of each timestep
        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
        self.lr = tf.Variable(0.0, trainable=False)
        self.var_trainable_op = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, self.var_trainable_op), args.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(
            zip(grads, self.var_trainable_op))
        self.initial_op = tf.global_variables_initializer()
        # Log file name: log_dir + timestamp with the space (and any '/')
        # stripped, e.g. "<log_dir>2017-01-0112:00:00.txt".
        stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self.logfile = args.log_dir + (stamp + '.txt').replace(
            ' ', '').replace('/', '')
        self.var_op = tf.global_variables()
        self.saver = tf.train.Saver(self.var_op,
                                    max_to_keep=4,
                                    keep_checkpoint_every_n_hours=1)
示例#23
0
    def build_graph(self, test):
        """
        Builds a graph in TensorFlow.

        Creates the stacked LSTM cell, input/target placeholders, the
        ``lstm_vars`` projection and embedding variables, the decoder, the
        softmax probabilities, the loss and an Adam training op, and stores
        them on ``self``.

        Reads ``self.cell_size``, ``self.num_layers``, ``self.vocab_size``,
        ``self.batch_size`` and ``self.seq_len``.

        Args:
            test: when true, ``self.batch_size`` and ``self.seq_len`` are
                overwritten with 1 and ``loop`` is handed to the decoder as
                its ``loop_function``.
        """
        if test:
            self.batch_size = 1
            self.seq_len = 1

        ##
        # Cells
        ##

        # Build a separate cell object per layer. Repeating one instance
        # (`[lstm_cell] * n`) makes every layer refer to the same object,
        # which on TensorFlow releases where a cell caches its variable scope
        # ties the layers' weights together instead of giving each layer its
        # own parameters.
        lstm_cells = [rnn_cell.BasicLSTMCell(self.cell_size)
                      for _ in range(self.num_layers)]
        self.cell = rnn_cell.MultiRNNCell(lstm_cells)

        ##
        # Data
        ##

        # inputs and targets are 2D tensors of shape [batch_size, seq_len]
        self.inputs = tf.placeholder(tf.int32, [self.batch_size, self.seq_len])
        self.targets = tf.placeholder(tf.int32, [self.batch_size, self.seq_len])
        self.initial_state = self.cell.zero_state(self.batch_size, tf.float32)

        ##
        # Variables
        ##
        with tf.variable_scope('lstm_vars'):
            self.ws = tf.get_variable('ws', [self.cell_size, self.vocab_size])
            self.bs = tf.get_variable('bs', [self.vocab_size])  # TODO: initializer?
            with tf.device('/cpu:0'):  # embedding and lookup are pinned to the CPU
                self.embeddings = tf.get_variable('embeddings',
                                                  [self.vocab_size, self.cell_size])

                # get embeddings for all input words
                input_embeddings = tf.nn.embedding_lookup(self.embeddings, self.inputs)
                # The split cuts this tensor into a seq_len-long list along
                # axis 1; the squeeze then removes that length-1 axis from
                # each piece.
                inputs_split = tf.split(input_embeddings, self.seq_len, 1)
                inputs_split = [tf.squeeze(input_, [1]) for input_ in inputs_split]

        def loop(prev, _):
            """Project an output to logits and embed its argmax symbol."""
            prev = tf.matmul(prev, self.ws) + self.bs
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(self.embeddings, prev_symbol)

        lstm_outputs_split, self.final_state = seq2seq.rnn_decoder(
            inputs_split,
            self.initial_state,
            self.cell,
            loop_function=loop if test else None,
            scope='lstm_vars')
        lstm_outputs = tf.reshape(tf.concat(lstm_outputs_split, 1),
                                  [-1, self.cell_size])

        logits = tf.matmul(lstm_outputs, self.ws) + self.bs
        self.probs = tf.nn.softmax(logits)

        ##
        # Train
        ##

        # NOTE(review): the meaning of the fourth positional argument depends
        # on which seq2seq module is imported -- confirm it is still expected.
        total_loss = seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(self.targets, [-1])],
            [tf.ones([self.batch_size * self.seq_len])],
            self.vocab_size)
        self.loss = tf.reduce_sum(total_loss) / self.batch_size / self.seq_len

        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        self.optimizer = tf.train.AdamOptimizer(learning_rate=c.L_RATE, name='optimizer')
        self.train_op = self.optimizer.minimize(self.loss,
                                                global_step=self.global_step,
                                                name='train_op')
示例#24
0
    def __init__(self,
                 config,
                 decay_step,
                 is_training=False,
                 is_translate=False):
        """Build the glimpse-based recurrent classification graph.

        Wires together the glimpse, actor, LSTM, critic and classify
        sub-networks, exposes ``self.logits``, ``self.prediction`` and
        ``self.softmax``, and (when training) the loss, the training op and
        the merged summaries.

        Args:
            config: configuration object. This method reads
                ``input_img_size``, ``img_size``, ``learning_rate``,
                ``decay_factor``, ``min_learning_rate``, ``cell_size``,
                ``loc_dim``, ``num_glimpses`` and ``max_gradient_norm``,
                and passes ``config`` on to the sub-networks.
            decay_step: step count handed to ``tf.train.exponential_decay``.
            is_training: forwarded to ``ActorNetwork`` as ``is_sampling``;
                when true the loss, training op and merged summaries are
                also created.
            is_translate: when true the input images go through
                ``self._translate_image`` before use; also forwarded to
                ``GlimpseNetwork``.
        """

        # image means feed-in images: batch_size * img_size^2
        # label: labels of images not one hot representation
        self.config = config
        self.decay_step = decay_step
        self.is_training = is_training
        self.is_translate = is_translate

        # input data placeholders
        with tf.name_scope('input'):
            # Flattened images; the batch dimension is left unspecified.
            self.image = tf.placeholder(
                tf.float32,
                [None, config.input_img_size * config.input_img_size])
            self.label = tf.placeholder(tf.int64, [None])

        with tf.name_scope('image_translate'):
            # translate MNIST data if need
            if self.is_translate:
                # Unflatten to [batch, input_img_size, input_img_size, 1]
                # so the translation helper can operate on 4D images.
                img = tf.reshape(self.image, [
                    tf.shape(self.image)[0], config.input_img_size,
                    config.input_img_size, 1
                ],
                                 name='2D_2_4D')

                self.proc_image = self._translate_image(img)
                # reshape into 2D tensor: [batch_size, img_size^2]
                # new_img_size = self.proc_image.get_shape().as_list()
                # print(new_img_size)
                self.proc_image = tf.reshape(self.proc_image, [
                    tf.shape(self.image)[0], config.img_size * config.img_size
                ],
                                             name='4D_2_2D')
            else:
                self.proc_image = self.image

        with tf.name_scope('global_step'):
            self.global_step = tf.Variable(0, trainable=False)

        # define learning rate
        with tf.name_scope('learning_rate'):
            # Staircase exponential decay, floored at min_learning_rate.
            self.learning_rate = tf.maximum(
                tf.train.exponential_decay(config.learning_rate,
                                           self.global_step,
                                           decay_step,
                                           config.decay_factor,
                                           staircase=True),
                config.min_learning_rate)

            tf.summary.scalar("learning_rate", self.learning_rate)

        # Glimpse Network
        with tf.name_scope('glimpse_net'):
            self.glimpse_network = GlimpseNetwork(
                config=config, is_translate=self.is_translate)

        # Actor Network
        with tf.name_scope('actor_net'):
            self.actor_network = ActorNetwork(config=config,
                                              rnn_output_size=config.cell_size,
                                              is_sampling=self.is_training)

        # LSTM Network
        with tf.name_scope('lstm'):
            cell = BasicLSTMCell(config.cell_size, name='basic_lstm_cell')

            with tf.name_scope('initialization'):
                with tf.name_scope('batch_size'):
                    # Dynamic batch size taken from the fed image tensor.
                    batch_size = tf.shape(self.image)[0]

                with tf.name_scope('init_locs'):
                    # First location: uniform random values in [-1, 1).
                    init_locs = tf.random_uniform(
                        shape=[batch_size, config.loc_dim],
                        minval=-1,
                        maxval=1,
                        name='sampling')

                with tf.name_scope('init_state'):
                    init_state = cell.zero_state(batch_size, tf.float32)

                # transfer glimpse network output into 2D list
                # rnn_inputs: 3D list [[batch_size, 256], ...]
                with tf.name_scope('init_glimpse'):
                    init_glimpse = self.glimpse_network(
                        self.proc_image, init_locs)

                with tf.name_scope('rnn_inputs'):
                    # Only the first entry is a real tensor; the remaining
                    # num_glimpses entries are 0 placeholders -- presumably
                    # replaced by loop_function's return value inside
                    # rnn_decoder (TODO confirm against rnn_decoder).
                    rnn_inputs = [init_glimpse]
                    rnn_inputs.extend([0] * config.num_glimpses)

                with tf.name_scope('init_list'):
                    # Filled as a side effect of loop_function below.
                    self.locs, self.loc_means, self.retina_reprsent = [], [], []

            # with tf.name_scope('rnn_decoder'):
            def loop_function(prev, _):
                # Turn the previous output into a location via the actor
                # network, record it (plus its mean and the retina
                # representation) on the instance lists, and return the
                # glimpse at that location.
                loc, loc_mean = self.actor_network(prev)
                self.locs.append(loc)
                self.loc_means.append(loc_mean)
                glimpse = self.glimpse_network(self.proc_image, loc)
                self.retina_reprsent.append(
                    self.glimpse_network.retina_sensor.retina_reprsent)
                return glimpse

            self.rnn_outputs, _ = rnn_decoder(rnn_inputs,
                                              init_state,
                                              cell,
                                              loop_function=loop_function)

        # Critic Network
        with tf.name_scope('critic_net'):
            self.critic_network = CriticNetwork(
                config=config, rnn_output_size=cell.output_size)

        # Classify Network
        with tf.name_scope('classify_net'):
            self.classify_network = ClassifyNetwork(
                config=config, rnn_output_size=cell.output_size)

        # Classification uses only the last decoder output.
        rnn_last_output = self.rnn_outputs[-1]
        self.logits = self.classify_network(rnn_last_output)
        with tf.name_scope('argmax'):
            self.prediction = tf.argmax(self.logits, 1)  # [batch_size]
        with tf.name_scope('softmax'):
            self.softmax = tf.nn.softmax(self.logits)

        if is_training:
            # hybrid loss: classification loss, RL reward, baseline loss
            with tf.name_scope('total_loss'):
                # self.total_loss() is defined outside this block.
                self.loss = self.total_loss()
                tf.summary.scalar("total_loss", self.loss)

            with tf.name_scope('train'):
                var_list = tf.trainable_variables()
                gradients = tf.gradients(self.loss, var_list)

                # Clip by global norm before applying with Adam; applying
                # the gradients also advances self.global_step.
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, config.max_gradient_norm)
                self.train_op = tf.train.AdamOptimizer(
                    self.learning_rate).apply_gradients(
                        zip(clipped_gradients, var_list),
                        global_step=self.global_step)

            with tf.name_scope('merge'):
                self.merged = tf.summary.merge_all()