Example #1
    def __init__(self, sess, network=None):
        self.sess = sess
        self.hp = Hyperparameters()

        # input placeholders
        self.state = network[0][0]
        # self.action = network[0][1]
        self.target_value = network[0][1]
        # outputs
        self.value = network[1][0]
        self.td_error = network[1][1]
        self.loss = network[1][2]
        self.train_op = network[1][3]
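
The constructor above simply unpacks the nested list returned by build_critic_network (Example #2). Below is a minimal, hedged wiring sketch; the class name Critic is an assumption, since the listing only shows __init__.

# Hedged sketch, not from the original source: wiring __init__ above to the
# graph builder from Example #2. "Critic" is an assumed class name.
import tensorflow as tf

network = build_critic_network()  # [[state, target_value], [value, td_error, loss, train_op]]
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    critic = Critic(sess, network=network)
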
Example #2
import tensorflow as tf


# Hyperparameters is the project's own config class (its import is not shown in this listing).
def build_critic_network(lr=None,
                         n_stack=None,
                         image_size=None,
                         n_actions=None):
    """Build the network for the critic."""
    # init Hp
    hp = Hyperparameters()
    flag = hp.model
    if lr is None:
        lr = hp.LEARNING_RATE
    if n_stack is None:
        n_stack = hp.N_STACK
    if image_size is None:
        image_size = hp.IMAGE_SIZE
    if n_actions is None:
        n_actions = hp.N_ACTIONS

    state = tf.placeholder(tf.float32, [None, n_stack, image_size, image_size],
                           'state_' + flag)
    # action = tf.placeholder(tf.int32, [None, ], 'act_' + flag)
    target_value = tf.placeholder(tf.float32, [None, n_actions],
                                  'target_v_' + flag)
    # next_value = tf.placeholder(tf.float32, [None, ], 'v_next_' + flag)
    # reward = tf.placeholder(tf.float32, [None, ], 'r_'+flag)

    with tf.variable_scope('Critic_' + flag):
        input_crop = state / 255
        input = tf.transpose(input_crop, [0, 2, 3, 1])  # (?, 80, 80, 4)

        # tf.contrib.layers.conv2d(..., activation_fn=tf.nn.relu,...)
        conv1 = tf.contrib.layers.conv2d(inputs=input,
                                         num_outputs=32,
                                         kernel_size=8,
                                         stride=4)  # (?, 20, 20, 32)
        conv2 = tf.contrib.layers.conv2d(inputs=conv1,
                                         num_outputs=64,
                                         kernel_size=4,
                                         stride=2)  # (?, 10, 10, 64)
        conv3 = tf.contrib.layers.conv2d(inputs=conv2,
                                         num_outputs=64,
                                         kernel_size=3,
                                         stride=1)  # (?, 10, 10, 64)

        flat = tf.contrib.layers.flatten(conv3)
        f = tf.contrib.layers.fully_connected(flat, 512)
        value = tf.contrib.layers.fully_connected(f,
                                                  n_actions,
                                                  activation_fn=None)

    with tf.variable_scope('squared_TD_error_' + flag):
        # eval_value = tf.reduce_sum(value * tf.one_hot(action, n_actions), axis=1)
        # target_value = reward + hp.DISCOUNT_FACTOR * next_value
        # td_error = target_value - eval_value  # td_error shape (?,)
        # loss = tf.square(td_error)  # loss shape (?,)
        td_error = tf.reduce_sum(tf.subtract(target_value, value), axis=1)
        loss = tf.reduce_mean(tf.squared_difference(value, target_value))
    with tf.variable_scope('train_' + flag):
        train_op = tf.train.RMSPropOptimizer(lr).minimize(loss)

    return [[state, target_value], [value, td_error, loss, train_op]]
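
Not part of the original listing: a hedged sketch of a single critic update using the placeholders and ops returned above. The batch size, n_actions=4, n_stack=4, and image_size=80 are illustrative assumptions, and Hyperparameters must still be importable from the project.

import numpy as np
import tensorflow as tf

# Build the critic graph with explicit sizes so the dummy batches below match.
[state_ph, target_ph], [value_op, td_error_op, loss_op, train_op] = \
    build_critic_network(lr=1e-4, n_stack=4, image_size=80, n_actions=4)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch_state = np.zeros((32, 4, 80, 80), dtype=np.float32)   # stacked frames
    batch_target = np.zeros((32, 4), dtype=np.float32)          # per-action target values
    _, loss_val, td = sess.run([train_op, loss_op, td_error_op],
                               feed_dict={state_ph: batch_state,
                                          target_ph: batch_target})
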
Example #3
import tensorflow as tf


def build_actor_network(lr=None,
                        n_stack=None,
                        image_size=None,
                        n_actions=None):
    """Build the network for the actor."""
    # init Hp
    hp = Hyperparameters()
    flag = hp.model
    if lr is None:
        lr = hp.LEARNING_RATE
    if n_stack is None:
        n_stack = hp.N_STACK
    if image_size is None:
        image_size = hp.IMAGE_SIZE
    if n_actions is None:
        n_actions = hp.N_ACTIONS

    state = tf.placeholder(tf.float32, [None, n_stack, image_size, image_size],
                           'state_' + flag)
    action = tf.placeholder(tf.int32, [None], 'act_' + flag)
    td_error = tf.placeholder(tf.float32, [None], 'td_error_' + flag)  # TD error from the critic

    with tf.variable_scope('Actor'):
        input_crop = state / 255
        input = tf.transpose(input_crop, [0, 2, 3, 1])  # (?, 80, 80, 4)

        # tf.contrib.layers.conv2d applies ReLU by default, so an extra
        # tf.nn.relu wrapper would be a no-op; the layers are kept bare as in the critic.
        conv1 = tf.contrib.layers.conv2d(inputs=input,
                                         num_outputs=32,
                                         kernel_size=8,
                                         stride=4)  # (?, 20, 20, 32)
        conv2 = tf.contrib.layers.conv2d(inputs=conv1,
                                         num_outputs=64,
                                         kernel_size=4,
                                         stride=2)  # (?, 10, 10, 64)
        conv3 = tf.contrib.layers.conv2d(inputs=conv2,
                                         num_outputs=64,
                                         kernel_size=3,
                                         stride=1)  # (?, 10, 10, 64)

        flat = tf.contrib.layers.flatten(conv3)
        f = tf.contrib.layers.fully_connected(flat, 512)
        logits = tf.contrib.layers.fully_connected(f,
                                                   n_actions,
                                                   activation_fn=None)  # raw logits
        acts_prob = tf.nn.softmax(logits)  # softmax converts logits to action probabilities

    with tf.variable_scope('exp_v_' + flag):
        # negative log-probability of the action actually taken
        neg_log_prob = -tf.log(tf.clip_by_value(acts_prob, 1e-10, 1.0)) * tf.one_hot(
            action, n_actions)
        neg_log_prob = tf.reduce_sum(neg_log_prob, axis=1)
        # advantage (TD error) guided loss; the minus sign is already built in
        exp_v = tf.reduce_mean(neg_log_prob * td_error)

    with tf.variable_scope('train_' + flag):
        # exp_v already carries the minus sign, so minimizing it maximizes
        # the TD-error-weighted log-probability of the taken actions
        train_op = tf.train.RMSPropOptimizer(lr).minimize(exp_v)

    return [[state, action, td_error], [acts_prob, exp_v, train_op]]
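
Not part of the original listing: a hedged sketch of one actor update, driven by TD errors that would come from the critic in Example #2. All sizes are illustrative assumptions.

import numpy as np
import tensorflow as tf

[state_ph, action_ph, td_ph], [acts_prob_op, exp_v_op, train_op] = \
    build_actor_network(lr=1e-4, n_stack=4, image_size=80, n_actions=4)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch_state = np.zeros((32, 4, 80, 80), dtype=np.float32)  # stacked frames
    batch_action = np.zeros((32,), dtype=np.int32)             # actions actually taken
    batch_td = np.zeros((32,), dtype=np.float32)               # TD errors from the critic
    probs, _ = sess.run([acts_prob_op, train_op],
                        feed_dict={state_ph: batch_state,
                                   action_ph: batch_action,
                                   td_ph: batch_td})
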