예제 #1
0
    def __init__(self,
                 state_shape: Sequence[int],
                 n_actions: int,
                 n_hidden: int,
                 lstm_size: int = 256,
                 summary: bool = True) -> None:
        """Build the graph of a CNN + LSTM actor-critic network for discrete actions.

        The graph consists of 4 convolution layers with ELU activations, an
        LSTM over the flattened convolution output, and two linear heads on
        the LSTM output: the action logits and the state value.

        Args:
            state_shape: Shape of a single state, without the batch dimension.
                Any sequence is accepted (list, tuple, ...).
            n_actions: Number of discrete actions; size of the logits output.
            n_hidden: Only stored as an attribute by this constructor.
            lstm_size: Number of units of the LSTM cell.
            summary: Only stored as an attribute by this constructor.
        """
        super(ActorCriticNetworkDiscreteCNNRNN, self).__init__()
        self.state_shape: Sequence[int] = state_shape
        self.n_actions: int = n_actions
        self.n_hidden: int = n_hidden
        self.summary: bool = summary

        # `list(...)` so that tuple shapes can be concatenated with `[None]`;
        # `[None] + (84, 84, 1)` would raise a TypeError.
        self.states = tf.placeholder(tf.float32, [None] + list(state_shape), name="states")
        # No static shape is declared for this placeholder.
        self.actions_taken = tf.placeholder(tf.float32, name="actions_taken")

        x = self.states
        # Convolution layers, named "l1" .. "l4"
        # (presumably 32 filters of 3x3 with stride 2 -- confirm against `conv2d`)
        for i in range(4):
            x = tf.nn.elu(conv2d(x, 32, "l{}".format(i + 1), [3, 3], [2, 2]))

        # Flatten, then add a leading dimension of size 1. The LSTM state below
        # is created for batch size 1, so presumably all fed states are treated
        # as the time steps of one sequence -- TODO confirm with the caller.
        reshape = tf.expand_dims(flatten(x), [0])

        self.enc_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        lstm_state_size = self.enc_cell.state_size
        # Zero-valued numpy arrays with the sizes of the cell (c) and hidden (h)
        # state for batch size 1.
        c_init = np.zeros((1, lstm_state_size.c), np.float32)
        h_init = np.zeros((1, lstm_state_size.h), np.float32)
        self.state_init = [c_init, h_init]
        self.rnn_state_in = self.enc_cell.zero_state(1, tf.float32)
        # Register the input/output LSTM state tensors in graph collections
        # under fixed keys (presumably to look them up again after restoring
        # the graph -- verify against the code that loads the model).
        tf.add_to_collection("rnn_state_in_c", self.rnn_state_in.c)
        tf.add_to_collection("rnn_state_in_h", self.rnn_state_in.h)
        L3, self.rnn_state_out = tf.nn.dynamic_rnn(cell=self.enc_cell,
                                                   inputs=reshape,
                                                   initial_state=self.rnn_state_in,
                                                   dtype=tf.float32)
        tf.add_to_collection("rnn_state_out_c", self.rnn_state_out.c)
        tf.add_to_collection("rnn_state_out_h", self.rnn_state_out.h)
        # Drop the leading dimension again: one row of `lstm_size` values per step.
        L3 = tf.reshape(L3, [-1, lstm_size])

        # Fully connected for actor and critic
        self.logits = linear(L3, n_actions, "actionlogits", normalized_columns_initializer(0.01))
        # The value head has a single output per row; flatten it to a vector.
        self.value = tf.reshape(linear(L3, 1, "value", normalized_columns_initializer(1.0)), [-1])

        self.probs = tf.nn.softmax(self.logits)

        # Sample one action per row. Subtracting the row-wise maximum keeps the
        # logits <= 0 without changing the distribution that is sampled from.
        self.action = tf.squeeze(tf.multinomial(
            self.logits - tf.reduce_max(self.logits, [1], keepdims=True), 1), [1], name="action")
        # One-hot encode the sampled actions and keep only the first row.
        self.action = tf.one_hot(self.action, n_actions)[0, :]

        # Log probabilities of all actions
        self.log_probs = tf.nn.log_softmax(self.logits)
        # Log probability of the action that was actually taken
        # (assumes `actions_taken` is one-hot encoded -- TODO confirm)
        self.action_log_prob = tf.reduce_sum(self.log_probs * self.actions_taken, [1])

        # Element-wise p * log(p); neither negated nor reduced here.
        self.entropy = self.probs * self.log_probs
예제 #2
0
    def __init__(self, state_shape, n_actions, n_hidden, summary=True):
        """Build the graph of a CNN actor-critic network for discrete actions.

        The graph consists of 4 convolution layers with ELU activations and
        two linear heads on the flattened convolution output (action logits
        and state value), followed by the actor, critic and entropy terms
        that are combined into `self.loss`.

        Args:
            state_shape: Shape of a single state, without the batch dimension.
                Any sequence is accepted (list, tuple, ...).
            n_actions: Number of discrete actions; size of the logits output.
            n_hidden: Only stored as an attribute by this constructor.
            summary: Only stored as an attribute by this constructor.
        """
        super(ActorCriticNetworkDiscreteCNN, self).__init__()
        self.state_shape = state_shape
        self.n_actions = n_actions
        self.n_hidden = n_hidden
        self.summary = summary

        # `list(...)` so that tuple shapes can be concatenated with `[None]`;
        # `[None] + (84, 84, 1)` would raise a TypeError.
        self.states = tf.placeholder(tf.float32, [None] + list(state_shape),
                                     name="states")
        self.adv = tf.placeholder(tf.float32, name="advantage")
        self.actions_taken = tf.placeholder(tf.float32, [None, n_actions],
                                            name="actions_taken")
        # Target that the value output is regressed towards (see critic_loss).
        self.r = tf.placeholder(tf.float32, [None], name="r")

        x = self.states
        # Convolution layers, named "l1" .. "l4"
        # (presumably 32 filters of 3x3 with stride 2 -- confirm against `conv2d`)
        for i in range(4):
            x = tf.nn.elu(conv2d(x, 32, "l{}".format(i + 1), [3, 3], [2, 2]))

        # Flatten
        shape = x.get_shape().as_list()
        reshape = tf.reshape(x, [-1, shape[1] * shape[2] * shape[3]
                                 ])  # -1 for the (unknown) batch size

        # Fully connected for Actor & Critic
        self.logits = linear(reshape, n_actions, "actionlogits",
                             normalized_columns_initializer(0.01))
        # The value head has a single output per row; flatten it to a vector.
        self.value = tf.reshape(
            linear(reshape, 1, "value", normalized_columns_initializer(1.0)),
            [-1])

        self.probs = tf.nn.softmax(self.logits)

        # Sample one action per row. Subtracting the row-wise maximum keeps the
        # logits <= 0 without changing the distribution that is sampled from.
        # NOTE(review): `keep_dims` looks like the older spelling of `keepdims`;
        # confirm the targeted TensorFlow version before renaming it.
        self.action = tf.squeeze(tf.multinomial(
            self.logits - tf.reduce_max(self.logits, [1], keep_dims=True), 1),
                                 [1],
                                 name="action")
        # One-hot encode the sampled actions and keep only the first row.
        self.action = tf.one_hot(self.action, n_actions)[0, :]

        log_probs = tf.nn.log_softmax(self.logits)
        # Actor loss: negated sum over rows of (log-prob selected through
        # `actions_taken`) * advantage.
        self.actor_loss = -tf.reduce_sum(
            tf.reduce_sum(log_probs * self.actions_taken, [1]) * self.adv)

        # Critic loss: half the summed squared difference between value and r.
        self.critic_loss = 0.5 * tf.reduce_sum(tf.square(self.value - self.r))

        # Entropy summed over all rows and actions.
        entropy = -tf.reduce_sum(self.probs * log_probs)

        # Total loss: the critic term is weighted by 0.5 and the entropy is
        # subtracted with weight 0.01.
        self.loss = self.actor_loss + 0.5 * self.critic_loss - entropy * 0.01
        self.summary_loss = self.loss

        # Trainable variables that live in the current variable scope.
        self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)
예제 #3
0
    def __init__(self,
                 state_shape: Sequence[int],
                 n_actions: int,
                 n_hidden: int,
                 summary: bool = True) -> None:
        """Build the graph of a CNN actor-critic network for discrete actions.

        The graph consists of 4 convolution layers with ELU activations and
        two linear heads on the flattened convolution output: the action
        logits and the state value.

        Args:
            state_shape: Shape of a single state, without the batch dimension.
                Any sequence is accepted (list, tuple, ...).
            n_actions: Number of discrete actions; size of the logits output.
            n_hidden: Only stored as an attribute by this constructor.
            summary: Only stored as an attribute by this constructor.
        """
        super(ActorCriticNetworkDiscreteCNN, self).__init__()
        self.state_shape = state_shape
        self.n_actions = n_actions
        self.n_hidden = n_hidden
        self.summary = summary

        # `list(...)` so that tuple shapes can be concatenated with `[None]`;
        # `[None] + (84, 84, 1)` would raise a TypeError.
        self.states = tf.placeholder(tf.float32, [None] + list(state_shape),
                                     name="states")
        self.actions_taken = tf.placeholder(tf.float32, [None, n_actions],
                                            name="actions_taken")

        x = self.states
        # Convolution layers, named "l1" .. "l4"
        # (presumably 32 filters of 3x3 with stride 2 -- confirm against `conv2d`)
        for i in range(4):
            x = tf.nn.elu(conv2d(x, 32, "l{}".format(i + 1), [3, 3], [2, 2]))

        # Flatten
        shape = x.get_shape().as_list()
        # -1 for the (unknown) batch size
        reshape = tf.reshape(x, [-1, shape[1] * shape[2] * shape[3]])

        # Fully connected for actor & critic
        self.logits = linear(reshape, n_actions, "actionlogits",
                             normalized_columns_initializer(0.01))
        # The value head has a single output per row; flatten it to a vector.
        self.value = tf.reshape(
            linear(reshape, 1, "value", normalized_columns_initializer(1.0)),
            [-1])

        self.probs = tf.nn.softmax(self.logits)

        # Sample one action per row. Subtracting the row-wise maximum keeps the
        # logits <= 0 without changing the distribution that is sampled from.
        self.action = tf.squeeze(tf.multinomial(
            self.logits - tf.reduce_max(self.logits, [1], keepdims=True), 1),
                                 [1],
                                 name="action")
        # One-hot encode the sampled actions and keep only the first row.
        self.action = tf.one_hot(self.action, n_actions)[0, :]

        # Log probabilities of all actions
        self.log_probs = tf.nn.log_softmax(self.logits)
        # Log probability of the action that was actually taken
        # (assumes `actions_taken` is one-hot encoded -- TODO confirm)
        self.action_log_prob = tf.reduce_sum(
            self.log_probs * self.actions_taken, [1])

        # Element-wise p * log(p); neither negated nor reduced here.
        self.entropy = self.probs * self.log_probs
예제 #4
0
    def build_network(self):
        """Assemble the policy graph: placeholders, 4 conv layers, an LSTM and a softmax head.

        Sets `self.states`, `self.a_n`, `self.adv_n` and `self.N` as
        placeholders, and exposes the LSTM state tensors, the action
        probabilities (`self.probs`) and a sampled action (`self.action`).
        """
        self.rnn_state = None
        # Discrete action
        self.a_n = tf.placeholder(tf.float32, name="a_n")
        # Advantage
        self.adv_n = tf.placeholder(tf.float32, name="adv_n")

        # Square input frames with a single feature map, because grayscale
        # images are used (an RGB image would have 3).
        frame_side = 80
        frame_channels = 1
        input_shape = [None, frame_side, frame_side, frame_channels]
        self.states = tf.placeholder(tf.float32, input_shape, name="states")
        self.N = tf.placeholder(tf.int32, name="N")

        # Convolution layers, named "l1" .. "l4"
        net = self.states
        for layer_nr in range(1, 5):
            layer_name = "l{}".format(layer_nr)
            net = tf.nn.elu(conv2d(net, 32, layer_name, [3, 3], [2, 2]))

        # Flatten everything but the first dimension; -1 stands for the
        # (unknown) batch size.
        dims = net.get_shape().as_list()
        n_features = dims[1] * dims[2] * dims[3]
        flat = tf.reshape(net, [-1, n_features])

        # Add a leading dimension of size 1 before feeding the LSTM, whose
        # initial state is created for batch size 1.
        rnn_input = tf.expand_dims(flatten(flat), [0])
        self.enc_cell = tf.contrib.rnn.BasicLSTMCell(
            self.config["n_hidden_units"])
        self.rnn_state_in = self.enc_cell.zero_state(1, tf.float32)
        rnn_result = tf.nn.dynamic_rnn(cell=self.enc_cell,
                                       inputs=rnn_input,
                                       initial_state=self.rnn_state_in,
                                       dtype=tf.float32)
        self.L3, self.rnn_state_out = rnn_result

        # Softmax layer over the actions, applied to the first (and only)
        # entry along the leading dimension of the LSTM output.
        head_input = self.L3[0]
        n_outputs = self.env_runner.nA
        weights_init = tf.truncated_normal_initializer(mean=0.0, stddev=0.02)
        self.probs = tf.contrib.layers.fully_connected(
            inputs=head_input,
            num_outputs=n_outputs,
            activation_fn=tf.nn.softmax,
            weights_initializer=weights_init,
            biases_initializer=tf.zeros_initializer())

        # Sample one action from the log of the probabilities.
        log_probs = tf.log(self.probs)
        sampled = tf.multinomial(log_probs, 1)
        self.action = tf.squeeze(sampled, name="action")
예제 #5
0
    def build_network(self):
        """Assemble the policy graph: placeholders, 4 conv layers and 2 dense layers.

        Sets `self.states`, `self.a_n`, `self.N` and `self.adv_n` as
        placeholders, and exposes the hidden layer (`self.L3`), the action
        probabilities (`self.probs`) and a sampled action (`self.action`).
        """
        # Square input frames with a single feature map, because grayscale
        # images are used (an RGB image would have 3).
        frame_side = 80
        frame_channels = 1
        input_shape = [None, frame_side, frame_side, frame_channels]

        self.states = tf.placeholder(tf.float32, input_shape, name="states")
        self.a_n = tf.placeholder(tf.float32, name="a_n")
        self.N = tf.placeholder(tf.int32, name="N")
        # Advantage
        self.adv_n = tf.placeholder(tf.float32, name="adv_n")

        # Convolution layers, named "l1" .. "l4"
        net = self.states
        for layer_nr in range(1, 5):
            layer_name = "l{}".format(layer_nr)
            net = tf.nn.elu(conv2d(net, 32, layer_name, [3, 3], [2, 2]))

        # Flatten everything but the first dimension; -1 stands for the
        # (unknown) batch size.
        dims = net.get_shape().as_list()
        n_features = dims[1] * dims[2] * dims[3]
        flat = tf.reshape(net, [-1, n_features])

        # Fully connected layer 1: ReLU hidden layer
        n_hidden_units = self.config["n_hidden_units"]
        hidden_weights_init = tf.random_normal_initializer(stddev=0.01)
        self.L3 = tf.contrib.layers.fully_connected(
            inputs=flat,
            num_outputs=n_hidden_units,
            activation_fn=tf.nn.relu,
            weights_initializer=hidden_weights_init,
            biases_initializer=tf.zeros_initializer())

        # Fully connected layer 2: softmax over the actions
        n_outputs = self.env_runner.nA
        output_weights_init = tf.truncated_normal_initializer(mean=0.0,
                                                              stddev=0.02)
        self.probs = tf.contrib.layers.fully_connected(
            inputs=self.L3,
            num_outputs=n_outputs,
            activation_fn=tf.nn.softmax,
            weights_initializer=output_weights_init,
            biases_initializer=tf.zeros_initializer())

        # Sample one action from the log of the probabilities.
        log_probs = tf.log(self.probs)
        sampled = tf.multinomial(log_probs, 1)
        self.action = tf.squeeze(sampled, name="action")
예제 #6
0
    def __init__(self, state_shape, n_actions, n_hidden, summary=True):
        """Build the graph of a CNN + LSTM actor-critic network for discrete actions.

        The graph consists of 4 convolution layers with ELU activations, an
        LSTM with 256 units over the flattened convolution output, and two
        linear heads on the LSTM output (action logits and state value),
        followed by the actor, critic and entropy terms that are combined
        into `self.loss`.

        Args:
            state_shape: Shape of a single state, without the batch dimension.
                Any sequence is accepted (list, tuple, ...).
            n_actions: Number of discrete actions; size of the logits output.
            n_hidden: Only stored as an attribute by this constructor.
            summary: Only stored as an attribute by this constructor.
        """
        super(ActorCriticNetworkDiscreteCNNRNN, self).__init__()
        self.state_shape = state_shape
        self.n_actions = n_actions
        self.n_hidden = n_hidden
        self.summary = summary

        # `list(...)` so that tuple shapes can be concatenated with `[None]`;
        # `[None] + (84, 84, 1)` would raise a TypeError.
        self.states = tf.placeholder(tf.float32, [None] + list(state_shape),
                                     name="states")
        self.adv = tf.placeholder(tf.float32, name="advantage")
        # No static shape is declared for this placeholder.
        self.actions_taken = tf.placeholder(tf.float32, name="actions_taken")
        # Target that the value output is regressed towards (see critic_loss).
        self.r = tf.placeholder(tf.float32, [None], name="r")

        x = self.states
        # Convolution layers, named "l1" .. "l4"
        # (presumably 32 filters of 3x3 with stride 2 -- confirm against `conv2d`)
        for i in range(4):
            x = tf.nn.elu(conv2d(x, 32, "l{}".format(i + 1), [3, 3], [2, 2]))

        # Flatten, then add a leading dimension of size 1. The LSTM state below
        # is created for batch size 1, so presumably all fed states are treated
        # as the time steps of one sequence -- TODO confirm with the caller.
        reshape = tf.expand_dims(flatten(x), [0])

        lstm_size = 256
        self.enc_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        lstm_state_size = self.enc_cell.state_size
        # Zero-valued numpy arrays with the sizes of the cell (c) and hidden (h)
        # state for batch size 1.
        c_init = np.zeros((1, lstm_state_size.c), np.float32)
        h_init = np.zeros((1, lstm_state_size.h), np.float32)
        self.state_init = [c_init, h_init]
        self.rnn_state_in = self.enc_cell.zero_state(1, tf.float32)
        # Register the input/output LSTM state tensors in graph collections
        # under fixed keys (presumably to look them up again after restoring
        # the graph -- verify against the code that loads the model).
        tf.add_to_collection("rnn_state_in_c", self.rnn_state_in.c)
        tf.add_to_collection("rnn_state_in_h", self.rnn_state_in.h)
        L3, self.rnn_state_out = tf.nn.dynamic_rnn(
            cell=self.enc_cell,
            inputs=reshape,
            initial_state=self.rnn_state_in,
            dtype=tf.float32)
        tf.add_to_collection("rnn_state_out_c", self.rnn_state_out.c)
        tf.add_to_collection("rnn_state_out_h", self.rnn_state_out.h)
        # Drop the leading dimension again: one row of `lstm_size` values per step.
        L3 = tf.reshape(L3, [-1, lstm_size])

        # Fully connected for actor and critic
        self.logits = linear(L3, n_actions, "actionlogits",
                             normalized_columns_initializer(0.01))
        # The value head has a single output per row; flatten it to a vector.
        self.value = tf.reshape(
            linear(L3, 1, "value", normalized_columns_initializer(1.0)), [-1])

        self.probs = tf.nn.softmax(self.logits)

        # Sample one action per row. Subtracting the row-wise maximum keeps the
        # logits <= 0 without changing the distribution that is sampled from.
        # NOTE(review): `keep_dims` looks like the older spelling of `keepdims`;
        # confirm the targeted TensorFlow version before renaming it.
        self.action = tf.squeeze(tf.multinomial(
            self.logits - tf.reduce_max(self.logits, [1], keep_dims=True), 1),
                                 [1],
                                 name="action")
        # One-hot encode the sampled actions and keep only the first row.
        self.action = tf.one_hot(self.action, n_actions)[0, :]

        log_probs = tf.nn.log_softmax(self.logits)
        # Actor loss: negated sum over rows of (log-prob selected through
        # `actions_taken`) * advantage.
        self.actor_loss = -tf.reduce_sum(
            tf.reduce_sum(log_probs * self.actions_taken, [1]) * self.adv)

        # Critic loss: half the summed squared difference between value and r.
        self.critic_loss = 0.5 * tf.reduce_sum(tf.square(self.value - self.r))

        # Entropy summed over all rows and actions.
        self.entropy = -tf.reduce_sum(self.probs * log_probs)

        # Total loss: the critic term is weighted by 0.5 and the entropy is
        # subtracted with weight 0.01.
        self.loss = self.actor_loss + 0.5 * self.critic_loss - self.entropy * 0.01

        # Trainable variables that live in the current variable scope.
        self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)