def __init__(self, state_shape: Sequence[int], n_actions: int, n_hidden_units: int, n_hidden_layers: int) -> None:
    super(ActorCriticNetworkDiscrete, self).__init__()
    self.state_shape = state_shape
    self.n_actions = n_actions

    self.states = tf.placeholder(tf.float32, [None] + list(state_shape), name="states")
    self.actions_taken = tf.placeholder(tf.float32, [None, n_actions], name="actions_taken")

    # Actor network
    x = self.states
    for i in range(n_hidden_layers):
        x = tf.tanh(linear(x, n_hidden_units, "L{}_action".format(i + 1),
                           initializer=normalized_columns_initializer(1.0)))
    self.logits = linear(x, n_actions, "actionlogits", normalized_columns_initializer(0.01))

    # Critic network
    x = self.states
    for i in range(n_hidden_layers):
        x = tf.tanh(linear(x, n_hidden_units, "L{}_value".format(i + 1),
                           initializer=normalized_columns_initializer(1.0)))
    self.value = tf.reshape(linear(x, 1, "value", normalized_columns_initializer(1.0)), [-1])

    self.probs = tf.nn.softmax(self.logits)
    # Subtract the per-row maximum logit for numerical stability before sampling
    self.action = tf.squeeze(tf.multinomial(
        self.logits - tf.reduce_max(self.logits, [1], keepdims=True), 1), [1], name="action")
    self.action = tf.one_hot(self.action, n_actions)[0, :]
    # Log probabilities of all actions
    self.log_probs = tf.nn.log_softmax(self.logits)
    # Log probability of the action that was actually taken
    self.action_log_prob = tf.reduce_sum(self.log_probs * self.actions_taken, [1])
    # Elementwise p * log(p); the policy entropy is minus the sum of this
    self.entropy = self.probs * self.log_probs
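# Usage sketch (illustrative, not part of the original module): instantiating the
# network and sampling one action for a CartPole-like observation. The shapes,
# hyperparameters, and the `observation` array are hypothetical.
#
#   net = ActorCriticNetworkDiscrete(state_shape=[4], n_actions=2,
#                                    n_hidden_units=64, n_hidden_layers=1)
#   with tf.Session() as sess:
#       sess.run(tf.global_variables_initializer())
#       # net.action is a one-hot vector for a single sampled action
#       one_hot_action = sess.run(net.action, feed_dict={net.states: [observation]})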
def __init__(self, state_shape: Sequence[int], n_actions: int, n_hidden: int, lstm_size: int = 256, summary: bool = True) -> None:
    super(ActorCriticNetworkDiscreteCNNRNN, self).__init__()
    self.state_shape: Sequence[int] = state_shape
    self.n_actions: int = n_actions
    self.n_hidden: int = n_hidden
    self.summary: bool = summary

    self.states = tf.placeholder(tf.float32, [None] + list(state_shape), name="states")
    self.actions_taken = tf.placeholder(tf.float32, name="actions_taken")

    x = self.states
    # Convolution layers
    for i in range(4):
        x = tf.nn.elu(conv2d(x, 32, "l{}".format(i + 1), [3, 3], [2, 2]))

    # Flatten, then add a leading dimension of 1 so the rollout is one sequence
    reshape = tf.expand_dims(flatten(x), [0])

    self.enc_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    lstm_state_size = self.enc_cell.state_size
    c_init = np.zeros((1, lstm_state_size.c), np.float32)
    h_init = np.zeros((1, lstm_state_size.h), np.float32)
    self.state_init = [c_init, h_init]
    self.rnn_state_in = self.enc_cell.zero_state(1, tf.float32)
    tf.add_to_collection("rnn_state_in_c", self.rnn_state_in.c)
    tf.add_to_collection("rnn_state_in_h", self.rnn_state_in.h)
    L3, self.rnn_state_out = tf.nn.dynamic_rnn(cell=self.enc_cell,
                                               inputs=reshape,
                                               initial_state=self.rnn_state_in,
                                               dtype=tf.float32)
    tf.add_to_collection("rnn_state_out_c", self.rnn_state_out.c)
    tf.add_to_collection("rnn_state_out_h", self.rnn_state_out.h)
    L3 = tf.reshape(L3, [-1, lstm_size])

    # Fully connected for actor and critic
    self.logits = linear(L3, n_actions, "actionlogits", normalized_columns_initializer(0.01))
    self.value = tf.reshape(linear(L3, 1, "value", normalized_columns_initializer(1.0)), [-1])

    self.probs = tf.nn.softmax(self.logits)
    self.action = tf.squeeze(tf.multinomial(
        self.logits - tf.reduce_max(self.logits, [1], keepdims=True), 1), [1], name="action")
    self.action = tf.one_hot(self.action, n_actions)[0, :]
    # Log probabilities of all actions
    self.log_probs = tf.nn.log_softmax(self.logits)
    # Log probability of the action that was actually taken
    self.action_log_prob = tf.reduce_sum(self.log_probs * self.actions_taken, [1])
    self.entropy = self.probs * self.log_probs
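# Sketch of carrying the LSTM state across timesteps (illustrative; `net`, `obs`,
# and `sess` are assumed to exist). The state is reset from net.state_init at the
# start of each episode and threaded back in at every step:
#
#   rnn_c, rnn_h = net.state_init
#   feeds = {net.states: [obs],
#            net.rnn_state_in.c: rnn_c,
#            net.rnn_state_in.h: rnn_h}
#   action, (rnn_c, rnn_h) = sess.run([net.action, net.rnn_state_out], feeds)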
def __init__(self, state_shape: Sequence[int], action_space, n_hidden_units: int, n_hidden_layers: int = 1) -> None:
    super(ActorCriticNetworkContinuous, self).__init__()
    self.state_shape = state_shape

    self.states = tf.placeholder(tf.float32, [None] + list(state_shape), name="states")
    self.actions_taken = tf.placeholder(tf.float32, [None] + list(action_space.shape), name="actions_taken")

    # Actor network: state-dependent mean, state-independent log standard deviation
    x = self.states
    for i in range(n_hidden_layers):
        x = tf.tanh(linear(x, n_hidden_units, "L{}_mean".format(i + 1),
                           initializer=normalized_columns_initializer(1.0)))
    self.mean = linear(x, action_space.shape[0], "mean", initializer=normalized_columns_initializer(0.01))
    self.mean = tf.check_numerics(self.mean, "mean")
    self.log_std = tf.get_variable(name="logstd", shape=list(action_space.shape),
                                   initializer=tf.zeros_initializer())
    std = tf.exp(self.log_std, name="std")
    std = tf.check_numerics(std, "std")
    # Sample an action as mean + std * epsilon with epsilon ~ N(0, 1)
    self.action = self.mean + std * tf.random_normal(tf.shape(self.mean))
    self.action = tf.reshape(self.action, list(action_space.shape))

    # Critic network
    x = self.states
    for i in range(n_hidden_layers):
        x = tf.tanh(linear(x, n_hidden_units, "L{}_value".format(i + 1),
                           initializer=normalized_columns_initializer(1.0)))
    self.value = tf.reshape(linear(x, 1, "value", normalized_columns_initializer(1.0)), [-1])

    # Negative log-density of the taken actions under the diagonal Gaussian policy
    neglogprob = (0.5 * tf.reduce_sum(tf.square((self.actions_taken - self.mean) / std), axis=-1)
                  + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(self.actions_taken)[-1])
                  + tf.reduce_sum(self.log_std, axis=-1))
    self.action_log_prob = -neglogprob
    # Note: this expression is the negative of the Gaussian differential entropy
    # sum(log_std + 0.5 * log(2 * pi * e))
    self.entropy = -tf.reduce_sum(self.log_std + .5 * np.log(2.0 * np.pi * np.e), axis=-1)
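# The `neglogprob` expression above is the standard negative log-density of a
# diagonal Gaussian. A quick one-dimensional NumPy check against scipy (not part
# of the source; values are arbitrary):
#
#   import numpy as np
#   from scipy.stats import norm
#   a, mean, std = 0.3, 0.1, 0.5
#   nlp = 0.5 * ((a - mean) / std) ** 2 + 0.5 * np.log(2.0 * np.pi) + np.log(std)
#   assert np.isclose(nlp, -norm.logpdf(a, loc=mean, scale=std))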
def __init__(self, state_shape, n_actions, n_hidden, summary=True):
    super(ActorCriticNetworkDiscreteCNN, self).__init__()
    self.state_shape = state_shape
    self.n_actions = n_actions
    self.n_hidden = n_hidden
    self.summary = summary

    self.states = tf.placeholder(tf.float32, [None] + state_shape, name="states")
    self.adv = tf.placeholder(tf.float32, name="advantage")
    self.actions_taken = tf.placeholder(tf.float32, [None, n_actions], name="actions_taken")
    self.r = tf.placeholder(tf.float32, [None], name="r")

    x = self.states
    # Convolution layers
    for i in range(4):
        x = tf.nn.elu(conv2d(x, 32, "l{}".format(i + 1), [3, 3], [2, 2]))

    # Flatten; -1 for the (unknown) batch size
    shape = x.get_shape().as_list()
    reshape = tf.reshape(x, [-1, shape[1] * shape[2] * shape[3]])

    # Fully connected for Actor & Critic
    self.logits = linear(reshape, n_actions, "actionlogits", normalized_columns_initializer(0.01))
    self.value = tf.reshape(linear(reshape, 1, "value", normalized_columns_initializer(1.0)), [-1])

    self.probs = tf.nn.softmax(self.logits)
    self.action = tf.squeeze(tf.multinomial(
        self.logits - tf.reduce_max(self.logits, [1], keepdims=True), 1), [1], name="action")
    self.action = tf.one_hot(self.action, n_actions)[0, :]

    log_probs = tf.nn.log_softmax(self.logits)
    # Policy-gradient loss, weighted by the advantage
    self.actor_loss = -tf.reduce_sum(tf.reduce_sum(log_probs * self.actions_taken, [1]) * self.adv)
    # Squared error between predicted values and observed returns
    self.critic_loss = 0.5 * tf.reduce_sum(tf.square(self.value - self.r))
    entropy = -tf.reduce_sum(self.probs * log_probs)
    self.loss = self.actor_loss + 0.5 * self.critic_loss - entropy * 0.01
    self.summary_loss = self.loss
    self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
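# A minimal sketch of turning `self.loss` into a training op, assuming an
# instance `net`; the optimizer choice, learning rate, and clip norm are
# hypothetical, not taken from the source:
#
#   optimizer = tf.train.RMSPropOptimizer(learning_rate=1e-4, decay=0.99)
#   grads = tf.gradients(net.loss, net.vars)
#   grads, _ = tf.clip_by_global_norm(grads, 40.0)
#   train_op = optimizer.apply_gradients(zip(grads, net.vars))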
def __init__(self, state_shape: Sequence[int], n_actions: int, n_hidden: int, summary: bool = True) -> None:
    super(ActorCriticNetworkDiscreteCNN, self).__init__()
    self.state_shape = state_shape
    self.n_actions = n_actions
    self.n_hidden = n_hidden
    self.summary = summary

    self.states = tf.placeholder(tf.float32, [None] + list(state_shape), name="states")
    self.actions_taken = tf.placeholder(tf.float32, [None, n_actions], name="actions_taken")

    x = self.states
    # Convolution layers
    for i in range(4):
        x = tf.nn.elu(conv2d(x, 32, "l{}".format(i + 1), [3, 3], [2, 2]))

    # Flatten; -1 for the (unknown) batch size
    shape = x.get_shape().as_list()
    reshape = tf.reshape(x, [-1, shape[1] * shape[2] * shape[3]])

    # Fully connected for actor & critic
    self.logits = linear(reshape, n_actions, "actionlogits", normalized_columns_initializer(0.01))
    self.value = tf.reshape(linear(reshape, 1, "value", normalized_columns_initializer(1.0)), [-1])

    self.probs = tf.nn.softmax(self.logits)
    self.action = tf.squeeze(tf.multinomial(
        self.logits - tf.reduce_max(self.logits, [1], keepdims=True), 1), [1], name="action")
    self.action = tf.one_hot(self.action, n_actions)[0, :]
    # Log probabilities of all actions
    self.log_probs = tf.nn.log_softmax(self.logits)
    # Log probability of the action that was actually taken
    self.action_log_prob = tf.reduce_sum(self.log_probs * self.actions_taken, [1])
    # Elementwise p * log(p); the policy entropy is minus the sum of this
    self.entropy = self.probs * self.log_probs
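# Unlike the older variant above, this version exposes `action_log_prob` and
# `entropy` instead of building losses internally. A sketch of assembling the
# same A3C-style loss outside the network (the placeholders `adv` and `ret` and
# the coefficients are hypothetical):
#
#   adv = tf.placeholder(tf.float32, [None], name="advantage")
#   ret = tf.placeholder(tf.float32, [None], name="return")
#   actor_loss = -tf.reduce_sum(net.action_log_prob * adv)
#   critic_loss = 0.5 * tf.reduce_sum(tf.square(net.value - ret))
#   entropy = -tf.reduce_sum(net.entropy)  # net.entropy holds p * log(p)
#   loss = actor_loss + 0.5 * critic_loss - 0.01 * entropy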
def __init__(self, state_shape, n_hidden, summary=True):
    super(CriticNetwork, self).__init__()
    self.state_shape = state_shape
    self.n_hidden = n_hidden

    with tf.variable_scope("critic"):
        self.states = tf.placeholder(tf.float32, [None] + self.state_shape, name="states")
        self.r = tf.placeholder(tf.float32, [None], name="r")

        L1 = tf.contrib.layers.fully_connected(
            inputs=self.states,
            num_outputs=self.n_hidden,
            activation_fn=tf.tanh,
            weights_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.02),
            biases_initializer=tf.zeros_initializer(),
            scope="L1")

        self.value = tf.reshape(linear(L1, 1, "value", normalized_columns_initializer(1.0)), [-1])

        self.loss = tf.reduce_sum(tf.square(self.value - self.r))
        self.summary_loss = self.loss
        self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
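# Training-step sketch for the critic (illustrative; the optimizer choice and the
# `batch_states` / `batch_returns` arrays are hypothetical):
#
#   net = CriticNetwork(state_shape=[4], n_hidden=64)
#   train_op = tf.train.AdamOptimizer(1e-3).minimize(net.loss, var_list=net.vars)
#   with tf.Session() as sess:
#       sess.run(tf.global_variables_initializer())
#       sess.run(train_op, {net.states: batch_states, net.r: batch_returns})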
def __init__(self, state_shape, n_actions, n_hidden, summary=True):
    super(ActorNetworkDiscrete, self).__init__()
    self.state_shape = state_shape
    self.n_actions = n_actions
    self.n_hidden = n_hidden

    with tf.variable_scope("actor"):
        self.states = tf.placeholder(tf.float32, [None] + state_shape, name="states")
        self.adv = tf.placeholder(tf.float32, name="advantage")
        self.actions_taken = tf.placeholder(tf.float32, [None, n_actions], name="actions_taken")

        L1 = tf.contrib.layers.fully_connected(
            inputs=self.states,
            num_outputs=self.n_hidden,
            activation_fn=tf.tanh,
            weights_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.02),
            biases_initializer=tf.zeros_initializer(),
            scope="L1")

        # Fully connected for the actor
        self.logits = linear(L1, n_actions, "actionlogits", normalized_columns_initializer(0.01))

        self.probs = tf.nn.softmax(self.logits)
        self.action = tf.squeeze(tf.multinomial(
            self.logits - tf.reduce_max(self.logits, [1], keepdims=True), 1), [1], name="action")
        self.action = tf.one_hot(self.action, n_actions)[0, :]

        log_probs = tf.nn.log_softmax(self.logits)
        # Policy-gradient loss, weighted by the advantage
        self.loss = -tf.reduce_sum(tf.reduce_sum(log_probs * self.actions_taken, [1]) * self.adv)
        self.summary_loss = self.loss  # Loss to show as a summary
        self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
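# Matching policy-gradient update sketch (illustrative; the optimizer and the
# batch arrays are hypothetical, and `sess` is assumed):
#
#   net = ActorNetworkDiscrete(state_shape=[4], n_actions=2, n_hidden=64)
#   train_op = tf.train.AdamOptimizer(1e-3).minimize(net.loss, var_list=net.vars)
#   sess.run(train_op, {net.states: batch_states,
#                       net.actions_taken: batch_onehot_actions,
#                       net.adv: batch_advantages})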
def __init__(self, state_shape, n_actions, n_hidden, summary=True):
    super(ActorCriticNetworkDiscreteCNNRNN, self).__init__()
    self.state_shape = state_shape
    self.n_actions = n_actions
    self.n_hidden = n_hidden
    self.summary = summary

    self.states = tf.placeholder(tf.float32, [None] + state_shape, name="states")
    self.adv = tf.placeholder(tf.float32, name="advantage")
    self.actions_taken = tf.placeholder(tf.float32, name="actions_taken")
    self.r = tf.placeholder(tf.float32, [None], name="r")

    x = self.states
    # Convolution layers
    for i in range(4):
        x = tf.nn.elu(conv2d(x, 32, "l{}".format(i + 1), [3, 3], [2, 2]))

    # Flatten, then add a leading dimension of 1 so the rollout is one sequence
    reshape = tf.expand_dims(flatten(x), [0])

    lstm_size = 256
    self.enc_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    lstm_state_size = self.enc_cell.state_size
    c_init = np.zeros((1, lstm_state_size.c), np.float32)
    h_init = np.zeros((1, lstm_state_size.h), np.float32)
    self.state_init = [c_init, h_init]
    self.rnn_state_in = self.enc_cell.zero_state(1, tf.float32)
    tf.add_to_collection("rnn_state_in_c", self.rnn_state_in.c)
    tf.add_to_collection("rnn_state_in_h", self.rnn_state_in.h)
    L3, self.rnn_state_out = tf.nn.dynamic_rnn(cell=self.enc_cell,
                                               inputs=reshape,
                                               initial_state=self.rnn_state_in,
                                               dtype=tf.float32)
    tf.add_to_collection("rnn_state_out_c", self.rnn_state_out.c)
    tf.add_to_collection("rnn_state_out_h", self.rnn_state_out.h)
    L3 = tf.reshape(L3, [-1, lstm_size])

    # Fully connected for actor and critic
    self.logits = linear(L3, n_actions, "actionlogits", normalized_columns_initializer(0.01))
    self.value = tf.reshape(linear(L3, 1, "value", normalized_columns_initializer(1.0)), [-1])

    self.probs = tf.nn.softmax(self.logits)
    self.action = tf.squeeze(tf.multinomial(
        self.logits - tf.reduce_max(self.logits, [1], keepdims=True), 1), [1], name="action")
    self.action = tf.one_hot(self.action, n_actions)[0, :]

    log_probs = tf.nn.log_softmax(self.logits)
    # Policy-gradient loss, weighted by the advantage
    self.actor_loss = -tf.reduce_sum(tf.reduce_sum(log_probs * self.actions_taken, [1]) * self.adv)
    # Squared error between predicted values and observed returns
    self.critic_loss = 0.5 * tf.reduce_sum(tf.square(self.value - self.r))
    self.entropy = -tf.reduce_sum(self.probs * log_probs)
    self.loss = self.actor_loss + 0.5 * self.critic_loss - self.entropy * 0.01
    self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
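# Because the RNN state tensors are registered in graph collections above, a
# restored graph can recover them without keeping Python references around
# (sketch, assuming a graph restored with tf.train.import_meta_graph):
#
#   rnn_state_in_c = tf.get_collection("rnn_state_in_c")[0]
#   rnn_state_in_h = tf.get_collection("rnn_state_in_h")[0]
#   rnn_state_out_c = tf.get_collection("rnn_state_out_c")[0]
#   rnn_state_out_h = tf.get_collection("rnn_state_out_h")[0]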