def __init__(self, q_values, observations, num_actions, stochastic, eps,
             softmax, softmax_temp):
    """Build the TF graph ops for action selection.

    Sets ``self.action`` and ``self.action_prob`` as graph nodes. Two
    modes: softmax (Boltzmann) sampling from tempered Q-values, or
    epsilon-greedy over the argmax action.
    """
    if softmax:
        # Boltzmann exploration: sample from the temperature-scaled
        # Q-value distribution instead of acting epsilon-greedily.
        dist = Categorical(q_values / softmax_temp)
        self.action = dist.sample()
        self.action_prob = dist.sampled_action_prob()
        return

    greedy_actions = tf.argmax(q_values, axis=1)
    batch_size = tf.shape(observations)[0]

    # Masked-out actions carry q_value ~= float32 min; give them the same
    # minimal logit when sampling so exploration never selects them.
    exploration_logits = tf.where(
        tf.equal(q_values, tf.float32.min),
        tf.ones_like(q_values) * tf.float32.min,
        tf.ones_like(q_values))
    exploratory_actions = tf.squeeze(
        tf.multinomial(exploration_logits, 1), axis=1)

    # Per-sample coin flip: with probability eps, take the random action.
    explore_mask = tf.random_uniform(
        tf.stack([batch_size]), minval=0, maxval=1,
        dtype=tf.float32) < eps
    eps_greedy_actions = tf.where(
        explore_mask, exploratory_actions, greedy_actions)

    # `stochastic` is a graph-level switch between exploring and greedy.
    self.action = tf.cond(
        stochastic,
        lambda: eps_greedy_actions,
        lambda: greedy_actions)
    # No per-action probability is tracked in the epsilon-greedy path.
    self.action_prob = None
def __init__(self, q_values, observations, num_actions, stochastic, eps,
             softmax, softmax_temp):
    """Construct action-selection ops from a batch of Q-values.

    Exposes ``self.action`` and ``self.action_prob`` as TF graph nodes.
    If `softmax` is set, actions are sampled from a temperature-scaled
    categorical over the Q-values; otherwise epsilon-greedy selection is
    built, gated at runtime by the `stochastic` tensor.
    """
    if softmax:
        # Boltzmann exploration: sample from Q-values scaled by the
        # softmax temperature; higher temp => more uniform sampling.
        action_dist = Categorical(q_values / softmax_temp)
        self.action = action_dist.sample()
        self.action_prob = action_dist.sampled_action_prob()
        return

    # Greedy (exploitation) choice per batch element.
    deterministic_actions = tf.argmax(q_values, axis=1)
    batch_size = tf.shape(observations)[0]

    # Special case masked out actions (q_value ~= -inf) so that we don't
    # even consider them for exploration.
    random_valid_action_logits = tf.where(
        tf.equal(q_values, tf.float32.min),
        tf.ones_like(q_values) * tf.float32.min,
        tf.ones_like(q_values))
    # Uniform sample over the still-valid actions.
    random_actions = tf.squeeze(
        tf.multinomial(random_valid_action_logits, 1), axis=1)

    # Independent eps-coin per batch element.
    chose_random = tf.random_uniform(
        tf.stack([batch_size]), minval=0, maxval=1,
        dtype=tf.float32) < eps
    stochastic_actions = tf.where(chose_random, random_actions,
                                  deterministic_actions)

    # `stochastic` toggles exploration on/off at session run time.
    self.action = tf.cond(stochastic, lambda: stochastic_actions,
                          lambda: deterministic_actions)
    # Epsilon-greedy path does not track a sampled-action probability.
    self.action_prob = None
def custom_loss(self, policy_loss, loss_inputs):
    """Augment `policy_loss` with an imitation-learning (IL) loss.

    Reads offline experiences via a JsonReader, runs the model's forward
    pass over them with shared weights, and adds the negative log-prob of
    the logged actions (scaled by 10) to the policy loss.
    """
    # create a new input reader per worker
    reader = JsonReader(self.options["custom_options"]["input_files"])
    input_ops = reader.tf_input_ops()

    # define a secondary loss by building a graph copy with weight sharing
    # AUTO_REUSE makes the forward pass below reuse the already-created
    # variables instead of allocating a second set of weights.
    with tf.variable_scope(
            self.scope, reuse=tf.AUTO_REUSE,
            auxiliary_name_scope=False):
        logits, _ = self._build_layers_v2({
            "obs": restore_original_dimensions(input_ops["obs"],
                                               self.obs_space)
        }, self.num_outputs, self.options)

    # You can also add self-supervised losses easily by referencing tensors
    # created during _build_layers_v2(). For example, an autoencoder-style
    # loss can be added as follows:
    # ae_loss = squared_diff(
    #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
    print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

    # compute the IL loss: mean negative log-likelihood of logged actions
    action_dist = Categorical(logits)
    self.policy_loss = policy_loss
    self.imitation_loss = tf.reduce_mean(
        -action_dist.logp(input_ops["actions"]))
    return policy_loss + 10 * self.imitation_loss
def custom_loss(self, policy_loss, loss_inputs):
    """Augment `policy_loss` with an imitation-learning (IL) loss.

    Reads offline experiences via a JsonReader, runs the model's forward
    pass over them with shared weights, and adds the negative log-prob of
    the logged actions (scaled by 10) to the policy loss.
    """
    # create a new input reader per worker
    reader = JsonReader(self.options["custom_options"]["input_files"])
    input_ops = reader.tf_input_ops()

    # define a secondary loss by building a graph copy with weight sharing.
    # BUGFIX: without reuse=tf.AUTO_REUSE, _build_layers_v2() would create
    # a second, unshared set of model variables (or raise on re-creation),
    # so the IL loss would not actually train the policy's weights.
    with tf.variable_scope(
            self.scope, reuse=tf.AUTO_REUSE,
            auxiliary_name_scope=False):
        logits, _ = self._build_layers_v2({
            "obs": restore_original_dimensions(input_ops["obs"],
                                               self.obs_space)
        }, self.num_outputs, self.options)

    # You can also add self-supervised losses easily by referencing tensors
    # created during _build_layers_v2(). For example, an autoencoder-style
    # loss can be added as follows:
    # ae_loss = squared_diff(
    #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
    print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

    # compute the IL loss: mean negative log-likelihood of logged actions
    action_dist = Categorical(logits)
    self.policy_loss = policy_loss
    self.imitation_loss = tf.reduce_mean(
        -action_dist.logp(input_ops["actions"]))
    return policy_loss + 10 * self.imitation_loss