def getLoss(self, states, actions, rewards, entropy_scale=0.01, policy_scale=1.0, **kwargs):
  n = self.rlConfig.tdN

  state_shape = tf.shape(states)
  state_rank = tf.shape(state_shape)[0]
  experience_length = tf.gather(state_shape, state_rank-2)
  train_length = experience_length - n

  values, actor_probs = self.getOutput(states)

  trainVs = tf.slice(values, [0, 0], [-1, train_length])
  #trainVs = values[:,:train_length]

  # n-step TD targets: r_t + d*r_{t+1} + ... + d^(n-1)*r_{t+n-1} + d^n * V(s_{t+n})
  # smooth between TD(m) for m<=n?
  targets = tf.slice(values, [0, n], [-1, train_length])
  #targets = values[:,n:]
  for i in reversed(range(n)):
    targets *= self.rlConfig.discount
    targets += tf.slice(rewards, [0, i], [-1, train_length])
  targets = tf.stop_gradient(targets)

  advantages = targets - trainVs
  vLoss = tf.reduce_mean(tf.square(advantages))

  # policy gradient term: log-prob of the taken action weighted by the (stopped) advantage
  log_actor_probs = tf.log(actor_probs)
  actor_entropy = -tf.reduce_mean(tfl.batch_dot(actor_probs, log_actor_probs))
  real_log_actor_probs = tfl.batch_dot(actions, log_actor_probs)
  train_log_actor_probs = tf.slice(real_log_actor_probs, [0, 0], [-1, train_length])
  actor_gain = tf.reduce_mean(tf.mul(train_log_actor_probs, tf.stop_gradient(advantages)))

  # critic MSE minus the scaled policy gain and entropy bonus
  acLoss = vLoss - policy_scale * (actor_gain + entropy_scale * actor_entropy)

  return acLoss, [('vLoss', vLoss), ('actor_gain', actor_gain), ('actor_entropy', actor_entropy)]
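# Illustrative sketch (not part of the original module): a NumPy analogue of the
# n-step target loop above for a single trajectory, showing that it computes
# G_t = r_t + d*r_{t+1} + ... + d^(n-1)*r_{t+n-1} + d^n * V(s_{t+n}).
import numpy as np

def n_step_targets_sketch(values, rewards, discount, n):
  """values: [T] per-step value estimates, rewards: [T] rewards -> [T-n] targets."""
  train_length = len(values) - n
  targets = values[n:n + train_length]
  for i in reversed(range(n)):
    targets = rewards[i:i + train_length] + discount * targets
  return targets

# Example: with n=2, targets[0] == rewards[0] + d*rewards[1] + d**2 * values[2]
print(n_step_targets_sketch(np.array([1., 2., 3., 4.]), np.array([.1, .2, .3, .4]), 0.99, 2))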
def getLoss(self, states, actions, rewards, sarsa=False, **kwargs):
  "Negative Log-Likelihood"
  n = self.rlConfig.tdN
  train_length = [config.experience_length - n]

  qMeans, qLogVariances = self.getQDists(states)

  realQs = tfl.batch_dot(actions, qMeans)
  maxQs = tf.reduce_max(qMeans, 1)

  # n-step TD targets, bootstrapped from the taken action (SARSA) or the max action (Q-learning)
  # smooth between TD(m) for m<=n?
  targets = tf.slice(realQs if sarsa else maxQs, [n], train_length)
  for i in reversed(range(n)):
    targets = tf.slice(rewards, [i], train_length) + self.rlConfig.discount * targets
  targets = tf.stop_gradient(targets)

  trainQs = tf.slice(realQs, [0], train_length)

  realLogVariances = tfl.batch_dot(actions, qLogVariances)
  trainLogVariances = tf.slice(realLogVariances, [0], train_length)

  # Gaussian NLL (up to constants): squared error scaled by the predicted precision, plus the log-variance
  nlls = tf.squared_difference(trainQs, targets) * tf.exp(-trainLogVariances) + trainLogVariances
  nll = tf.reduce_mean(nlls)

  return nll, [("nll", nll)]
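# Sketch (illustrative, assuming the heteroscedastic-Gaussian reading of the loss above):
# (q - target)^2 * exp(-log_var) + log_var equals twice the negative log-likelihood of
# `target` under N(q, exp(log_var)), minus the constant log(2*pi).
import numpy as np

def gaussian_nll_term(q, target, log_var):
  return (q - target) ** 2 * np.exp(-log_var) + log_var

print(gaussian_nll_term(q=1.0, target=1.5, log_var=np.log(0.25)))  # 0.25/0.25 + log(0.25)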
def getLoss(self, states, actions, rewards, entropy_scale=0.01, policy_scale=1.0, **kwargs):
  n = self.rlConfig.tdN

  state_shape = tf.shape(states)
  state_rank = tf.shape(state_shape)[0]
  experience_length = tf.gather(state_shape, state_rank-2)
  train_length = experience_length - n

  values = self.critic(states)
  log_actor_probs = self.actor(states)
  actor_probs = tf.exp(log_actor_probs)

  trainVs = tf.slice(values, [0, 0], [-1, train_length])
  #trainVs = values[:,:train_length]

  # n-step TD targets
  # smooth between TD(m) for m<=n?
  targets = tf.slice(values, [0, n], [-1, train_length])
  #targets = values[:,n:]
  for i in reversed(range(n)):
    targets *= self.rlConfig.discount
    targets += tf.slice(rewards, [0, i], [-1, train_length])
  targets = tf.stop_gradient(targets)

  advantages = targets - trainVs
  vLoss = tf.reduce_mean(tf.square(advantages))

  actor_entropy = -tf.reduce_mean(tfl.batch_dot(actor_probs, log_actor_probs))
  real_log_actor_probs = tfl.batch_dot(actions, log_actor_probs)
  train_log_actor_probs = tf.slice(real_log_actor_probs, [0, 0], [-1, train_length])
  actor_gain = tf.reduce_mean(tf.mul(train_log_actor_probs, tf.stop_gradient(advantages)))

  # adjustable policy_scale with multiplicative, clamped increase/decrease ops
  self.policy_scale = tf.Variable(policy_scale)
  min_rate = 1e-8
  max_rate = 1e2
  self.decrease_policy_scale = tf.assign(self.policy_scale, tf.maximum(min_rate, self.policy_scale / 1.5))
  self.increase_policy_scale = tf.assign(self.policy_scale, tf.minimum(max_rate, self.policy_scale * 1.5))

  acLoss = vLoss - self.policy_scale * (actor_gain + entropy_scale * actor_entropy)

  stats = [
      ('vLoss', vLoss),
      ('actor_gain', actor_gain),
      ('actor_entropy', actor_entropy),
      ('policy_scale', tf.log(self.policy_scale)),
  ]

  return acLoss, stats, log_actor_probs
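# Hypothetical usage sketch (framework-free; the constants mirror the ops above):
# policy_scale moves up or down by a factor of 1.5, clamped to [1e-8, 1e2]. A training
# loop could run increase_policy_scale / decrease_policy_scale based on some criterion.
def adjust_policy_scale(scale, increase, min_rate=1e-8, max_rate=1e2):
  return min(max_rate, scale * 1.5) if increase else max(min_rate, scale / 1.5)

scale = 1.0
scale = adjust_policy_scale(scale, increase=False)  # ~0.667
scale = adjust_policy_scale(scale, increase=True)   # back to ~1.0
print(scale)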
def train(self, states, actions, rewards):
  n = self.rlConfig.tdN

  state_shape = tf.shape(states)
  state_rank = tf.shape(state_shape)[0]
  experience_length = tf.gather(state_shape, state_rank-2)
  train_length = experience_length - n

  values = self.critic(states)
  actor_probs = self.actor(states)
  log_actor_probs = tf.log(actor_probs)

  trainVs = tf.slice(values, [0, 0], [-1, train_length])
  #trainVs = values[:,:train_length]

  # n-step TD targets
  # smooth between TD(m) for m<=n?
  targets = tf.slice(values, [0, n], [-1, train_length])
  #targets = values[:,n:]
  for i in reversed(range(n)):
    targets *= self.rlConfig.discount
    targets += tf.slice(rewards, [0, i], [-1, train_length])
  targets = tf.stop_gradient(targets)

  advantages = targets - trainVs
  vLoss = tf.reduce_mean(tf.square(advantages))
  tf.scalar_summary('v_loss', vLoss)

  variance = tf.reduce_mean(tf.squared_difference(targets, tf.reduce_mean(targets)))
  explained_variance = 1. - vLoss / variance
  tf.scalar_summary("v_ev", explained_variance)

  actor_entropy = -tf.reduce_mean(tfl.batch_dot(actor_probs, log_actor_probs))
  tf.scalar_summary('actor_entropy', actor_entropy)

  real_log_actor_probs = tfl.batch_dot(actions, log_actor_probs)
  train_log_actor_probs = tf.slice(real_log_actor_probs, [0, 0], [-1, train_length])
  actor_gain = tf.reduce_mean(tf.mul(train_log_actor_probs, tf.stop_gradient(advantages)))
  tf.scalar_summary('actor_gain', actor_gain)

  acLoss = vLoss - self.policy_scale * (actor_gain + self.entropy_scale * actor_entropy)

  params = tf.trainable_variables()
  pg = tf.gradients(acLoss, params, -self.learning_rate)

  if self.natural:
    predictions = [values, log_actor_probs]

    # distance in prediction space: MSE on values plus scaled KL on the policy
    def metric(vp1, vp2):
      v1, p1 = vp1
      v2, p2 = vp2
      vDist = tf.reduce_mean(tf.squared_difference(v1, v2))
      pDist = tf.reduce_mean(tfl.kl(p1, p2))
      return vDist + self.kl_scale * pDist

    pg = self.natgrad(params, pg, predictions, metric)

  return tfl.apply_grads(params, pg)
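# Illustrative sketch (assumption: tfl.kl computes a categorical KL from log-probabilities):
# the policy part of `metric` above measures how far the policy distribution moves, i.e.
# KL(p || q) = sum_a p(a) * (log p(a) - log q(a)), written here in NumPy for one distribution.
import numpy as np

def categorical_kl(log_p, log_q):
  p = np.exp(log_p)
  return np.sum(p * (log_p - log_q))

log_p = np.log(np.array([0.7, 0.2, 0.1]))
log_q = np.log(np.array([0.6, 0.3, 0.1]))
print(categorical_kl(log_p, log_q))  # small positive number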
def train(self, states, actions, rewards):
  n = self.rlConfig.tdN

  state_shape = tf.shape(states)
  state_rank = tf.shape(state_shape)[0]
  experience_length = tf.gather(state_shape, state_rank-2)
  train_length = experience_length - n

  self.predictedQs = self.q_net(states)
  trainQs = tfl.batch_dot(actions, self.predictedQs)
  trainQs = tf.slice(trainQs, [0, 0], [-1, train_length])

  self.q_target = self.q_net  #.clone()
  targetQs = self.q_target(states)
  realQs = tfl.batch_dot(actions, targetQs)
  maxQs = tf.reduce_max(targetQs, -1)
  targetQs = realQs if self.sarsa else maxQs

  tf.scalar_summary("q_mean", tf.reduce_mean(self.predictedQs))
  tf.scalar_summary("q_max", tf.reduce_mean(maxQs))

  # n-step TD targets
  # smooth between TD(m) for m<=n?
  targets = tf.slice(targetQs, [0, n], [-1, train_length])
  for i in reversed(range(n)):
    targets = tf.slice(rewards, [0, i], [-1, train_length]) + self.rlConfig.discount * targets
  targets = tf.stop_gradient(targets)

  qLosses = tf.squared_difference(trainQs, targets)
  qLoss = tf.reduce_mean(qLosses)
  tf.scalar_summary("q_loss", qLoss)

  variance = tf.reduce_mean(tf.squared_difference(targets, tf.reduce_mean(targets)))
  explained_variance = 1 - qLoss / variance
  tf.scalar_summary("explained_variance", explained_variance)

  # entropy of the exploration policy: temperature-scaled softmax mixed with epsilon-uniform
  flatQs = tf.reshape(self.predictedQs, [-1, self.action_size])
  action_probs = tf.nn.softmax(flatQs / self.temperature)
  action_probs = (1.0 - self.epsilon) * action_probs + self.epsilon / self.action_size
  entropy = -tf.reduce_sum(tf.log(action_probs) * action_probs, -1)
  entropy = tf.reduce_mean(entropy)
  tf.scalar_summary("entropy", entropy)

  self.params = tf.trainable_variables()
  self.gradients = tf.gradients(qLoss, self.params, -self.learning_rate)

  gradients = self.gradients

  if self.natural:
    def q_metric(q1, q2):
      return self.action_size * tf.reduce_mean(tf.squared_difference(q1, q2))

    gradients = self.natgrad(self.params, gradients, self.predictedQs, q_metric)

  return tfl.apply_grads(self.params, gradients)
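# Sketch (illustrative, not part of the training graph above) of the exploration
# distribution whose entropy is summarized: a temperature-scaled softmax over Q-values
# mixed with an epsilon-uniform floor.
import numpy as np

def exploration_probs(q_values, temperature=1.0, epsilon=0.1):
  z = q_values / temperature
  z = z - z.max()                       # numerical stability
  probs = np.exp(z) / np.exp(z).sum()
  return (1.0 - epsilon) * probs + epsilon / len(q_values)

p = exploration_probs(np.array([1.0, 2.0, 0.5]), temperature=0.5, epsilon=0.05)
print(p, -np.sum(p * np.log(p)))        # distribution and its entropy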
def getLoss(self, states, actions, rewards, sarsa=False, target_delay=1000, **kwargs):
  n = self.rlConfig.tdN

  state_shape = tf.shape(states)
  state_rank = tf.shape(state_shape)[0]
  experience_length = tf.gather(state_shape, state_rank-2)
  train_length = experience_length - n

  trainQs = self.q_net(states)
  trainQs = tfl.batch_dot(actions, trainQs)
  trainQs = tf.slice(trainQs, [0, 0], [-1, train_length])

  self.q_target = self.q_net.clone()
  targetQs = self.q_target(states)
  realQs = tfl.batch_dot(actions, targetQs)
  maxQs = tf.reduce_max(targetQs, -1)
  targetQs = realQs if sarsa else maxQs

  # n-step TD targets
  # smooth between TD(m) for m<=n?
  targets = tf.slice(targetQs, [0, n], [-1, train_length])
  for i in reversed(range(n)):
    targets = tf.slice(rewards, [0, i], [-1, train_length]) + self.rlConfig.discount * targets

  # not necessary if we optimize only on q_net variables
  # but this is easier :)
  targets = tf.stop_gradient(targets)

  qLosses = tf.squared_difference(trainQs, targets)
  qLoss = tf.reduce_mean(qLosses)

  # copy the online network into the target network every target_delay steps
  update_target = lambda: tf.group(*self.q_target.assign(self.q_net), name="update_target")
  should_update = tf.equal(tf.mod(self.global_step, target_delay), 0)
  periodic_update = tf.case([(should_update, update_target)], default=lambda: tf.no_op())

  #return qLoss, [("qLoss", qLoss)], (1000, update_target)
  return (
      qLoss,
      [("qLoss", qLoss), ("periodic_update", periodic_update)],
      #tf.initialize_variables(self.q_target.getVariables())
  )
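# Minimal framework-free sketch (names are illustrative) of the periodic hard update
# the tf.case above performs: copy the online parameters into the target copy whenever
# the global step is a multiple of target_delay.
def maybe_update_target(step, online_params, target_params, target_delay=1000):
  if step % target_delay == 0:
    target_params = list(online_params)  # hard copy of the weights
  return target_params

target = [0.0, 0.0]
for step in range(3001):
  online = [step * 0.1, step * 0.2]
  target = maybe_update_target(step, online, target)
print(target)  # parameters from step 3000, the last multiple of 1000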
def getLoss(self, states, actions, rewards, entropy_scale=0.001, policy_scale=0.01, **kwargs):
  n = self.rlConfig.tdN

  state_shape = tf.shape(states)
  batch_size = state_shape[0]
  experience_length = state_shape[1]
  train_length = experience_length - n

  # tile the learned initial state across the batch and run the recurrent core
  initial_state = tf.expand_dims(self.initial_state, 0)
  initial_state = tf.tile(initial_state, tf.pack([batch_size, 1]))

  outputs, hidden = tf.nn.dynamic_rnn(self.rnn, states, initial_state=initial_state)

  values = self.critic(outputs)
  actor_probs = self.actor(outputs)

  trainVs = tf.slice(values, [0, 0], [-1, train_length])
  #trainVs = values[:,:train_length]

  # n-step TD targets
  # smooth between TD(m) for m<=n?
  targets = tf.slice(values, [0, n], [-1, train_length])
  #targets = values[:,n:]
  for i in reversed(range(n)):
    targets *= self.rlConfig.discount
    targets += tf.slice(rewards, [0, i], [-1, train_length])
  targets = tf.stop_gradient(targets)

  advantages = targets - trainVs
  vLoss = tf.reduce_mean(tf.square(advantages))

  log_actor_probs = tf.log(actor_probs)
  actor_entropy = -tf.reduce_mean(tfl.batch_dot(actor_probs, log_actor_probs))
  real_log_actor_probs = tfl.batch_dot(actions, log_actor_probs)
  train_log_actor_probs = tf.slice(real_log_actor_probs, [0, 0], [-1, train_length])
  actor_gain = tf.reduce_mean(tf.mul(train_log_actor_probs, tf.stop_gradient(advantages)))

  acLoss = vLoss - policy_scale * (actor_gain + entropy_scale * actor_entropy)

  return acLoss, [('vLoss', vLoss), ('actor_gain', actor_gain), ('actor_entropy', actor_entropy)]
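# Shape sketch (illustrative): the learned initial RNN state is a single vector that is
# tiled across the batch, so states of shape [batch, time, features] pair with an
# initial state of shape [batch, hidden_size] before dynamic_rnn.
import numpy as np

hidden_size, batch_size = 8, 4
initial_state = np.zeros(hidden_size)                       # [hidden_size]
tiled = np.tile(initial_state[None, :], (batch_size, 1))    # [batch_size, hidden_size]
print(tiled.shape)                                          # (4, 8)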