def _build_graph(self, inputs):
    state, action, futurereward, action_prob = inputs
    logits, value = self._get_NN_prediction(state)
    value = tf.squeeze(value, [1], name='pred_value')  # (B,)
    policy = tf.nn.softmax(logits, name='policy')
    is_training = get_current_tower_context().is_training
    if not is_training:
        return
    log_probs = tf.log(policy + 1e-6)

    log_pi_a_given_s = tf.reduce_sum(
        log_probs * tf.one_hot(action, NUM_ACTIONS), 1)
    advantage = tf.subtract(tf.stop_gradient(value), futurereward, name='advantage')

    pi_a_given_s = tf.reduce_sum(policy * tf.one_hot(action, NUM_ACTIONS), 1)  # (B,)
    importance = tf.stop_gradient(tf.clip_by_value(pi_a_given_s / (action_prob + 1e-8), 0, 10))

    policy_loss = tf.reduce_sum(log_pi_a_given_s * advantage * importance, name='policy_loss')
    xentropy_loss = tf.reduce_sum(policy * log_probs, name='xentropy_loss')
    value_loss = tf.nn.l2_loss(value - futurereward, name='value_loss')

    pred_reward = tf.reduce_mean(value, name='predict_reward')
    advantage = symbf.rms(advantage, name='rms_advantage')
    entropy_beta = tf.get_variable('entropy_beta', shape=[],
                                   initializer=tf.constant_initializer(0.01), trainable=False)
    self.cost = tf.add_n([policy_loss, xentropy_loss * entropy_beta, value_loss])
    self.cost = tf.truediv(self.cost,
                           tf.cast(tf.shape(futurereward)[0], tf.float32),
                           name='cost')
    summary.add_moving_summary(policy_loss, xentropy_loss,
                               value_loss, pred_reward, advantage,
                               self.cost, tf.reduce_mean(importance, name='importance'))
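# A quick illustration of the importance weighting above (not part of the graph):
# the ratio pi(a|s) / mu(a|s) between the current policy and the behaviour policy
# that collected the sample is clipped to [0, 10], exactly as tf.clip_by_value does.
# A minimal NumPy sketch with made-up numbers:
import numpy as np

pi_a_given_s = np.array([0.70, 0.05, 0.40])   # prob. of the taken action under the current policy
action_prob = np.array([0.35, 0.50, 0.001])   # prob. recorded when the sample was collected
importance = np.clip(pi_a_given_s / (action_prob + 1e-8), 0, 10)
print(importance)  # -> roughly [2.0, 0.1, 10.0]; the last ratio (400) is clipped to 10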
def _build_graph(self, inputs):
    state, action, futurereward = inputs
    policy, self.value = self._get_NN_prediction(state)
    self.value = tf.squeeze(self.value, [1], name='pred_value')  # (B,)
    self.logits = tf.nn.softmax(policy, name='logits')

    expf = tf.get_variable('explore_factor', shape=[],
                           initializer=tf.constant_initializer(1), trainable=False)
    logitsT = tf.nn.softmax(policy * expf, name='logitsT')
    is_training = get_current_tower_context().is_training
    if not is_training:
        return
    log_probs = tf.log(self.logits + 1e-6)
    log_pi_a_given_s = tf.reduce_sum(
        log_probs * tf.one_hot(action, NUM_ACTIONS), 1)
    # tf.sub was removed in TF 1.0; tf.subtract is the current API
    advantage = tf.subtract(tf.stop_gradient(self.value), futurereward, name='advantage')
    policy_loss = tf.reduce_sum(log_pi_a_given_s * advantage, name='policy_loss')
    xentropy_loss = tf.reduce_sum(
        self.logits * log_probs, name='xentropy_loss')
    value_loss = tf.nn.l2_loss(self.value - futurereward, name='value_loss')

    pred_reward = tf.reduce_mean(self.value, name='predict_reward')
    advantage = symbf.rms(advantage, name='rms_advantage')
    summary.add_moving_summary(policy_loss, xentropy_loss, value_loss, pred_reward, advantage)

    entropy_beta = tf.get_variable('entropy_beta', shape=[],
                                   initializer=tf.constant_initializer(0.01), trainable=False)
    self.cost = tf.add_n([policy_loss, xentropy_loss * entropy_beta, value_loss])
    self.cost = tf.truediv(self.cost,
                           tf.cast(tf.shape(futurereward)[0], tf.float32),
                           name='cost')
def _mapper(self, grad, var):
    name = var.op.name
    if name not in _summaried_gradient:
        _summaried_gradient.add(name)
        tf.summary.histogram(name + '-grad', grad)
        from tensorpack.tfutils.symbolic_functions import rms
        tf.summary.scalar(name + '/rms', rms(grad))
    return grad
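# Usage sketch (assumed wiring, not from the original): _mapper is meant to run over each
# (gradient, variable) pair before apply_gradients, so every variable's gradient gets a
# histogram and an RMS scalar summary exactly once. `optimizer` and `self.cost` are
# placeholders for whatever the surrounding model defines.
def _build_train_op(self, optimizer):
    grads_and_vars = optimizer.compute_gradients(self.cost)
    # the mapper returns the gradient unchanged, so training is unaffected
    grads_and_vars = [(self._mapper(g, v), v)
                      for g, v in grads_and_vars if g is not None]
    return optimizer.apply_gradients(grads_and_vars)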
def _build_graph(self, inputs):
    state, action, futurereward1, futurereward2, updateweight1, updateweight2, action_prob = inputs
    logits, value1, value2 = self._get_NN_prediction(state)
    value1 = tf.squeeze(value1, [1], name='pred_value_1')  # (B,)
    value2 = tf.squeeze(value2, [1], name='pred_value_2')  # (B,)
    policy = tf.nn.softmax(logits, name='policy')
    is_training = get_current_tower_context().is_training
    if not is_training:
        return
    log_probs = tf.log(policy + 1e-6)

    log_pi_a_given_s = tf.reduce_sum(
        log_probs * tf.one_hot(action, NUM_ACTIONS), 1)
    advantage1 = tf.subtract(tf.stop_gradient(value1), futurereward1, name='advantage_1')
    advantage2 = tf.subtract(tf.stop_gradient(value2), futurereward2, name='advantage_2')

    pi_a_given_s = tf.reduce_sum(policy * tf.one_hot(action, NUM_ACTIONS), 1)  # (B,)
    importance = tf.stop_gradient(tf.clip_by_value(pi_a_given_s / (action_prob + 1e-8), 0, 10))

    policy_loss1 = tf.reduce_sum(log_pi_a_given_s * advantage1 * importance * updateweight1,
                                 name='policy_loss_1')
    policy_loss2 = tf.reduce_sum(log_pi_a_given_s * advantage2 * importance * updateweight2,
                                 name='policy_loss_2')
    policy_loss = tf.add(policy_loss1, policy_loss2, name='policy_loss')
    xentropy_loss = tf.reduce_sum(policy * log_probs, name='xentropy_loss')
    value_loss1 = tf.nn.l2_loss((value1 - futurereward1) * tf.sqrt(updateweight1), name='value_loss_1')
    value_loss2 = tf.nn.l2_loss((value2 - futurereward2) * tf.sqrt(updateweight2), name='value_loss_2')
    value_loss = tf.add(value_loss1, value_loss2, name='value_loss')

    pred_reward1 = tf.reduce_mean(value1, name='predict_reward_1')
    pred_reward2 = tf.reduce_mean(value2, name='predict_reward_2')
    pred_reward_avg = tf.add(pred_reward1 * 0.5, pred_reward2 * 0.5, name='predict_reward_avg')
    advantage1 = symbf.rms(advantage1, name='rms_advantage_1')
    advantage2 = symbf.rms(advantage2, name='rms_advantage_2')
    advantage_avg = symbf.rms(advantage1 * 0.5 + advantage2 * 0.5, name='rms_advantage_avg')
    entropy_beta = tf.get_variable('entropy_beta', shape=[],
                                   initializer=tf.constant_initializer(0.01), trainable=False)
    self.cost = tf.add_n([policy_loss1, policy_loss2, xentropy_loss * entropy_beta,
                          value_loss1, value_loss2])
    self.cost = tf.truediv(self.cost,
                           tf.cast(tf.shape(futurereward1)[0], tf.float32),
                           name='cost')
    summary.add_moving_summary(policy_loss, xentropy_loss,
                               value_loss, pred_reward1, pred_reward2, pred_reward_avg,
                               advantage1, advantage2, advantage_avg,
                               self.cost, tf.reduce_mean(importance, name='importance'))
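# Note on the weighted value losses above: multiplying the residual by sqrt(updateweight)
# before tf.nn.l2_loss yields a per-sample weighted squared error, because
# tf.nn.l2_loss(x) = sum(x**2) / 2. A tiny NumPy check of that identity (made-up numbers):
import numpy as np

value_err = np.array([1.0, -2.0, 0.5])
w = np.array([1.0, 0.25, 4.0])
l2_weighted = 0.5 * np.sum((value_err * np.sqrt(w)) ** 2)
assert np.isclose(l2_weighted, 0.5 * np.sum(w * value_err ** 2))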
def _build_graph(self, inputs):
    state, action, futurereward = inputs
    policy, self.value = self._get_NN_prediction(state)
    self.value = tf.squeeze(self.value, [1], name='pred_value')  # (B,)
    self.logits = tf.nn.softmax(policy, name='logits')
    expf = tf.get_variable('explore_factor', shape=[],
                           initializer=tf.constant_initializer(1), trainable=False)
    logitsT = tf.nn.softmax(policy * expf, name='logitsT')
    is_training = get_current_tower_context().is_training
    if not is_training:
        return
    log_probs = tf.log(self.logits + 1e-6)
    log_pi_a_given_s = tf.reduce_sum(
        log_probs * tf.one_hot(action, self.number_of_actions), 1)
    # tf.sub was removed in TF 1.0; tf.subtract is the current API
    advantage = tf.subtract(tf.stop_gradient(self.value), futurereward, name='advantage')
    policy_loss = tf.reduce_sum(log_pi_a_given_s * advantage, name='policy_loss')
    xentropy_loss = tf.reduce_sum(self.logits * log_probs, name='xentropy_loss')
    value_loss = tf.nn.l2_loss(self.value - futurereward, name='value_loss')

    pred_reward = tf.reduce_mean(self.value, name='predict_reward')
    advantage = symbf.rms(advantage, name='rms_advantage')
    summary.add_moving_summary(policy_loss, xentropy_loss, value_loss, pred_reward, advantage)

    entropy_beta = tf.get_variable(
        'entropy_beta', shape=[],
        initializer=tf.constant_initializer(0.01), trainable=False)
    self.cost = tf.add_n(
        [policy_loss, xentropy_loss * entropy_beta, value_loss])
    self.cost = tf.truediv(self.cost,
                           tf.cast(tf.shape(futurereward)[0], tf.float32),
                           name='cost')
    # print "DEBUGGING INFO:{}".format(DEBUGING_INFO)
    # assert 1 == 0, "AAA"
    if DEBUGING_INFO:
        logits_mean, logits_var = tf.nn.moments(self.logits, axes=[1])
        # logits_mean_r = tf.reduce_sum(logits_mean)
        logits_var_r = tf.reduce_sum(logits_var)
        # tf.scalar_summary was removed in TF 1.0; tf.summary.scalar is the current API
        # tf.summary.scalar('logits_mean', logits_mean_r)
        tf.summary.scalar('logits_var', logits_var_r)
        tf.summary.scalar('entropy beta', entropy_beta)
        tf.summary.scalar('explore factor', expf)
def perform(var, action):
    ndim = var.get_shape().ndims
    name = var.name.replace(':0', '')
    if action == 'scalar':
        assert ndim == 0, "Scalar summary on high-dimension data. Maybe you want 'mean'?"
        tf.summary.scalar(name, var)
        return
    assert ndim > 0, "Cannot perform {} summary on scalar data".format(action)
    if action == 'histogram':
        tf.summary.histogram(name, var)
        return
    if action == 'sparsity':
        tf.summary.scalar(name + '-sparsity', tf.nn.zero_fraction(var))
        return
    if action == 'mean':
        tf.summary.scalar(name + '-mean', tf.reduce_mean(var))
        return
    if action == 'rms':
        tf.summary.scalar(name + '-rms', rms(var))
        return
    if action == 'absmax':
        tf.summary.scalar(name + '-absmax', tf.reduce_max(tf.abs(var)))
        return
    raise RuntimeError("Unknown summary type: {}".format(action))
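# Usage sketch (assumed tensors, not from the original): `perform` adds one summary op per
# requested action, so a typical caller loops over (tensor, action) pairs.
import tensorflow as tf

activations = tf.placeholder(tf.float32, [None, 128], name='activations')
total_loss = tf.constant(0.3, name='total_loss')
for tensor, act in [(activations, 'histogram'),
                    (activations, 'sparsity'),
                    (activations, 'mean'),
                    (total_loss, 'scalar')]:
    perform(tensor, act)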
def _build_ad_nn(self, tensor_io):
    from drlutils.dataflow.tensor_io import TensorIO
    assert (isinstance(tensor_io, TensorIO))
    from drlutils.model.base import get_current_nn_context
    from tensorpack.tfutils.common import get_global_step_var
    global_step = get_global_step_var()
    nnc = get_current_nn_context()
    is_training = nnc.is_training
    i_state = tensor_io.getInputTensor('state')
    i_agentIdent = tensor_io.getInputTensor('agentIdent')
    i_sequenceLength = tensor_io.getInputTensor('sequenceLength')
    i_resetRNN = tensor_io.getInputTensor('resetRNN')
    l = i_state
    # l = tf.Print(l, [i_state, tf.shape(i_state)], 'State = ')
    # l = tf.Print(l, [i_agentIdent, tf.shape(i_agentIdent)], 'agentIdent = ')
    # l = tf.Print(l, [i_sequenceLength, tf.shape(i_sequenceLength)], 'SeqLen = ')
    # l = tf.Print(l, [i_resetRNN, tf.shape(i_resetRNN)], 'resetRNN = ')
    with tf.variable_scope('critic', reuse=nnc.reuse) as vs:

        def _get_cell():
            cell = tf.nn.rnn_cell.BasicLSTMCell(256)
            # if is_training:
            #     cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=0.9)
            return cell

        cell = tf.nn.rnn_cell.MultiRNNCell([_get_cell() for _ in range(1)])
        rnn_outputs = self._buildRNN(
            l, cell, tensor_io.batchSize,
            i_agentIdent=i_agentIdent,
            i_sequenceLength=i_sequenceLength,
            i_resetRNN=i_resetRNN,
        )
        rnn_outputs = tf.reshape(
            rnn_outputs, [-1, rnn_outputs.get_shape().as_list()[-1]])
        l = rnn_outputs
        from ad_cur.autodrive.model.selu import fc_selu
        for lidx in range(2):
            l = fc_selu(
                l, 200,
                keep_prob=1.,  # we train on sensor inputs only, so no key information may be dropped
                is_training=is_training,
                name='fc-{}'.format(lidx))
        value = tf.layers.dense(l, 1, name='fc-value')
        value = tf.squeeze(value, [1], name="value")
        if not hasattr(self, '_weights_critic'):
            self._weights_critic = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

    with tf.variable_scope('actor', reuse=nnc.reuse) as vs:
        l = tf.stop_gradient(l)
        l = tf.layers.dense(l, 128, activation=tf.nn.relu6, name='fc-actor')
        mu_steering = 0.5 * tf.layers.dense(
            l, 1, activation=tf.nn.tanh, name='fc-mu-steering')
        mu_accel = tf.layers.dense(l, 1, activation=tf.nn.tanh, name='fc-mu-accel')
        mus = tf.concat([mu_steering, mu_accel], axis=-1)
        # mus = tf.layers.dense(l, 2, activation=tf.nn.tanh, name='fc-mus')
        # sigmas = tf.layers.dense(l, 2, activation=tf.nn.softplus, name='fc-sigmas')
        # sigmas = tf.clip_by_value(sigmas, -0.001, 0.5)

        def saturating_sigmoid(x):
            """Saturating sigmoid: 1.2 * sigmoid(x) - 0.1 cut to [0, 1]."""
            with tf.name_scope("saturating_sigmoid", [x]):
                y = tf.sigmoid(x)
                return tf.minimum(1.0, tf.maximum(0.0, 1.2 * y - 0.1))

        sigma_steering_ = 0.1 * tf.layers.dense(
            l, 1, activation=tf.nn.sigmoid, name='fc-sigma-steering')
        sigma_accel_ = 0.25 * tf.layers.dense(
            l, 1, activation=tf.nn.sigmoid, name='fc-sigma-accel')
        if not nnc.is_evaluating:
            sigma_beta_steering = tf.get_default_graph().get_tensor_by_name(
                'actor/sigma_beta_steering:0')
            sigma_beta_accel = tf.get_default_graph().get_tensor_by_name(
                'actor/sigma_beta_accel:0')
            sigma_beta_steering = tf.constant(1e-4)
            # sigma_beta_steering_exp = tf.train.exponential_decay(0.3, global_step, 1000, 0.5, name='sigma/beta/steering/exp')
            # sigma_beta_accel_exp = tf.train.exponential_decay(0.5, global_step, 5000, 0.5, name='sigma/beta/accel/exp')
        else:
            sigma_beta_steering = tf.constant(1e-4)
            sigma_beta_accel = tf.constant(1e-4)
        sigma_steering = (sigma_steering_ + sigma_beta_steering)
        sigma_accel = (sigma_accel_ + sigma_beta_accel)
        sigmas = tf.concat([sigma_steering, sigma_accel], axis=-1)
        # if is_training:
        #     pass
        #     # Without sigma_beta, convergence is very slow and unstable; likely reasons:
        #     # 1. Plenty of exploration early in training keeps the network out of poor local optima.
        #     # 2. A sigma that is too small early on makes the normal dist's log_prob very large,
        #     #    so gradient updates blow up, the network is deformed from the start, and it is hard to recover.
        #
        # if is_training:
        #     sigmas += sigma_beta_steering
        #     sigma_steering = tf.clip_by_value(sigma_steering, sigma_beta_steering, 0.5)
        #     sigma_accel = tf.clip_by_value(sigma_accel, sigma_beta_accel, 0.5)
        # sigmas = tf.clip_by_value(sigmas, 0.1, 0.5)
        # sigmas_orig = sigmas
        # sigmas = sigmas + sigma_beta_steering
        # sigmas = tf.minimum(sigmas + 0.1, 100)
        # sigmas = tf.clip_by_value(sigmas, sigma_beta_steering, 1)
        # sigma_steering += sigma_beta_steering
        # sigma_accel += sigma_beta_accel
        # mus = tf.concat([mu_steering, mu_accel], axis=-1)
        from tensorflow.contrib.distributions import Normal
        dists = Normal(mus, sigmas + 0.01)
        policy = tf.squeeze(dists.sample([1]), [0])
        # clip sampled actions to within two standard deviations of the mean
        policy = tf.clip_by_value(policy, mus - 2 * sigmas, mus + 2 * sigmas)
        if is_training:
            self._addMovingSummary(
                tf.reduce_mean(mu_steering, name='mu/steering/mean'),
                tf.reduce_mean(mu_accel, name='mu/accel/mean'),
                tf.reduce_mean(sigma_steering, name='sigma/steering/mean'),
                tf.reduce_max(sigma_steering, name='sigma/steering/max'),
                tf.reduce_mean(sigma_accel, name='sigma/accel/mean'),
                tf.reduce_max(sigma_accel, name='sigma/accel/max'),
                # sigma_beta_accel,
                # sigma_beta_steering,
            )
        # actions = tf.Print(actions, [mus, sigmas, tf.concat([sigma_steering_, sigma_accel_], -1), actions],
        #                    'mu/sigma/sigma.orig/act=', summarize=4)
        if not hasattr(self, '_weights_actor'):
            self._weights_actor = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

    if not is_training:
        tensor_io.setOutputTensors(policy, value, mus, sigmas)
        return

    i_actions = tensor_io.getInputTensor("action")
    # i_actions = tf.Print(i_actions, [i_actions], 'actions = ')
    i_actions = tf.reshape(i_actions, [-1] + i_actions.get_shape().as_list()[2:])
    log_probs = dists.log_prob(i_actions)
    # exp_v = tf.transpose(
    #     tf.multiply(tf.transpose(log_probs), advantage))
    # exp_v = tf.multiply(log_probs, advantage)
    i_advantage = tensor_io.getInputTensor("advantage")
    i_advantage = tf.reshape(i_advantage, [-1] + i_advantage.get_shape().as_list()[2:])
    exp_v = log_probs * tf.expand_dims(i_advantage, -1)
    entropy = dists.entropy()
    entropy_beta = tf.get_variable(
        'entropy_beta', shape=[],
        initializer=tf.constant_initializer(0.01), trainable=False)
    exp_v = entropy_beta * entropy + exp_v
    loss_policy = tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1), name='loss/policy')
    i_futurereward = tensor_io.getInputTensor("futurereward")
    i_futurereward = tf.reshape(i_futurereward,
                                [-1] + i_futurereward.get_shape().as_list()[2:])
    loss_value = tf.reduce_mean(0.5 * tf.square(value - i_futurereward))
    loss_entropy = tf.reduce_mean(tf.reduce_sum(entropy, axis=-1), name='xentropy_loss')
    from tensorflow.contrib.layers.python.layers.regularizers import apply_regularization, l2_regularizer
    loss_l2_regularizer = apply_regularization(l2_regularizer(1e-4), self._weights_critic)
    loss_l2_regularizer = tf.identity(loss_l2_regularizer, 'loss/l2reg')
    loss_value += loss_l2_regularizer
    loss_value = tf.identity(loss_value, name='loss/value')
    # self.cost = tf.add_n([loss_policy, loss_value * 0.1, loss_l2_regularizer])
    self._addParamSummary([('.*', ['rms', 'absmax'])])
    pred_reward = tf.reduce_mean(value, name='predict_reward')
    import tensorpack.tfutils.symbolic_functions as symbf
    advantage = symbf.rms(i_advantage, name='rms_advantage')
    self._addMovingSummary(
        loss_policy, loss_value, loss_entropy, pred_reward, advantage,
        loss_l2_regularizer,
        tf.reduce_mean(policy[:, 0], name='actor/steering/mean'),
        tf.reduce_mean(policy[:, 1], name='actor/accel/mean'),
    )
    return loss_policy, loss_value
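# For reference, a NumPy sketch (made-up numbers, not part of the graph) of what
# dists.log_prob and dists.entropy compute for the diagonal Gaussian policy above,
# and how they combine into the policy loss -(log_prob * advantage + entropy_beta * entropy):
import numpy as np

def normal_log_prob(a, mu, sigma):
    return -0.5 * ((a - mu) / sigma) ** 2 - np.log(sigma) - 0.5 * np.log(2 * np.pi)

def normal_entropy(sigma):
    return 0.5 * np.log(2 * np.pi * np.e * sigma ** 2)

mu, sigma = np.array([0.1, -0.3]), np.array([0.11, 0.26])  # steering, accel (made-up)
action = np.array([0.05, -0.2])
advantage = -0.8                                            # made-up scalar advantage
entropy_beta = 0.01
exp_v = normal_log_prob(action, mu, sigma) * advantage + entropy_beta * normal_entropy(sigma)
loss_policy = -np.sum(exp_v)  # mirrors tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1)) for one sample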
def _build_graph(self, inputs):
    state, action, futurereward = inputs

    with tf.variable_scope('potential'):
        action_prediction_logits = self._get_human_action_prediction(state)
        action_prediction = tf.nn.softmax(action_prediction_logits)
        action_prediction = tf.stop_gradient(action_prediction)

    logits, self.value = self._get_NN_prediction(state)

    # reward shaping with negative cross_entropy
    # cross_entropy returns values close to 0 if labels and logits agree
    # and values growing more and more towards inf if labels and logits disagree
    mean_score = tf.get_variable('mean_score', shape=[],
                                 initializer=tf.constant_initializer(0), trainable=False)
    avg_cross_entropy = -np.log(1 / float(NUM_ACTIONS))
    avg_human_performance = tf.constant(self.avg_human_performance)
    temperature = tf.nn.relu(
        (avg_human_performance - mean_score) / avg_human_performance,
        name='temperature')
    shaping = tf.nn.softmax_cross_entropy_with_logits(
        labels=action_prediction, logits=logits)
    shaping_loss = temperature * tf.reduce_sum(shaping)
    shaping_beta = tf.get_variable(
        'shaping_beta', shape=[],
        initializer=tf.constant_initializer(0.01), trainable=False)
    # reward_shaping = -tf.clip_by_value(reward_shaping, 0.0, avg_cross_entropy)
    summary.add_moving_summary(
        tf.reduce_mean(futurereward, name='futurereward'),
        tf.reduce_mean(shaping, name='mean_shaping_loss'))

    self.value = tf.squeeze(self.value, [1], name='pred_value')  # (B,)
    self.policy = tf.nn.softmax(logits, name='policy')
    expf = tf.get_variable('explore_factor', shape=[],
                           initializer=tf.constant_initializer(1), trainable=False)
    policy_explore = tf.nn.softmax(logits * expf, name='policy_explore')
    is_training = get_current_tower_context().is_training
    if not is_training:
        return
    log_probs = tf.log(self.policy + 1e-6)
    log_pi_a_given_s = tf.reduce_sum(
        log_probs * tf.one_hot(action, NUM_ACTIONS), 1)
    advantage = tf.subtract(tf.stop_gradient(self.value), futurereward, name='advantage')
    policy_loss = tf.reduce_sum(log_pi_a_given_s * advantage, name='policy_loss')
    xentropy_loss = tf.reduce_sum(self.policy * log_probs, name='xentropy_loss')
    value_loss = tf.nn.l2_loss(self.value - futurereward, name='value_loss')

    pred_reward = tf.reduce_mean(self.value, name='predict_reward')
    advantage = symbf.rms(advantage, name='rms_advantage')
    entropy_beta = tf.get_variable(
        'entropy_beta', shape=[],
        initializer=tf.constant_initializer(0.01), trainable=False)
    self.cost = tf.add_n([
        policy_loss, xentropy_loss * entropy_beta, value_loss,
        shaping_beta * shaping_loss
    ])
    self.cost = tf.truediv(self.cost,
                           tf.cast(tf.shape(futurereward)[0], tf.float32),
                           name='cost')
    summary.add_moving_summary(policy_loss, xentropy_loss, temperature,
                               value_loss, pred_reward, advantage, self.cost)
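# Small numeric illustration (made-up numbers) of the shaping temperature above:
# temperature = relu((avg_human_performance - mean_score) / avg_human_performance),
# so the shaping term starts at 1.0 when mean_score is 0 and decays to 0 once the
# agent reaches (or exceeds) average human performance.
avg_human_performance = 400.0
for mean_score in (0.0, 100.0, 400.0, 600.0):
    temperature = max(0.0, (avg_human_performance - mean_score) / avg_human_performance)
    print(mean_score, temperature)  # 0 -> 1.0, 100 -> 0.75, 400 -> 0.0, 600 -> 0.0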
def _build_graph(self, inputs):
    from tensorpack.tfutils.common import get_global_step_var
    state, action, futurereward, advantage = inputs
    is_training = get_current_tower_context().is_training
    policy, value, dists = self._get_NN_prediction(state)
    if not hasattr(self, '_weights_train'):
        self._weights_train = self._weights_critic + self._weights_actor
    self.value = tf.squeeze(value, [1], name='value')  # (B,)
    self.policy = tf.identity(policy, name='policy')

    with tf.variable_scope("Pred") as vs:
        __p, __v, _ = self._get_NN_prediction(state)
        __v = tf.squeeze(__v, [1], name='value')  # (B,)
        __p = tf.identity(__p, name='policy')
        if not hasattr(self, '_weights_pred'):
            self._weights_pred = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
            assert (len(self._weights_train) == len(self._weights_pred))
            assert (not hasattr(self, '_sync_op'))
            self._sync_op = tf.group(*[
                d.assign(s + tf.truncated_normal(tf.shape(s), stddev=0.02))
                for d, s in zip(self._weights_pred, self._weights_train)
            ])

    with tf.variable_scope('pre') as vs:
        pre_p, pre_v, pre_dists = self._get_NN_prediction(state)
        if not hasattr(self, 'pre_weights'):
            self.pre_weights = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
            self._td_sync_op = tf.group(*[
                d.assign(s) for d, s in zip(self.pre_weights, self._weights_train)
            ])

    if not is_training:
        return

    # advantage = tf.subtract(tf.stop_gradient(self.value), futurereward, name='advantage')
    # advantage = tf.Print(advantage, [self.value, futurereward, action, advantage],
    #                      'value/reward/act/advantage=', summarize=4)
    log_probs = dists.log_prob(action)

    # PPO clipped policy loss: ratio, surr1, surr2
    pre_probs = pre_dists.log_prob(action)
    ratio = tf.exp(log_probs - pre_probs)
    prob_ratio = tf.reduce_mean(input_tensor=tf.concat(values=ratio, axis=1), axis=1)
    clip_param = tf.train.exponential_decay(CLIP_PARAMETER, get_global_step_var(),
                                            10000, 0.98, name='clip_param')
    # surr1 = prob_ratio * advantage
    surr1 = ratio * tf.expand_dims(advantage, -1)
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * tf.expand_dims(advantage, -1)
    # surr2 = tf.clip_by_value(prob_ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage
    loss_policy = -tf.reduce_mean(tf.minimum(surr1, surr2))

    # clipped critic loss
    v_loss1 = tf.square(value - futurereward)
    pre_value = pre_v + tf.clip_by_value(value - pre_v, -clip_param, clip_param)
    v_loss2 = tf.square(pre_v - futurereward)
    # loss_value = 0.5 * tf.reduce_mean(tf.maximum(v_loss1, v_loss2))
    loss_value = 0.5 * tf.reduce_mean(v_loss1)

    entropy = dists.entropy()
    entropy_beta = tf.get_variable(
        'entropy_beta', shape=[],
        initializer=tf.constant_initializer(0.01), trainable=False)
    exp_v = entropy_beta * entropy
    loss_entropy = tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1), name='loss/policy')
    loss_policy = loss_policy + loss_entropy
    # exp_v = tf.transpose(
    #     tf.multiply(tf.transpose(log_probs), advantage))
    # exp_v = tf.multiply(log_probs, advantage)
    # exp_v = log_probs * tf.expand_dims(advantage, -1)
    # entropy = dists.entropy()
    # entropy_beta = tf.get_variable('entropy_beta', shape=[],
    #                                initializer=tf.constant_initializer(0.01), trainable=False)
    # exp_v = entropy_beta * entropy + exp_v
    # loss_value = tf.reduce_mean(0.5 * tf.square(self.value - futurereward))
    # loss_entropy = tf.reduce_mean(tf.reduce_sum(entropy, axis=-1), name='xentropy_loss')

    from tensorflow.contrib.layers.python.layers.regularizers import apply_regularization, l2_regularizer
    loss_l2_regularizer = apply_regularization(l2_regularizer(1e-4), self._weights_critic)
    loss_l2_regularizer = tf.identity(loss_l2_regularizer, 'loss/l2reg')
    loss_value += loss_l2_regularizer
    loss_value = tf.identity(loss_value, name='loss/value')
    # self.cost = tf.add_n([loss_policy, loss_value * 0.1, loss_l2_regularizer])
    self._cost = [loss_policy, loss_value]

    from autodrive.trainer.summary import addParamSummary
    addParamSummary([('.*', ['rms', 'absmax'])])
    pred_reward = tf.reduce_mean(self.value, name='predict_reward')
    advantage = symbf.rms(advantage, name='rms_advantage')
    summary.add_moving_summary(
        loss_policy, loss_value, loss_entropy, pred_reward, advantage,
        loss_l2_regularizer,
        tf.reduce_mean(self.policy[:, 0], name='action/steering/mean'),
        tf.reduce_mean(self.policy[:, 1], name='action/accel/mean'),
    )
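# A minimal NumPy sketch (made-up numbers, not part of the graph) of the PPO clipped
# surrogate built above: ratio = exp(log_prob - old_log_prob), and the objective takes
# the element-wise minimum of the unclipped and clipped terms before averaging.
import numpy as np

ratio = np.array([1.3, 0.95, 0.7])      # new/old policy probability ratios
advantage = np.array([1.0, 1.0, -2.0])
clip_param = 0.2
surr1 = ratio * advantage
surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage
loss_policy = -np.mean(np.minimum(surr1, surr2))
# surr2 caps the first sample at 1.2 * 1.0, and the min picks the more pessimistic -1.6
# for the third (negative-advantage) sample, so ratios far from 1 cannot dominate the update.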