def test_apply_regularization_invalid_regularizer(self):
    """A regularizer that yields a non-scalar tensor must raise ValueError."""

    def tile_twice(weights):
        # Tiling produces a tensor with elements, never a scalar penalty.
        return array_ops.tile(weights, [2])

    raw_weights = [[1.5], [2, 3, 4.2], [10, 42, 666.6]]
    weight_tensors = list(map(constant_op.constant, raw_weights))

    with self.cached_session(), self.assertRaises(ValueError):
        regularizers.apply_regularization(tile_twice, weight_tensors)
def test_apply_regularization_invalid_regularizer(self):
    """Check that a regularizer returning a non-scalar is rejected.

    The regularizer below tiles its input, so its result is never a scalar;
    the test expects ``apply_regularization`` to raise ``ValueError``.
    """
    non_scalar_regularizer = lambda x: array_ops.tile(x, [2])
    tensor_weights_list = [
        constant_op.constant(x)
        for x in [[1.5], [2, 3, 4.2], [10, 42, 666.6]]
    ]
    # cached_session() replaces the deprecated test_session() and matches
    # the other regularizer tests in this file.
    with self.cached_session():
        with self.assertRaises(ValueError):
            regularizers.apply_regularization(non_scalar_regularizer,
                                              tensor_weights_list)
def neural_attention(embedding_dim=384, encoding_dim=128):
    """Build the alternating query/document attention graph.

    Relies on names defined outside this function: ``vocab_size``, the id
    tensors ``X`` (document) and ``Q`` (query), ``keep_prob``,
    ``batch_size`` and the helper ``glimpse``.

    Args:
        embedding_dim: width of the shared token embedding table.
        encoding_dim: hidden size of each directional GRU encoder; the
            bidirectional encodings are therefore ``2 * encoding_dim`` wide.

    Returns:
        The document attention from the last inference step, multiplied by
        ``sign(abs(X))`` cast to float (zero wherever ``X`` is zero).
    """
    embeddings = tf.Variable(
        tf.random_normal([vocab_size, embedding_dim], stddev=0.22),
        dtype=tf.float32)
    # The return value is intentionally ignored here.
    # NOTE(review): presumably the penalty is picked up later through the
    # REGULARIZATION_LOSSES collection -- confirm against the caller.
    regularizers.apply_regularization(
        regularizers.l2_regularizer(1e-4), [embeddings])

    with tf.variable_scope('encode'):
        with tf.variable_scope('X'):
            X_lens = tf.reduce_sum(tf.sign(tf.abs(X)), 1)
            embedded_X = tf.nn.embedding_lookup(embeddings, X)
            encoded_X = tf.nn.dropout(embedded_X, keep_prob)
            # Fix: size the document encoder with encoding_dim (as for Q).
            # b_d, g_d/g_q (10 * encoding_dim = 4 + 2 + 2 + 2) and the
            # product q_glimpse * d_glimpse all require document encodings
            # of width 2 * encoding_dim, not 2 * embedding_dim.
            gru_cell = tf.nn.rnn_cell.GRUCell(encoding_dim)
            # Fix: feed the dropped-out embeddings; previously the dropout
            # result was computed and then discarded.
            outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                gru_cell, gru_cell, encoded_X,
                sequence_length=X_lens, dtype=tf.float32, swap_memory=True)
            encoded_X = tf.concat(outputs, 2)

        with tf.variable_scope('Q'):
            Q_lens = tf.reduce_sum(tf.sign(tf.abs(Q)), 1)
            embedded_Q = tf.nn.embedding_lookup(embeddings, Q)
            encoded_Q = tf.nn.dropout(embedded_Q, keep_prob)
            gru_cell = tf.nn.rnn_cell.GRUCell(encoding_dim)
            outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                gru_cell, gru_cell, encoded_Q,
                sequence_length=Q_lens, dtype=tf.float32, swap_memory=True)
            encoded_Q = tf.concat(outputs, 2)

    # Projection parameters handed to glimpse() for query and document.
    W_q = tf.Variable(
        tf.random_normal([2 * encoding_dim, 4 * encoding_dim], stddev=0.22),
        dtype=tf.float32)
    b_q = tf.Variable(
        tf.random_normal([2 * encoding_dim, 1], stddev=0.22),
        dtype=tf.float32)
    W_d = tf.Variable(
        tf.random_normal([2 * encoding_dim, 6 * encoding_dim], stddev=0.22),
        dtype=tf.float32)
    b_d = tf.Variable(
        tf.random_normal([2 * encoding_dim, 1], stddev=0.22),
        dtype=tf.float32)
    # Gate parameters applied to the concatenated state and glimpses.
    g_q = tf.Variable(
        tf.random_normal([10 * encoding_dim, 2 * encoding_dim], stddev=0.22),
        dtype=tf.float32)
    g_d = tf.Variable(
        tf.random_normal([10 * encoding_dim, 2 * encoding_dim], stddev=0.22),
        dtype=tf.float32)

    with tf.variable_scope('attend') as scope:
        infer_gru = tf.nn.rnn_cell.GRUCell(4 * encoding_dim)
        infer_state = infer_gru.zero_state(batch_size, tf.float32)
        for iter_step in range(8):
            # Share the inference GRU weights across all eight steps.
            if iter_step > 0:
                scope.reuse_variables()

            _, q_glimpse = glimpse(W_q, b_q, encoded_Q, infer_state)
            d_attention, d_glimpse = glimpse(
                W_d, b_d, encoded_X, tf.concat([infer_state, q_glimpse], 1))

            gate_concat = tf.concat(
                [infer_state, q_glimpse, d_glimpse, q_glimpse * d_glimpse], 1)

            r_d = tf.sigmoid(tf.matmul(gate_concat, g_d))
            r_d = tf.nn.dropout(r_d, keep_prob)
            r_q = tf.sigmoid(tf.matmul(gate_concat, g_q))
            r_q = tf.nn.dropout(r_q, keep_prob)

            combined_gated_glimpse = tf.concat(
                [r_q * q_glimpse, r_d * d_glimpse], 1)
            _, infer_state = infer_gru(combined_gated_glimpse, infer_state)

    # Zero the attention wherever the document id is zero.
    return tf.to_float(tf.sign(tf.abs(X))) * d_attention
def build_graph(self):
    """Assemble the training graph: loss, optimizer step and summaries.

    Returns:
        Tuple ``(saver, logits, neg_ELBO, train_op, merged)`` where
        ``neg_ELBO`` is the minimized loss and ``merged`` is the merged
        summary op.
    """
    self._construct_weights()
    saver, logits, kl_term = self.forward_pass()

    # Negative log-likelihood: sum over the last axis, mean over the rest.
    log_probs = tf.nn.log_softmax(logits)
    summed_ll = tf.reduce_sum(log_probs * self.input_ph, axis=-1)
    neg_log_lik = -tf.reduce_mean(summed_ll)

    # L2 penalty over both weight lists.
    l2_penalty = apply_regularization(
        l2_regularizer(self.lam), self.weights_q + self.weights_p)

    # TensorFlow's l2 regularizer scales the norm by 0.5; multiplying by 2
    # brings the penalty back to the original scale.
    neg_elbo = neg_log_lik + self.anneal_ph * kl_term + 2 * l2_penalty
    optimize_step = tf.train.AdamOptimizer(self.lr).minimize(neg_elbo)

    # Scalar summaries, registered in a fixed order.
    scalar_summaries = (
        ('negative_multi_ll', neg_log_lik),
        ('KL', kl_term),
        ('neg_ELBO_train', neg_elbo),
    )
    for tag, tensor in scalar_summaries:
        tf.summary.scalar(tag, tensor)
    all_summaries = tf.summary.merge_all()

    return saver, logits, neg_elbo, optimize_step, all_summaries
def test_apply_zero_regularization(self):
    """An L2 regularizer with scale 0.0 must yield a total penalty of 0."""
    zero_l2 = regularizers.l2_regularizer(0.0)
    raw_weights = [[1.5], [2, 3, 4.2], [10, 42, 666.6]]
    weight_tensors = list(map(constant_op.constant, raw_weights))

    with self.cached_session():
        penalty = regularizers.apply_regularization(zero_l2, weight_tensors)
        self.assertAllClose(0.0, penalty.eval())
def test_apply_regularization(self):
    """The applied penalty equals the regularizer summed over all weights."""

    def double_and_sum(weights):
        return math_ops.reduce_sum(2 * weights)

    raw_weights = [[1.5], [2, 3, 4.2], [10, 42, 666.6]]
    weight_tensors = list(map(constant_op.constant, raw_weights))
    # Reference value computed in plain Python over every element.
    expected_total = sum(2 * value for row in raw_weights for value in row)

    with self.cached_session():
        penalty = regularizers.apply_regularization(double_and_sum,
                                                    weight_tensors)
        self.assertAllClose(expected_total, penalty.eval())
def test_apply_zero_regularization(self):
    """Check that an L2 regularizer with scale 0.0 gives a zero penalty."""
    regularizer = regularizers.l2_regularizer(0.0)
    array_weights_list = [[1.5], [2, 3, 4.2], [10, 42, 666.6]]
    tensor_weights_list = [
        constant_op.constant(x) for x in array_weights_list
    ]
    # cached_session() replaces the deprecated test_session() and matches
    # the other regularizer tests in this file.
    with self.cached_session():
        result = regularizers.apply_regularization(regularizer,
                                                   tensor_weights_list)
        self.assertAllClose(0.0, result.eval())
def test_apply_regularization(self):
    """Check the applied penalty against a plain-Python reference sum."""
    dummy_regularizer = lambda x: math_ops.reduce_sum(2 * x)
    array_weights_list = [[1.5], [2, 3, 4.2], [10, 42, 666.6]]
    tensor_weights_list = [
        constant_op.constant(x) for x in array_weights_list
    ]
    # Reference: the same "2 * x" rule summed over every weight element.
    expected = sum(
        2 * x for weights in array_weights_list for x in weights)
    # cached_session() replaces the deprecated test_session() and matches
    # the other regularizer tests in this file.
    with self.cached_session():
        result = regularizers.apply_regularization(dummy_regularizer,
                                                   tensor_weights_list)
        self.assertAllClose(expected, result.eval())
def build_graph(self):
    """Assemble the training graph: loss, optimizer step and summaries.

    Returns:
        Tuple ``(saver, logits, loss, train_op, merged)`` where ``merged``
        is the merged summary op.
    """
    self.construct_weights()
    saver, logits = self.forward_pass()

    # Per-user average negative log-likelihood.
    log_probs = tf.nn.log_softmax(logits)
    summed_ll = tf.reduce_sum(log_probs * self.input_ph, axis=1)
    neg_log_lik = -tf.reduce_mean(summed_ll)

    # L2 penalty over the model weights.
    l2_penalty = apply_regularization(l2_regularizer(self.lam), self.weights)

    # TensorFlow's l2 regularizer scales the norm by 0.5; multiplying by 2
    # brings the penalty back to the original scale.
    total_loss = neg_log_lik + 2 * l2_penalty
    optimize_step = tf.train.AdamOptimizer(self.lr).minimize(total_loss)

    # Scalar summaries, registered in a fixed order.
    scalar_summaries = (
        ('negative_multi_ll', neg_log_lik),
        ('loss', total_loss),
    )
    for tag, tensor in scalar_summaries:
        tf.summary.scalar(tag, tensor)
    all_summaries = tf.summary.merge_all()

    return saver, logits, total_loss, optimize_step, all_summaries
def _build_ad_nn(self, tensor_io):
    """Build the recurrent actor-critic network and, when training, its losses.

    The critic branch runs the ``state`` input through a single-layer LSTM
    (via ``self._buildRNN``), two ``fc_selu`` layers and a dense head that
    yields ``value``. The actor branch starts from the critic features behind
    ``tf.stop_gradient`` and produces means and sigmas for the two actions
    (steering, accel), from which a clipped sample ``policy`` is drawn.

    Args:
        tensor_io: a ``TensorIO`` instance providing the input tensors
            ('state', 'agentIdent', 'sequenceLength', 'resetRNN', and when
            training 'action', 'advantage', 'futurereward') and receiving
            the outputs when not training.

    Returns:
        ``None`` when not training (outputs are registered through
        ``tensor_io.setOutputTensors(policy, value, mus, sigmas)``);
        otherwise the tuple ``(loss_policy, loss_value)``.

    Side effects:
        Caches ``self._weights_critic`` / ``self._weights_actor`` on first
        call and registers moving / parameter summaries when training.
    """
    from drlutils.dataflow.tensor_io import TensorIO
    assert (isinstance(tensor_io, TensorIO))
    from drlutils.model.base import get_current_nn_context
    from tensorpack.tfutils.common import get_global_step_var
    # NOTE(review): global_step is only referenced by commented-out code below.
    global_step = get_global_step_var()
    nnc = get_current_nn_context()
    is_training = nnc.is_training
    i_state = tensor_io.getInputTensor('state')
    i_agentIdent = tensor_io.getInputTensor('agentIdent')
    i_sequenceLength = tensor_io.getInputTensor('sequenceLength')
    i_resetRNN = tensor_io.getInputTensor('resetRNN')
    l = i_state
    # l = tf.Print(l, [i_state, tf.shape(i_state)], 'State = ')
    # l = tf.Print(l, [i_agentIdent, tf.shape(i_agentIdent)], 'agentIdent = ')
    # l = tf.Print(l, [i_sequenceLength, tf.shape(i_sequenceLength)], 'SeqLen = ')
    # l = tf.Print(l, [i_resetRNN, tf.shape(i_resetRNN)], 'resetRNN = ')
    with tf.variable_scope('critic', reuse=nnc.reuse) as vs:

        def _get_cell():
            cell = tf.nn.rnn_cell.BasicLSTMCell(256)
            # if is_training:
            #     cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=0.9)
            return cell

        cell = tf.nn.rnn_cell.MultiRNNCell([_get_cell() for _ in range(1)])
        rnn_outputs = self._buildRNN(
            l,
            cell,
            tensor_io.batchSize,
            i_agentIdent=i_agentIdent,
            i_sequenceLength=i_sequenceLength,
            i_resetRNN=i_resetRNN,
        )
        # Flatten every leading dimension so the dense layers see 2-D input.
        rnn_outputs = tf.reshape(
            rnn_outputs, [-1, rnn_outputs.get_shape().as_list()[-1]])
        l = rnn_outputs
        from ad_cur.autodrive.model.selu import fc_selu
        for lidx in range(2):
            l = fc_selu(
                l,
                200,
                keep_prob=1.,  # we train from sensors only, so key information must not be dropped
                is_training=is_training,
                name='fc-{}'.format(lidx))
        value = tf.layers.dense(l, 1, name='fc-value')
        value = tf.squeeze(value, [1], name="value")
        # Record the critic variables once; they are L2-regularized below.
        if not hasattr(self, '_weights_critic'):
            self._weights_critic = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
    with tf.variable_scope('actor', reuse=nnc.reuse) as vs:
        # Actor gradients must not flow back into the critic features.
        l = tf.stop_gradient(l)
        l = tf.layers.dense(l, 128, activation=tf.nn.relu6, name='fc-actor')
        # tanh scaled by 0.5 keeps the steering mean inside [-0.5, 0.5].
        mu_steering = 0.5 * tf.layers.dense(
            l, 1, activation=tf.nn.tanh, name='fc-mu-steering')
        mu_accel = tf.layers.dense(l, 1, activation=tf.nn.tanh, name='fc-mu-accel')
        mus = tf.concat([mu_steering, mu_accel], axis=-1)

        # mus = tf.layers.dense(l, 2, activation=tf.nn.tanh, name='fc-mus')
        # sigmas = tf.layers.dense(l, 2, activation=tf.nn.softplus, name='fc-sigmas')
        # sigmas = tf.clip_by_value(sigmas, -0.001, 0.5)
        # NOTE(review): saturating_sigmoid is defined but not called in this function.
        def saturating_sigmoid(x):
            """Saturating sigmoid: 1.2 * sigmoid(x) - 0.1 cut to [0, 1]."""
            with tf.name_scope("saturating_sigmoid", [x]):
                y = tf.sigmoid(x)
                return tf.minimum(1.0, tf.maximum(0.0, 1.2 * y - 0.1))

        # Sigmoid outputs scaled to (0, 0.1) for steering and (0, 0.25) for accel.
        sigma_steering_ = 0.1 * tf.layers.dense(
            l, 1, activation=tf.nn.sigmoid, name='fc-sigma-steering')
        sigma_accel_ = 0.25 * tf.layers.dense(
            l, 1, activation=tf.nn.sigmoid, name='fc-sigma-accel')
        if not nnc.is_evaluating:
            # Exploration offsets are looked up by name in the default graph.
            sigma_beta_steering = tf.get_default_graph().get_tensor_by_name(
                'actor/sigma_beta_steering:0')
            sigma_beta_accel = tf.get_default_graph().get_tensor_by_name(
                'actor/sigma_beta_accel:0')
            # NOTE(review): this overrides the steering tensor fetched above
            # with a constant -- confirm whether that is intentional.
            sigma_beta_steering = tf.constant(1e-4)
            # sigma_beta_steering_exp = tf.train.exponential_decay(0.3, global_step, 1000, 0.5, name='sigma/beta/steering/exp')
            # sigma_beta_accel_exp = tf.train.exponential_decay(0.5, global_step, 5000, 0.5, name='sigma/beta/accel/exp')
        else:
            sigma_beta_steering = tf.constant(1e-4)
            sigma_beta_accel = tf.constant(1e-4)
        sigma_steering = (sigma_steering_ + sigma_beta_steering)
        sigma_accel = (sigma_accel_ + sigma_beta_accel)
        sigmas = tf.concat([sigma_steering, sigma_accel], axis=-1)
        # if is_training:
        #     pass
        #     # Without sigma_beta, convergence is very slow and unstable; suspected reasons:
        #     # 1. Exploring as widely as possible early in training keeps the network out of local optima.
        #     # 2. A sigma that is too small early on makes the normal distribution's log_prob too large,
        #     #    which causes oversized gradient updates; the network is distorted from the start
        #     #    and is hard to recover.
        # if is_training:
        #     sigmas += sigma_beta_steering
        # sigma_steering = tf.clip_by_value(sigma_steering, sigma_beta_steering, 0.5)
        # sigma_accel = tf.clip_by_value(sigma_accel, sigma_beta_accel, 0.5)
        # sigmas = tf.clip_by_value(sigmas, 0.1, 0.5)
        # sigmas_orig = sigmas
        # sigmas = sigmas + sigma_beta_steering
        # sigmas = tf.minimum(sigmas + 0.1, 100)
        # sigmas = tf.clip_by_value(sigmas, sigma_beta_steering, 1)
        # sigma_steering += sigma_beta_steering
        # sigma_accel += sigma_beta_accel

        # mus = tf.concat([mu_steering, mu_accel], axis=-1)
        from tensorflow.contrib.distributions import Normal
        # A constant 0.01 is added to the sigmas passed to the distribution.
        dists = Normal(mus, sigmas + 0.01)
        policy = tf.squeeze(dists.sample([1]), [0])
        # Clip the sample to within two sigmas of the mean.
        policy = tf.clip_by_value(policy, mus - 2 * sigmas, mus + 2 * sigmas)
        if is_training:
            self._addMovingSummary(
                tf.reduce_mean(mu_steering, name='mu/steering/mean'),
                tf.reduce_mean(mu_accel, name='mu/accel/mean'),
                tf.reduce_mean(sigma_steering, name='sigma/steering/mean'),
                tf.reduce_max(sigma_steering, name='sigma/steering/max'),
                tf.reduce_mean(sigma_accel, name='sigma/accel/mean'),
                tf.reduce_max(sigma_accel, name='sigma/accel/max'),
                # sigma_beta_accel,
                # sigma_beta_steering,
            )
        # actions = tf.Print(actions, [mus, sigmas, tf.concat([sigma_steering_, sigma_accel_], -1), actions],
        #                    'mu/sigma/sigma.orig/act=', summarize=4)
        if not hasattr(self, '_weights_actor'):
            self._weights_actor = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
    # Inference only needs the outputs; no losses are built.
    if not is_training:
        tensor_io.setOutputTensors(policy, value, mus, sigmas)
        return
    i_actions = tensor_io.getInputTensor("action")
    # i_actions = tf.Print(i_actions, [i_actions], 'actions = ')
    # Merge the first two dimensions, as done for the RNN outputs above.
    i_actions = tf.reshape(i_actions,
                           [-1] + i_actions.get_shape().as_list()[2:])
    log_probs = dists.log_prob(i_actions)
    # exp_v = tf.transpose(
    #     tf.multiply(tf.transpose(log_probs), advantage))
    # exp_v = tf.multiply(log_probs, advantage)
    i_advantage = tensor_io.getInputTensor("advantage")
    i_advantage = tf.reshape(i_advantage,
                             [-1] + i_advantage.get_shape().as_list()[2:])
    # Advantage-weighted log-probability plus an entropy bonus.
    exp_v = log_probs * tf.expand_dims(i_advantage, -1)
    entropy = dists.entropy()
    entropy_beta = tf.get_variable(
        'entropy_beta',
        shape=[],
        initializer=tf.constant_initializer(0.01),
        trainable=False)
    exp_v = entropy_beta * entropy + exp_v
    loss_policy = tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1),
                                 name='loss/policy')
    i_futurereward = tensor_io.getInputTensor("futurereward")
    i_futurereward = tf.reshape(
        i_futurereward, [-1] + i_futurereward.get_shape().as_list()[2:])
    loss_value = tf.reduce_mean(0.5 * tf.square(value - i_futurereward))
    # Only reported as a summary; the op is named 'xentropy_loss' although it
    # holds the mean entropy.
    loss_entropy = tf.reduce_mean(tf.reduce_sum(entropy, axis=-1),
                                  name='xentropy_loss')
    from tensorflow.contrib.layers.python.layers.regularizers import apply_regularization, l2_regularizer
    # L2 penalty on the critic variables only, folded into the value loss.
    loss_l2_regularizer = apply_regularization(l2_regularizer(1e-4),
                                               self._weights_critic)
    loss_l2_regularizer = tf.identity(loss_l2_regularizer, 'loss/l2reg')
    loss_value += loss_l2_regularizer
    loss_value = tf.identity(loss_value, name='loss/value')
    # self.cost = tf.add_n([loss_policy, loss_value * 0.1, loss_l2_regularizer])
    self._addParamSummary([('.*', ['rms', 'absmax'])])
    pred_reward = tf.reduce_mean(value, name='predict_reward')
    import tensorpack.tfutils.symbolic_functions as symbf
    advantage = symbf.rms(i_advantage, name='rms_advantage')
    self._addMovingSummary(
        loss_policy,
        loss_value,
        loss_entropy,
        pred_reward,
        advantage,
        loss_l2_regularizer,
        tf.reduce_mean(policy[:, 0], name='actor/steering/mean'),
        tf.reduce_mean(policy[:, 1], name='actor/accel/mean'),
    )
    return loss_policy, loss_value
def _build_graph(self, inputs):
    """Build the clipped-surrogate (PPO-style) actor-critic graph.

    Args:
        inputs: sequence unpacked as
            ``(state, action, futurereward, advantage)``.

    Returns:
        None. Results are exposed through attributes:
        ``self.value`` / ``self.policy`` always; ``self._weights_train``,
        ``self._weights_pred``, ``self._sync_op``, ``self.pre_weights`` and
        ``self._td_sync_op`` on the first call; ``self._cost`` (a list
        ``[loss_policy, loss_value]``) only when training.
    """
    from tensorpack.tfutils.common import get_global_step_var

    state, action, futurereward, advantage = inputs
    is_training = get_current_tower_context().is_training

    policy, value, dists = self._get_NN_prediction(state)
    if not hasattr(self, '_weights_train'):
        self._weights_train = self._weights_critic + self._weights_actor
    self.value = tf.squeeze(value, [1], name='value')  # (B,)
    self.policy = tf.identity(policy, name='policy')

    # Second copy of the network under "Pred"; _sync_op overwrites it with
    # the training weights plus truncated-normal noise (stddev 0.02).
    with tf.variable_scope("Pred") as vs:
        __p, __v, _ = self._get_NN_prediction(state)
        __v = tf.squeeze(__v, [1], name='value')  # (B,)
        __p = tf.identity(__p, name='policy')
        if not hasattr(self, '_weights_pred'):
            self._weights_pred = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
            assert (len(self._weights_train) == len(self._weights_pred))
            assert (not hasattr(self, '_sync_op'))
            self._sync_op = tf.group(*[
                d.assign(s + tf.truncated_normal(tf.shape(s), stddev=0.02))
                for d, s in zip(self._weights_pred, self._weights_train)
            ])

    # Third copy under "pre": supplies the old-policy log-probabilities for
    # the ratio below; _td_sync_op copies the training weights into it.
    with tf.variable_scope('pre') as vs:
        pre_p, pre_v, pre_dists = self._get_NN_prediction(state)
        if not hasattr(self, 'pre_weights'):
            self.pre_weights = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
            self._td_sync_op = tf.group(*[
                d.assign(s)
                for d, s in zip(self.pre_weights, self._weights_train)
            ])

    if not is_training:
        return

    # --- policy loss: clipped surrogate objective --------------------------
    log_probs = dists.log_prob(action)
    pre_probs = pre_dists.log_prob(action)
    ratio = tf.exp(log_probs - pre_probs)
    clip_param = tf.train.exponential_decay(
        CLIP_PARAMETER, get_global_step_var(), 10000, 0.98,
        name='clip_param')
    # advantage gets a trailing axis so it broadcasts over the action axis.
    surr1 = ratio * tf.expand_dims(advantage, -1)
    surr2 = tf.clip_by_value(
        ratio, 1.0 - clip_param, 1.0 + clip_param) * tf.expand_dims(
            advantage, -1)
    loss_policy = -tf.reduce_mean(tf.minimum(surr1, surr2))

    # --- critic loss --------------------------------------------------------
    # Fix: use the squeezed self.value. The raw `value` still carries the
    # trailing axis removed by tf.squeeze(value, [1]) above, so subtracting
    # futurereward from it would broadcast to a (B, B) matrix.
    # NOTE(review): assumes futurereward is (B,), like advantage -- confirm.
    # The clipped critic loss (max of clipped/unclipped error) is disabled.
    v_loss1 = tf.square(self.value - futurereward)
    loss_value = 0.5 * tf.reduce_mean(v_loss1)

    # --- entropy bonus ------------------------------------------------------
    entropy = dists.entropy()
    entropy_beta = tf.get_variable(
        'entropy_beta', shape=[],
        initializer=tf.constant_initializer(0.01), trainable=False)
    exp_v = entropy_beta * entropy
    # NOTE(review): this op is named 'loss/policy' although it holds the
    # entropy term; the name is kept so existing summaries do not change.
    loss_entropy = tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1),
                                  name='loss/policy')
    loss_policy = loss_policy + loss_entropy

    # --- L2 penalty on the critic weights, folded into the value loss ------
    from tensorflow.contrib.layers.python.layers.regularizers import apply_regularization, l2_regularizer
    loss_l2_regularizer = apply_regularization(l2_regularizer(1e-4),
                                               self._weights_critic)
    loss_l2_regularizer = tf.identity(loss_l2_regularizer, 'loss/l2reg')
    loss_value += loss_l2_regularizer
    loss_value = tf.identity(loss_value, name='loss/value')

    self._cost = [loss_policy, loss_value]

    from autodrive.trainer.summary import addParamSummary
    addParamSummary([('.*', ['rms', 'absmax'])])
    pred_reward = tf.reduce_mean(self.value, name='predict_reward')
    advantage = symbf.rms(advantage, name='rms_advantage')
    summary.add_moving_summary(
        loss_policy,
        loss_value,
        loss_entropy,
        pred_reward,
        advantage,
        loss_l2_regularizer,
        tf.reduce_mean(self.policy[:, 0], name='action/steering/mean'),
        tf.reduce_mean(self.policy[:, 1], name='action/accel/mean'),
    )
def _build_graph(self, inputs):
    """Build the clipped-surrogate (PPO-style) actor-critic graph.

    Args:
        inputs: sequence unpacked as
            ``(state, action, futurereward, advantage)``.

    Returns:
        None. Results are exposed through attributes:
        ``self.value`` / ``self.policy`` always; ``self._weights_train``,
        ``self._weights_pred``, ``self._sync_op``, ``self.pre_weights`` and
        ``self._td_sync_op`` on the first call; ``self._cost`` (a list
        ``[loss_policy, loss_value]``) only when training.
    """
    from tensorpack.tfutils.common import get_global_step_var

    state, action, futurereward, advantage = inputs
    is_training = get_current_tower_context().is_training

    policy, value, dists = self._get_NN_prediction(state)
    if not hasattr(self, '_weights_train'):
        self._weights_train = self._weights_critic + self._weights_actor
    self.value = tf.squeeze(value, [1], name='value')  # (B,)
    self.policy = tf.identity(policy, name='policy')

    # Second copy of the network under "Pred"; _sync_op overwrites it with
    # the training weights plus truncated-normal noise (stddev 0.02).
    with tf.variable_scope("Pred") as vs:
        __p, __v, _ = self._get_NN_prediction(state)
        __v = tf.squeeze(__v, [1], name='value')  # (B,)
        __p = tf.identity(__p, name='policy')
        if not hasattr(self, '_weights_pred'):
            self._weights_pred = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
            assert (len(self._weights_train) == len(self._weights_pred))
            assert (not hasattr(self, '_sync_op'))
            self._sync_op = tf.group(*[
                d.assign(s + tf.truncated_normal(tf.shape(s), stddev=0.02))
                for d, s in zip(self._weights_pred, self._weights_train)
            ])

    # Third copy under "pre": supplies the old-policy log-probabilities for
    # the ratio below; _td_sync_op copies the training weights into it.
    with tf.variable_scope('pre') as vs:
        pre_p, pre_v, pre_dists = self._get_NN_prediction(state)
        if not hasattr(self, 'pre_weights'):
            self.pre_weights = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
            self._td_sync_op = tf.group(*[
                d.assign(s)
                for d, s in zip(self.pre_weights, self._weights_train)
            ])

    if not is_training:
        return

    # --- policy loss: clipped surrogate objective --------------------------
    log_probs = dists.log_prob(action)
    pre_probs = pre_dists.log_prob(action)
    ratio = tf.exp(log_probs - pre_probs)
    clip_param = tf.train.exponential_decay(
        CLIP_PARAMETER, get_global_step_var(), 10000, 0.98,
        name='clip_param')
    # advantage gets a trailing axis so it broadcasts over the action axis.
    surr1 = ratio * tf.expand_dims(advantage, -1)
    surr2 = tf.clip_by_value(
        ratio, 1.0 - clip_param, 1.0 + clip_param) * tf.expand_dims(
            advantage, -1)
    loss_policy = -tf.reduce_mean(tf.minimum(surr1, surr2))

    # --- critic loss --------------------------------------------------------
    # Fix: use the squeezed self.value. The raw `value` still carries the
    # trailing axis removed by tf.squeeze(value, [1]) above, so subtracting
    # futurereward from it would broadcast to a (B, B) matrix.
    # NOTE(review): assumes futurereward is (B,), like advantage -- confirm.
    # The clipped critic loss (max of clipped/unclipped error) is disabled.
    v_loss1 = tf.square(self.value - futurereward)
    loss_value = 0.5 * tf.reduce_mean(v_loss1)

    # --- entropy bonus ------------------------------------------------------
    entropy = dists.entropy()
    entropy_beta = tf.get_variable(
        'entropy_beta', shape=[],
        initializer=tf.constant_initializer(0.01), trainable=False)
    exp_v = entropy_beta * entropy
    # NOTE(review): this op is named 'loss/policy' although it holds the
    # entropy term; the name is kept so existing summaries do not change.
    loss_entropy = tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1),
                                  name='loss/policy')
    loss_policy = loss_policy + loss_entropy

    # --- L2 penalty on the critic weights, folded into the value loss ------
    from tensorflow.contrib.layers.python.layers.regularizers import apply_regularization, l2_regularizer
    loss_l2_regularizer = apply_regularization(l2_regularizer(1e-4),
                                               self._weights_critic)
    loss_l2_regularizer = tf.identity(loss_l2_regularizer, 'loss/l2reg')
    loss_value += loss_l2_regularizer
    loss_value = tf.identity(loss_value, name='loss/value')

    self._cost = [loss_policy, loss_value]

    from autodrive.trainer.summary import addParamSummary
    addParamSummary([('.*', ['rms', 'absmax'])])
    pred_reward = tf.reduce_mean(self.value, name='predict_reward')
    advantage = symbf.rms(advantage, name='rms_advantage')
    summary.add_moving_summary(
        loss_policy,
        loss_value,
        loss_entropy,
        pred_reward,
        advantage,
        loss_l2_regularizer,
        tf.reduce_mean(self.policy[:, 0], name='action/steering/mean'),
        tf.reduce_mean(self.policy[:, 1], name='action/accel/mean'),
    )