Example #1
 def test_apply_regularization_invalid_regularizer(self):
   non_scalar_regularizer = lambda x: array_ops.tile(x, [2])
   tensor_weights_list = [
       constant_op.constant(x) for x in [[1.5], [2, 3, 4.2], [10, 42, 666.6]]
   ]
   with self.cached_session():
     with self.assertRaises(ValueError):
       regularizers.apply_regularization(non_scalar_regularizer,
                                         tensor_weights_list)
Example #2
 def test_apply_regularization_invalid_regularizer(self):
     non_scalar_regularizer = lambda x: array_ops.tile(x, [2])
     tensor_weights_list = [
         constant_op.constant(x)
         for x in [[1.5], [2, 3, 4.2], [10, 42, 666.6]]
     ]
     with self.test_session():
         with self.assertRaises(ValueError):
             regularizers.apply_regularization(non_scalar_regularizer,
                                               tensor_weights_list)
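Both versions of the test only assert that a ValueError is raised. As a minimal standalone sketch of the same contract (my own illustration, using the public tf.contrib.layers import path and TF 1.x): each regularizer(weight) call must return a scalar Tensor, and tf.tile returns a rank-1 Tensor, so apply_regularization rejects it at graph-construction time.

import tensorflow as tf
from tensorflow.contrib.layers import apply_regularization

scalar_reg = lambda w: tf.reduce_sum(tf.abs(w))   # scalar penalty per weight -> accepted
non_scalar_reg = lambda w: tf.tile(w, [2])        # rank-1 output -> rejected

weights = [tf.constant([1.5]), tf.constant([2.0, 3.0, 4.2])]
penalty = apply_regularization(scalar_reg, weights)  # scalar Tensor summing the per-weight penalties
try:
    apply_regularization(non_scalar_reg, weights)
except ValueError as err:
    print('non-scalar regularizer rejected:', err)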
Example #3
def neural_attention(embedding_dim=384, encoding_dim=128):
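    # assumes names defined elsewhere in this project: X, Q, keep_prob,
    # batch_size, vocab_size and a glimpse() helper, plus `import tensorflow as tf`
    # and the contrib `regularizers` module; note that the penalty returned by
    # apply_regularization below is not added to any loss within this excerpt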
    embeddings = tf.Variable(tf.random_normal([vocab_size, embedding_dim], stddev=0.22), dtype=tf.float32)
    regularizers.apply_regularization(regularizers.l2_regularizer(1e-4), [embeddings])

    with tf.variable_scope('encode'):
        with tf.variable_scope('X'):
            X_lens = tf.reduce_sum(tf.sign(tf.abs(X)), 1)
            embedded_X = tf.nn.embedding_lookup(embeddings, X)
            encoded_X = tf.nn.dropout(embedded_X, keep_prob)
            gru_cell = tf.nn.rnn_cell.GRUCell(embedding_dim)
            outputs, output_states = tf.nn.bidirectional_dynamic_rnn(gru_cell, gru_cell, embedded_X, sequence_length=X_lens, dtype=tf.float32, swap_memory=True)
            encoded_X = tf.concat(outputs, 2)
        with tf.variable_scope('Q'):
            Q_lens = tf.reduce_sum(tf.sign(tf.abs(Q)), 1)
            embedded_Q = tf.nn.embedding_lookup(embeddings, Q)
            encoded_Q = tf.nn.dropout(embedded_Q, keep_prob)
            gru_cell = tf.nn.rnn_cell.GRUCell(encoding_dim)
            outputs, output_states = tf.nn.bidirectional_dynamic_rnn(gru_cell, gru_cell, encoded_Q,
                                                                     sequence_length=Q_lens, dtype=tf.float32,
                                                                     swap_memory=True)
            encoded_Q = tf.concat(outputs, 2)

    W_q = tf.Variable(tf.random_normal([2 * encoding_dim, 4 * encoding_dim], stddev=0.22), dtype=tf.float32)
    b_q = tf.Variable(tf.random_normal([2 * encoding_dim, 1], stddev=0.22), dtype=tf.float32)
    W_d = tf.Variable(tf.random_normal([2 * encoding_dim, 6 * encoding_dim], stddev=0.22), dtype=tf.float32)
    b_d = tf.Variable(tf.random_normal([2 * encoding_dim, 1], stddev=0.22), dtype=tf.float32)
    g_q = tf.Variable(tf.random_normal([10 * encoding_dim, 2 * encoding_dim], stddev=0.22), dtype=tf.float32)
    g_d = tf.Variable(tf.random_normal([10 * encoding_dim, 2 * encoding_dim], stddev=0.22), dtype=tf.float32)

    with tf.variable_scope('attend') as scope:
        infer_gru = tf.nn.rnn_cell.GRUCell(4*encoding_dim)
        infer_state = infer_gru.zero_state(batch_size, tf.float32)
        for iter_step in range(8):
            if iter_step > 0:
                scope.reuse_variables()

            _, q_glimpse = glimpse(W_q, b_q, encoded_Q, infer_state)
            d_attention, d_glimpse = glimpse(W_d, b_d, encoded_X, tf.concat([infer_state, q_glimpse], 1))

            gate_concat = tf.concat([infer_state, q_glimpse, d_glimpse, q_glimpse * d_glimpse], 1)

            r_d = tf.sigmoid(tf.matmul(gate_concat, g_d))
            r_d = tf.nn.dropout(r_d, keep_prob)
            r_q = tf.sigmoid(tf.matmul(gate_concat, g_q))
            r_q = tf.nn.dropout(r_q, keep_prob)

            combined_gated_glimpse = tf.concat([r_q * q_glimpse, r_d * d_glimpse], 1)
            _, infer_state = infer_gru(combined_gated_glimpse, infer_state)

    return tf.to_float(tf.sign(tf.abs(X))) * d_attention
Example #4
File: vae.py Project: rn5l/rsc18
    def build_graph(self):
        self._construct_weights()

        saver, logits, KL = self.forward_pass()
        log_softmax_var = tf.nn.log_softmax(logits)

        neg_ll = -tf.reduce_mean(
            tf.reduce_sum(log_softmax_var * self.input_ph, axis=-1))
        # apply regularization to weights
        reg = l2_regularizer(self.lam)

        reg_var = apply_regularization(reg, self.weights_q + self.weights_p)
        # TensorFlow's l2 regularization multiplies the l2 norm by 0.5,
        # so multiply by 2 to bring it back to the same scale
        neg_ELBO = neg_ll + self.anneal_ph * KL + 2 * reg_var

        train_op = tf.train.AdamOptimizer(self.lr).minimize(neg_ELBO)

        # add summary statistics
        tf.summary.scalar('negative_multi_ll', neg_ll)
        tf.summary.scalar('KL', KL)
        tf.summary.scalar('neg_ELBO_train', neg_ELBO)
        merged = tf.summary.merge_all()

        return saver, logits, neg_ELBO, train_op, merged
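A quick numeric check of the scale comment above (my own sketch, not part of vae.py, TF 1.x): l2_regularizer(lam) computes lam * 0.5 * sum(w ** 2) for each weight, which is why the graph multiplies reg_var by 2 to recover lam * sum(w ** 2).

import numpy as np
import tensorflow as tf
from tensorflow.contrib.layers import apply_regularization, l2_regularizer

lam = 0.01
w = tf.constant([1.0, 2.0, 3.0])
reg_var = apply_regularization(l2_regularizer(lam), [w])
with tf.Session() as sess:
    # l2_regularizer gives lam * 0.5 * (1 + 4 + 9) = 0.07; doubling restores lam * 14 = 0.14
    np.testing.assert_allclose(sess.run(2 * reg_var), lam * 14.0, rtol=1e-6)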
Example #5
 def test_apply_zero_regularization(self):
   regularizer = regularizers.l2_regularizer(0.0)
   array_weights_list = [[1.5], [2, 3, 4.2], [10, 42, 666.6]]
   tensor_weights_list = [constant_op.constant(x) for x in array_weights_list]
   with self.cached_session():
     result = regularizers.apply_regularization(regularizer,
                                                tensor_weights_list)
     self.assertAllClose(0.0, result.eval())
Example #6
 def test_apply_regularization(self):
   dummy_regularizer = lambda x: math_ops.reduce_sum(2 * x)
   array_weights_list = [[1.5], [2, 3, 4.2], [10, 42, 666.6]]
   tensor_weights_list = [constant_op.constant(x) for x in array_weights_list]
   expected = sum([2 * x for l in array_weights_list for x in l])
   with self.cached_session():
     result = regularizers.apply_regularization(dummy_regularizer,
                                                tensor_weights_list)
     self.assertAllClose(expected, result.eval())
Example #7
 def test_apply_zero_regularization(self):
     regularizer = regularizers.l2_regularizer(0.0)
     array_weights_list = [[1.5], [2, 3, 4.2], [10, 42, 666.6]]
     tensor_weights_list = [
         constant_op.constant(x) for x in array_weights_list
     ]
     with self.test_session():
         result = regularizers.apply_regularization(regularizer,
                                                    tensor_weights_list)
         self.assertAllClose(0.0, result.eval())
Example #8
 def test_apply_regularization(self):
     dummy_regularizer = lambda x: math_ops.reduce_sum(2 * x)
     array_weights_list = [[1.5], [2, 3, 4.2], [10, 42, 666.6]]
     tensor_weights_list = [
         constant_op.constant(x) for x in array_weights_list
     ]
     expected = sum([2 * x for l in array_weights_list for x in l])
     with self.test_session():
         result = regularizers.apply_regularization(dummy_regularizer,
                                                    tensor_weights_list)
         self.assertAllClose(expected, result.eval())
Example #9
File: dae.py Project: rn5l/rsc18
    def build_graph(self):

        self.construct_weights()

        saver, logits = self.forward_pass()
        log_softmax_var = tf.nn.log_softmax(logits)

        # per-user average negative log-likelihood
        neg_ll = -tf.reduce_mean(
            tf.reduce_sum(log_softmax_var * self.input_ph, axis=1))
        # apply regularization to weights
        reg = l2_regularizer(self.lam)
        reg_var = apply_regularization(reg, self.weights)
        # TensorFlow's l2 regularization multiplies the l2 norm by 0.5,
        # so multiply by 2 to bring it back to the same scale
        loss = neg_ll + 2 * reg_var

        train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)

        # add summary statistics
        tf.summary.scalar('negative_multi_ll', neg_ll)
        tf.summary.scalar('loss', loss)
        merged = tf.summary.merge_all()
        return saver, logits, loss, train_op, merged
Example #10
    def _build_ad_nn(self, tensor_io):
        from drlutils.dataflow.tensor_io import TensorIO
        assert (isinstance(tensor_io, TensorIO))
        from drlutils.model.base import get_current_nn_context
        from tensorpack.tfutils.common import get_global_step_var
        global_step = get_global_step_var()
        nnc = get_current_nn_context()
        is_training = nnc.is_training
        i_state = tensor_io.getInputTensor('state')
        i_agentIdent = tensor_io.getInputTensor('agentIdent')
        i_sequenceLength = tensor_io.getInputTensor('sequenceLength')
        i_resetRNN = tensor_io.getInputTensor('resetRNN')
        l = i_state
        # l = tf.Print(l, [i_state, tf.shape(i_state)], 'State = ')
        # l = tf.Print(l, [i_agentIdent, tf.shape(i_agentIdent)], 'agentIdent = ')
        # l = tf.Print(l, [i_sequenceLength, tf.shape(i_sequenceLength)], 'SeqLen = ')
        # l = tf.Print(l, [i_resetRNN, tf.shape(i_resetRNN)], 'resetRNN = ')
        with tf.variable_scope('critic', reuse=nnc.reuse) as vs:

            def _get_cell():
                cell = tf.nn.rnn_cell.BasicLSTMCell(256)
                # if is_training:
                #     cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=0.9)
                return cell

            cell = tf.nn.rnn_cell.MultiRNNCell([_get_cell() for _ in range(1)])
            rnn_outputs = self._buildRNN(
                l,
                cell,
                tensor_io.batchSize,
                i_agentIdent=i_agentIdent,
                i_sequenceLength=i_sequenceLength,
                i_resetRNN=i_resetRNN,
            )
            rnn_outputs = tf.reshape(
                rnn_outputs, [-1, rnn_outputs.get_shape().as_list()[-1]])
            l = rnn_outputs
            from ad_cur.autodrive.model.selu import fc_selu
            for lidx in range(2):
                l = fc_selu(
                    l,
                    200,
                    keep_prob=1.,  # we train only on sensor inputs, so key information must not be dropped
                    is_training=is_training,
                    name='fc-{}'.format(lidx))
            value = tf.layers.dense(l, 1, name='fc-value')
            value = tf.squeeze(value, [1], name="value")
            if not hasattr(self, '_weights_critic'):
                self._weights_critic = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

        with tf.variable_scope('actor', reuse=nnc.reuse) as vs:
            l = tf.stop_gradient(l)
            l = tf.layers.dense(l,
                                128,
                                activation=tf.nn.relu6,
                                name='fc-actor')
            mu_steering = 0.5 * tf.layers.dense(
                l, 1, activation=tf.nn.tanh, name='fc-mu-steering')
            mu_accel = tf.layers.dense(l,
                                       1,
                                       activation=tf.nn.tanh,
                                       name='fc-mu-accel')
            mus = tf.concat([mu_steering, mu_accel], axis=-1)

            # mus = tf.layers.dense(l, 2, activation=tf.nn.tanh, name='fc-mus')
            # sigmas = tf.layers.dense(l, 2, activation=tf.nn.softplus, name='fc-sigmas')
            # sigmas = tf.clip_by_value(sigmas, -0.001, 0.5)
            def saturating_sigmoid(x):
                """Saturating sigmoid: 1.2 * sigmoid(x) - 0.1 cut to [0, 1]."""
                with tf.name_scope("saturating_sigmoid", [x]):
                    y = tf.sigmoid(x)
                    return tf.minimum(1.0, tf.maximum(0.0, 1.2 * y - 0.1))

            sigma_steering_ = 0.1 * tf.layers.dense(
                l, 1, activation=tf.nn.sigmoid, name='fc-sigma-steering')
            sigma_accel_ = 0.25 * tf.layers.dense(
                l, 1, activation=tf.nn.sigmoid, name='fc-sigma-accel')

            if not nnc.is_evaluating:
                sigma_beta_steering = tf.get_default_graph(
                ).get_tensor_by_name('actor/sigma_beta_steering:0')
                sigma_beta_accel = tf.get_default_graph().get_tensor_by_name(
                    'actor/sigma_beta_accel:0')
                sigma_beta_steering = tf.constant(1e-4)
                # sigma_beta_steering_exp = tf.train.exponential_decay(0.3, global_step, 1000, 0.5, name='sigma/beta/steering/exp')
                # sigma_beta_accel_exp = tf.train.exponential_decay(0.5, global_step, 5000, 0.5, name='sigma/beta/accel/exp')
            else:
                sigma_beta_steering = tf.constant(1e-4)
                sigma_beta_accel = tf.constant(1e-4)
            sigma_steering = (sigma_steering_ + sigma_beta_steering)
            sigma_accel = (sigma_accel_ + sigma_beta_accel)

            sigmas = tf.concat([sigma_steering, sigma_accel], axis=-1)
            # if is_training:
            #     pass
            #     # without sigma_beta, convergence is very slow and unstable; the likely reasons:
            #     #   1. exploring as widely as possible early in training keeps the network out of local optima
            #     #   2. a sigma that is too small early on makes normal_dist's log_prob too large, so gradient
            #     #      updates blow up and the network is deformed from the start, which is hard to recover from
            #
            # if is_training:
            #     sigmas += sigma_beta_steering
            # sigma_steering = tf.clip_by_value(sigma_steering, sigma_beta_steering, 0.5)
            # sigma_accel = tf.clip_by_value(sigma_accel, sigma_beta_accel, 0.5)
            # sigmas = tf.clip_by_value(sigmas, 0.1, 0.5)
            # sigmas_orig = sigmas
            # sigmas = sigmas + sigma_beta_steering
            # sigmas = tf.minimum(sigmas + 0.1, 100)
            # sigmas = tf.clip_by_value(sigmas, sigma_beta_steering, 1)
            # sigma_steering += sigma_beta_steering
            # sigma_accel += sigma_beta_accel

            # mus = tf.concat([mu_steering, mu_accel], axis=-1)

            from tensorflow.contrib.distributions import Normal
            dists = Normal(mus, sigmas + 0.01)
            policy = tf.squeeze(dists.sample([1]), [0])
            # clip to within two standard deviations
            policy = tf.clip_by_value(policy, mus - 2 * sigmas,
                                      mus + 2 * sigmas)
            if is_training:
                self._addMovingSummary(
                    tf.reduce_mean(mu_steering, name='mu/steering/mean'),
                    tf.reduce_mean(mu_accel, name='mu/accel/mean'),
                    tf.reduce_mean(sigma_steering, name='sigma/steering/mean'),
                    tf.reduce_max(sigma_steering, name='sigma/steering/max'),
                    tf.reduce_mean(sigma_accel, name='sigma/accel/mean'),
                    tf.reduce_max(sigma_accel, name='sigma/accel/max'),
                    # sigma_beta_accel,
                    # sigma_beta_steering,
                )
            # actions = tf.Print(actions, [mus, sigmas, tf.concat([sigma_steering_, sigma_accel_], -1), actions],
            #                    'mu/sigma/sigma.orig/act=', summarize=4)
            if not hasattr(self, '_weights_actor'):
                self._weights_actor = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
        if not is_training:
            tensor_io.setOutputTensors(policy, value, mus, sigmas)
            return

        i_actions = tensor_io.getInputTensor("action")
        # i_actions = tf.Print(i_actions, [i_actions], 'actions = ')
        i_actions = tf.reshape(i_actions,
                               [-1] + i_actions.get_shape().as_list()[2:])
        log_probs = dists.log_prob(i_actions)
        # exp_v = tf.transpose(
        #     tf.multiply(tf.transpose(log_probs), advantage))
        # exp_v = tf.multiply(log_probs, advantage)
        i_advantage = tensor_io.getInputTensor("advantage")
        i_advantage = tf.reshape(i_advantage,
                                 [-1] + i_advantage.get_shape().as_list()[2:])
        exp_v = log_probs * tf.expand_dims(i_advantage, -1)
        entropy = dists.entropy()
        entropy_beta = tf.get_variable(
            'entropy_beta',
            shape=[],
            initializer=tf.constant_initializer(0.01),
            trainable=False)
        exp_v = entropy_beta * entropy + exp_v
        loss_policy = tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1),
                                     name='loss/policy')

        i_futurereward = tensor_io.getInputTensor("futurereward")
        i_futurereward = tf.reshape(i_futurereward, [-1] +
                                    i_futurereward.get_shape().as_list()[2:])
        loss_value = tf.reduce_mean(0.5 * tf.square(value - i_futurereward))

        loss_entropy = tf.reduce_mean(tf.reduce_sum(entropy, axis=-1),
                                      name='xentropy_loss')

        from tensorflow.contrib.layers.python.layers.regularizers import apply_regularization, l2_regularizer
        loss_l2_regularizer = apply_regularization(l2_regularizer(1e-4),
                                                   self._weights_critic)
        loss_l2_regularizer = tf.identity(loss_l2_regularizer, 'loss/l2reg')
        loss_value += loss_l2_regularizer
        loss_value = tf.identity(loss_value, name='loss/value')

        # self.cost = tf.add_n([loss_policy, loss_value * 0.1, loss_l2_regularizer])

        self._addParamSummary([('.*', ['rms', 'absmax'])])
        pred_reward = tf.reduce_mean(value, name='predict_reward')
        import tensorpack.tfutils.symbolic_functions as symbf
        advantage = symbf.rms(i_advantage, name='rms_advantage')
        self._addMovingSummary(
            loss_policy,
            loss_value,
            loss_entropy,
            pred_reward,
            advantage,
            loss_l2_regularizer,
            tf.reduce_mean(policy[:, 0], name='actor/steering/mean'),
            tf.reduce_mean(policy[:, 1], name='actor/accel/mean'),
        )
        return loss_policy, loss_value
Example #11
    def _build_graph(self, inputs):
        from tensorpack.tfutils.common import get_global_step_var
        state, action, futurereward, advantage = inputs
        is_training = get_current_tower_context().is_training
        policy, value, dists = self._get_NN_prediction(state)
        if not hasattr(self, '_weights_train'):
            self._weights_train = self._weights_critic + self._weights_actor
        self.value = tf.squeeze(value, [1], name='value')  # (B,)
        self.policy = tf.identity(policy, name='policy')

        with tf.variable_scope("Pred") as vs:
            __p, __v, _ = self._get_NN_prediction(state)
            __v = tf.squeeze(__v, [1], name='value')  # (B,)
            __p = tf.identity(__p, name='policy')
            if not hasattr(self, '_weights_pred'):
                self._weights_pred = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
                assert (len(self._weights_train) == len(self._weights_pred))
                assert (not hasattr(self, '_sync_op'))
                self._sync_op = tf.group(*[d.assign(s + tf.truncated_normal(tf.shape(s), stddev=0.02)) for d, s in zip(self._weights_pred, self._weights_train)])

        with tf.variable_scope('pre') as vs:
            pre_p,pre_v,pre_dists=self._get_NN_prediction(state)
            if not hasattr(self,'pre_weights'):
                self.pre_weights=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=vs.name)
                self._td_sync_op = tf.group(*[d.assign(s) for d, s in zip(self.pre_weights, self._weights_train)])


        if not is_training:
            return

        # advantage = tf.subtract(tf.stop_gradient(self.value), futurereward, name='advantage')
        # advantage = tf.Print(advantage, [self.value, futurereward, action, advantage], 'value/reward/act/advantage=', summarize=4)
        log_probs = dists.log_prob(action)
        # add PPO policy clip loss
        # add ratio, surr1, surr2
        pre_probs=pre_dists.log_prob(action)
        ratio=tf.exp(log_probs-pre_probs)
        prob_ratio = tf.reduce_mean(input_tensor=tf.concat(values=ratio, axis=1), axis=1)
        clip_param=tf.train.exponential_decay(CLIP_PARAMETER, get_global_step_var(), 10000, 0.98, name='clip_param')


        # surr1=prob_ratio*advantage
        surr1=ratio*tf.expand_dims(advantage, -1)
        surr2=tf.clip_by_value(ratio,1.0-clip_param,1.0+clip_param)*tf.expand_dims(advantage, -1)
        
        # surr2=tf.clip_by_value(prob_ratio,1.0-clip_param,1.0+clip_param)*advantage

        loss_policy=-tf.reduce_mean(tf.minimum(surr1,surr2))

        # add critic clip loss
        v_loss1=tf.square(value-futurereward)
        pre_value=pre_v+tf.clip_by_value(value-pre_v,-clip_param,clip_param)
        v_loss2=tf.square(pre_v-futurereward)
        # loss_value=0.5*tf.reduce_mean(tf.maximum(v_loss1,v_loss2))
        loss_value=0.5*tf.reduce_mean(v_loss1)
        

        entropy = dists.entropy()
        entropy_beta = tf.get_variable('entropy_beta', shape=[],
                                       initializer=tf.constant_initializer(0.01), trainable=False)
        exp_v = entropy_beta * entropy
        loss_entropy = tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1), name='loss/policy')
        loss_policy=loss_policy+loss_entropy
        

        # exp_v = tf.transpose(
        #     tf.multiply(tf.transpose(log_probs), advantage))
        # exp_v = tf.multiply(log_probs, advantage)
        # exp_v = log_probs * tf.expand_dims(advantage, -1)
        # entropy = dists.entropy()
        # entropy_beta = tf.get_variable('entropy_beta', shape=[],
        #                                initializer=tf.constant_initializer(0.01), trainable=False)
        # exp_v = entropy_beta * entropy + exp_v
        
        # loss_value = tf.reduce_mean(0.5 * tf.square(self.value - futurereward))

        # loss_entropy = tf.reduce_mean(tf.reduce_sum(entropy, axis=-1), name='xentropy_loss')


        from tensorflow.contrib.layers.python.layers.regularizers import apply_regularization, l2_regularizer
        loss_l2_regularizer = apply_regularization(l2_regularizer(1e-4), self._weights_critic)
        loss_l2_regularizer = tf.identity(loss_l2_regularizer, 'loss/l2reg')
        loss_value += loss_l2_regularizer
        loss_value = tf.identity(loss_value, name='loss/value')

        # self.cost = tf.add_n([loss_policy, loss_value * 0.1, loss_l2_regularizer])
        self._cost = [loss_policy,
                      loss_value
                      ]
        from autodrive.trainer.summary import addParamSummary
        addParamSummary([('.*', ['rms', 'absmax'])])
        pred_reward = tf.reduce_mean(self.value, name='predict_reward')
        advantage = symbf.rms(advantage, name='rms_advantage')
        summary.add_moving_summary(loss_policy, loss_value,
                                   loss_entropy,
                                   pred_reward, advantage,
                                   loss_l2_regularizer,
                                   tf.reduce_mean(self.policy[:, 0], name='action/steering/mean'),
                                   tf.reduce_mean(self.policy[:, 1], name='action/accel/mean'),
                                    )
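The ratio / surr1 / surr2 construction above is the standard PPO clipped surrogate. A tiny NumPy sketch of just that piece (my own illustration, with a fixed eps in place of the decayed clip_param):

import numpy as np

def ppo_clip_loss(log_probs, old_log_probs, advantage, eps=0.2):
    ratio = np.exp(log_probs - old_log_probs)               # pi_new / pi_old per action
    surr1 = ratio * advantage
    surr2 = np.clip(ratio, 1.0 - eps, 1.0 + eps) * advantage
    return -np.mean(np.minimum(surr1, surr2))

# a ratio of 2 with a positive advantage is clipped to 1.2, so the loss is -1.2
print(ppo_clip_loss(np.log([2.0]), np.log([1.0]), np.array([1.0])))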
Example #12
    def _build_graph(self, inputs):
        from tensorpack.tfutils.common import get_global_step_var
        state, action, futurereward, advantage = inputs
        is_training = get_current_tower_context().is_training
        policy, value, dists = self._get_NN_prediction(state)
        if not hasattr(self, '_weights_train'):
            self._weights_train = self._weights_critic + self._weights_actor
        self.value = tf.squeeze(value, [1], name='value')  # (B,)
        self.policy = tf.identity(policy, name='policy')

        with tf.variable_scope("Pred") as vs:
            __p, __v, _ = self._get_NN_prediction(state)
            __v = tf.squeeze(__v, [1], name='value')  # (B,)
            __p = tf.identity(__p, name='policy')
            if not hasattr(self, '_weights_pred'):
                self._weights_pred = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
                assert (len(self._weights_train) == len(self._weights_pred))
                assert (not hasattr(self, '_sync_op'))
                self._sync_op = tf.group(*[
                    d.assign(s + tf.truncated_normal(tf.shape(s), stddev=0.02))
                    for d, s in zip(self._weights_pred, self._weights_train)
                ])

        with tf.variable_scope('pre') as vs:
            pre_p, pre_v, pre_dists = self._get_NN_prediction(state)
            if not hasattr(self, 'pre_weights'):
                self.pre_weights = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
                self._td_sync_op = tf.group(*[
                    d.assign(s)
                    for d, s in zip(self.pre_weights, self._weights_train)
                ])

        if not is_training:
            return

        # advantage = tf.subtract(tf.stop_gradient(self.value), futurereward, name='advantage')
        # advantage = tf.Print(advantage, [self.value, futurereward, action, advantage], 'value/reward/act/advantage=', summarize=4)
        log_probs = dists.log_prob(action)
        # add PPO policy clip loss
        # add ratio, surr1, surr2
        pre_probs = pre_dists.log_prob(action)
        ratio = tf.exp(log_probs - pre_probs)
        prob_ratio = tf.reduce_mean(input_tensor=tf.concat(values=ratio,
                                                           axis=1),
                                    axis=1)
        clip_param = tf.train.exponential_decay(CLIP_PARAMETER,
                                                get_global_step_var(),
                                                10000,
                                                0.98,
                                                name='clip_param')

        # surr1=prob_ratio*advantage
        surr1 = ratio * tf.expand_dims(advantage, -1)
        surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 +
                                 clip_param) * tf.expand_dims(advantage, -1)

        # surr2=tf.clip_by_value(prob_ratio,1.0-clip_param,1.0+clip_param)*advantage

        loss_policy = -tf.reduce_mean(tf.minimum(surr1, surr2))

        # add critic clip loss
        v_loss1 = tf.square(value - futurereward)
        pre_value = pre_v + tf.clip_by_value(value - pre_v, -clip_param,
                                             clip_param)
        v_loss2 = tf.square(pre_v - futurereward)
        # loss_value=0.5*tf.reduce_mean(tf.maximum(v_loss1,v_loss2))
        loss_value = 0.5 * tf.reduce_mean(v_loss1)

        entropy = dists.entropy()
        entropy_beta = tf.get_variable(
            'entropy_beta',
            shape=[],
            initializer=tf.constant_initializer(0.01),
            trainable=False)
        exp_v = entropy_beta * entropy
        loss_entropy = tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1),
                                      name='loss/policy')
        loss_policy = loss_policy + loss_entropy

        # exp_v = tf.transpose(
        #     tf.multiply(tf.transpose(log_probs), advantage))
        # exp_v = tf.multiply(log_probs, advantage)
        # exp_v = log_probs * tf.expand_dims(advantage, -1)
        # entropy = dists.entropy()
        # entropy_beta = tf.get_variable('entropy_beta', shape=[],
        #                                initializer=tf.constant_initializer(0.01), trainable=False)
        # exp_v = entropy_beta * entropy + exp_v

        # loss_value = tf.reduce_mean(0.5 * tf.square(self.value - futurereward))

        # loss_entropy = tf.reduce_mean(tf.reduce_sum(entropy, axis=-1), name='xentropy_loss')

        from tensorflow.contrib.layers.python.layers.regularizers import apply_regularization, l2_regularizer
        loss_l2_regularizer = apply_regularization(l2_regularizer(1e-4),
                                                   self._weights_critic)
        loss_l2_regularizer = tf.identity(loss_l2_regularizer, 'loss/l2reg')
        loss_value += loss_l2_regularizer
        loss_value = tf.identity(loss_value, name='loss/value')

        # self.cost = tf.add_n([loss_policy, loss_value * 0.1, loss_l2_regularizer])
        self._cost = [loss_policy, loss_value]
        from autodrive.trainer.summary import addParamSummary
        addParamSummary([('.*', ['rms', 'absmax'])])
        pred_reward = tf.reduce_mean(self.value, name='predict_reward')
        advantage = symbf.rms(advantage, name='rms_advantage')
        summary.add_moving_summary(
            loss_policy,
            loss_value,
            loss_entropy,
            pred_reward,
            advantage,
            loss_l2_regularizer,
            tf.reduce_mean(self.policy[:, 0], name='action/steering/mean'),
            tf.reduce_mean(self.policy[:, 1], name='action/accel/mean'),
        )