Example #1
class Agent():
    def __init__(self,
                 sess,
                 n_features,
                 config,
                 dic_traffic_env_conf,
                 demo=None,
                 lr=0.01):
        self.sess = sess
        self.config = config
        self._activation_fn = tf.nn.leaky_relu
        self.dic_traffic_env_conf = dic_traffic_env_conf

        # replay buffer
        self.replay_memory = Memory(capacity=self.config.replay_buffer_size,
                                    permanent_data=len(demo))
        # self.replay_memory = None
        self.demo_memory = Memory(capacity=self.config.demo_buffer_size,
                                  permanent_data=self.config.demo_buffer_size)
        self.add_demo_to_memory(demo_transitions=demo)
        self.state_dim = 16
        self.action_dim = 8

        self.s = tf.placeholder(tf.float32, [None, n_features], "state")
        self.v_ = tf.placeholder(tf.float32, [None, 1], "v_next")
        self.q_a_ = tf.placeholder(tf.float32, [None, 1], "q_next")
        self.r = tf.placeholder(tf.float32, [None, 1], 'r')
        self.a = tf.placeholder(tf.int32, [None, 1], 'act')
        self.act_probs = tf.placeholder(tf.float32, [None, 8], 'act_probs')

        self.action_batch = tf.placeholder(tf.int32, [None])
        self.y_input = tf.placeholder(tf.float32, [None, self.action_dim])
        self.ISWeights = tf.placeholder(tf.float32, [None, 1])
        self.n_step_y_input = tf.placeholder(
            tf.float32, [None, self.action_dim])  # for n-step reward
        self.isdemo = tf.placeholder(tf.float32, [None])

        self.td = tf.placeholder(tf.float32, [None, 1], "td_error")  # TD_error
        self.expert_action = tf.placeholder(tf.float32, [None, 8],
                                            "expert_action")

        self.hidden = self.construct_forward(self.s,
                                             True,
                                             'None',
                                             True,
                                             "hidden",
                                             prefix='fc')

        with tf.variable_scope('Q-Value'):
            self.q = tf.layers.dense(
                inputs=self.hidden,
                units=8,  # number of hidden units
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0.,
                                                                .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='Q')

        with tf.variable_scope('Q-Target'):
            self.q_target = tf.layers.dense(
                inputs=self.hidden,
                units=8,  # number of hidden units
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0.,
                                                                .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='Q-Target')

        with tf.variable_scope('Actor'):
            self.probs = tf.layers.dense(
                inputs=self.hidden,
                units=8,  # output units
                activation=tf.nn.softmax,  # get action probabilities
                kernel_initializer=tf.random_normal_initializer(0.,
                                                                .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='acts_prob')

        # self.v = self.build_net("Value")
        # self.q = self.construct_forward(self.s, True, 'None', True, "Q-Value", prefix='fc')
        # self.q_target = self.construct_forward(self.s, True, 'None', True, "Q-Target", prefix='fc')
        # self.q = self.build_q_net("Q-Value")
        # self.q_target = self.build_q_net("Q-Target")
        self.q_a = tf.batch_gather(self.q, self.a)
        self.v = tf.reduce_sum(self.q * self.act_probs, axis=1, keepdims=True)
        # self.v_target = self.build_net("Target")
        self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='Q-Target')
        self.params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope='Q-Value')
        self.replace_target_op = [
            tf.assign(t, p) for t, p in zip(self.t_params, self.params)
        ]

        # Touch the lazy properties so their TensorFlow ops are built now.
        self.loss
        self.optimize
        self.update_target_net
        self.abs_errors
        self.time_step = 0

        with tf.variable_scope('squared_TD_error'):
            # self.td_error = self.r + 0.8 * self.v_ - self.v
            self.td_error = self.q_a - self.v
            q_loss = tf.reduce_mean(
                tf.squared_difference(self.q_a, self.r + 0.8 * self.q_a_))
            # v_loss = tf.reduce_mean(tf.squared_difference(self.v, self.r + 0.8 * self.v_))
            # self.loss = tf.square(self.td_error)  # TD_error = (r+gamma*V_next) - V_eval
            # self.loss = q_loss + v_loss
            self.critic_loss = q_loss
        with tf.variable_scope('train-c'):
            self.train_op_critic = tf.train.AdamOptimizer(lr).minimize(
                self.critic_loss)

        with tf.variable_scope('exp_v'):
            # log_prob = tf.log(self.acts_prob[0, self.a])
            log_prob = tf.log(tf.batch_gather(self.probs, self.a))
            self.exp_v = tf.reduce_mean(
                log_prob * self.td)  # advantage (TD_error) guided loss

        self.action = gumbel_softmax(logits=self.probs,
                                     temperature=1,
                                     hard=False)

        with tf.variable_scope('train-a'):
            self.train_op_actor = tf.train.AdamOptimizer(lr).minimize(
                -self.exp_v)  # minimize(-exp_v) = maximize(exp_v)

        self.pretrain_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.action, labels=self.expert_action)
        self.pretrain_op = tf.train.AdamOptimizer(lr).minimize(
            self.pretrain_loss)

    def add_demo_to_memory(self, demo_transitions):
        # add demo data to both demo_memory & replay_memory
        for t in demo_transitions:
            self.demo_memory.store(np.array(t, dtype=object))
            self.replay_memory.store(np.array(t, dtype=object))
            assert len(t) == 10

    # use the expert-demo-data to pretrain
    def pre_train(self):
        print('Pre-training ...')
        for i in range(self.config.PRETRAIN_STEPS):
            self.train_Q_network(pre_train=True)
            if i % 200 == 0 and i > 0:
                print('{}th pre-train step finished ...'.format(i))
        self.time_step = 0
        print('Pre-training finished.')

    @lazy_property
    def abs_errors(self):
        return tf.reduce_sum(tf.abs(self.y_input - self.q),
                             axis=1)  # only use 1-step R to compute abs_errors

    @lazy_property
    def optimize(self):
        optimizer = tf.train.AdamOptimizer(self.config.LEARNING_RATE)
        return optimizer.minimize(
            self.loss)  # only the parameters reached by self.loss are optimized here

    @lazy_property
    def update_target_net(self):
        select_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='Q-Value')
        eval_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope='Q-Target')
        return [tf.assign(e, s) for e, s in zip(eval_params, select_params)]

    def learn_critic(self, s, r, s_, a, next_a, probs):
        s, s_, r = s[np.newaxis, :], s_[np.newaxis, :], r[np.newaxis, :]
        a, next_a = a[np.newaxis, :], next_a[np.newaxis, :]

        # v_ = self.sess.run(self.v, {self.s: s_})
        q_a_ = self.sess.run(self.q_a, {self.s: s_, self.a: next_a})
        td_error, _ = self.sess.run(
            [self.td_error, self.train_op_critic], {
                self.s: s,
                self.r: r,
                self.act_probs: probs,
                self.q_a_: q_a_,
                self.a: a
            })
        return td_error

    def loss_l(self, ae, a):
        return 0.0 if ae == a else 0.8

    def loss_jeq(self, q):
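        # Large-margin supervised loss (DQfD):
        #   J_E(Q) = max_a [Q(s, a) + l(a_E, a)] - Q(s, a_E)
        # where l(a_E, a) is 0 for the expert action a_E and a positive margin (0.8) otherwise.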
        jeq = 0.0
        for i in range(self.config.BATCH_SIZE):
            ae = self.action_batch[i]
            max_value = float("-inf")
            for a in range(self.action_dim):
                max_value = tf.maximum(q[i][a] + self.loss_l(ae, a), max_value)
            jeq += self.isdemo[i] * (max_value - q[i][ae])
        return jeq

    @lazy_property
    def loss(self):
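        # Combined DQfD-style loss: 1-step Q loss + n-step Q loss + supervised
        # margin loss + L2 term, weighted by config.LAMBDA and scaled by the
        # prioritized-replay importance-sampling weights (ISWeights).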
        l_dq = tf.reduce_mean(tf.squared_difference(self.q, self.y_input))
        l_n_dq = tf.reduce_mean(
            tf.squared_difference(self.q, self.n_step_y_input))
        l_jeq = self.loss_jeq(self.q)
        l_l2 = tf.reduce_sum([
            tf.reduce_mean(reg_l)
            for reg_l in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        ])
        return self.ISWeights * tf.reduce_sum([
            l * λ
            for l, λ in zip([l_dq, l_n_dq, l_jeq, l_l2], self.config.LAMBDA)
        ])

    def train_Q_network(self, pre_train=False, update=True):
        """
        :param pre_train: True means should sample from demo_buffer instead of replay_buffer
        :param update: True means the action "update_target_net" executes outside, and can be ignored in the function
        """
        if not pre_train and not self.replay_memory.full(
        ):  # sampling should be executed AFTER replay_memory filled
            return
        self.time_step += 1

        assert self.replay_memory.full() or pre_train

        actual_memory = self.demo_memory if pre_train else self.replay_memory
        tree_idxes, minibatch, ISWeights = actual_memory.sample(
            self.config.BATCH_SIZE)

        np.random.shuffle(minibatch)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]
        done_batch = [data[4] for data in minibatch]
        demo_data = [data[5] for data in minibatch]
        n_step_reward_batch = [data[6] for data in minibatch]
        n_step_state_batch = [data[7] for data in minibatch]
        n_step_done_batch = [data[8] for data in minibatch]
        actual_n = [data[9] for data in minibatch]

        # precompute the values needed to build the placeholder feeds
        q_next = self.q.eval(feed_dict={self.s: next_state_batch})
        q_target_next = self.q_target.eval(
            feed_dict={self.s: next_state_batch})

        n_step_q_next = self.q.eval(feed_dict={self.s: n_step_state_batch})
        n_step_q_target_next = self.q_target.eval(
            feed_dict={self.s: n_step_state_batch})

        y_batch = np.zeros((self.config.BATCH_SIZE, self.action_dim))
        n_step_y_batch = np.zeros((self.config.BATCH_SIZE, self.action_dim))
        # td_error_batch = np.zeros((self.config.BATCH_SIZE, 1))
        for i in range(self.config.BATCH_SIZE):
            # state, action, reward, next_state, done, demo_data, n_step_reward, n_step_state, n_step_done = t
            temp = self.q.eval(
                feed_dict={
                    self.s: state_batch[i].reshape((-1, self.state_dim))
                })[0]
            # v = np.sum(temp, action_prob_batch[i])
            # td_error_batch[i] = temp[action_batch[i]] - v
            temp_0 = np.copy(temp)
            # add 1-step reward
            action = np.argmax(q_next[i])
            # action = next_action_batch[i]
            temp[action_batch[i]] = reward_batch[i] + (1 - int(
                done_batch[i])) * self.config.GAMMA * q_target_next[i][action]
            y_batch[i] = temp
            # add n-step reward
            action = np.argmax(n_step_q_next[i])
            q_n_step = (
                1 - int(n_step_done_batch[i])) * self.config.GAMMA**actual_n[
                    i] * n_step_q_target_next[i][action]
            temp_0[action_batch[i]] = n_step_reward_batch[i] + q_n_step
            n_step_y_batch[i] = temp_0

        _, abs_errors = self.sess.run(
            [self.optimize, self.abs_errors],
            feed_dict={
                self.y_input: y_batch,
                self.n_step_y_input: n_step_y_batch,
                self.s: state_batch,
                self.action_batch: action_batch,
                self.isdemo: demo_data,
                self.ISWeights: ISWeights
            })

        self.replay_memory.batch_update(
            tree_idxes, abs_errors)  # update priorities for data in memory

        # In this example an episode has a finite number of steps, so the target
        # network can also be updated externally once an episode ends; when
        # update is False, the update is controlled outside this function.
        if update and self.time_step % self.config.UPDATE_TARGET_NET == 0:
            self.sess.run(self.update_target_net)

        return state_batch, action_batch

    def learn_actor(self, s, a, td):
        s = s[np.newaxis, :]
        a = a[np.newaxis, :]
        # td = td[np.newaxis, :]
        feed_dict = {self.s: s, self.a: a, self.td: td}
        _, exp_v = self.sess.run([self.train_op_actor, self.exp_v], feed_dict)
        return exp_v

    def choose_action(self, s):
        s = s[np.newaxis, :]
        probs = self.sess.run(self.probs, {self.s: s})  # probabilities of all actions
        return np.random.choice(np.arange(probs.shape[1]),
                                p=probs.ravel()), probs  # returns an int action and the probs

    def pretrain(self, state, action):
        print("Pre-training for Actor!")
        # note: the class defines no self.batch_size; size the batch from the input instead
        expert_action_batch = np.zeros((len(action), 8))
        for i, a in enumerate(action):
            expert_action_batch[i, a] = 1
        self.sess.run(self.pretrain_op, {
            self.s: state,
            self.expert_action: expert_action_batch
        })

    def contruct_layer(self, inp, activation_fn, reuse, norm, is_train, scope):
        if norm == 'batch_norm':
            out = tf.contrib.layers.batch_norm(inp,
                                               activation_fn=activation_fn,
                                               reuse=reuse,
                                               is_training=is_train,
                                               scope=scope)
        elif norm == 'None':
            out = activation_fn(inp)
        else:
            raise ValueError("Can't recognize {}".format(norm))
        return out

    def construct_weights(self):
        weights = {}

        weights['embed_w1'] = tf.Variable(
            tf.glorot_uniform_initializer()([1, 4]), name='embed_w1')
        weights['embed_b1'] = tf.Variable(tf.zeros([4]), name='embed_b1')

        # for phase, one-hot
        weights['embed_w2'] = tf.Variable(tf.random_uniform_initializer(
            minval=-0.05, maxval=0.05)([2, 4]),
                                          name='embed_w2')
        #weights['embed_b2'] = tf.Variable(tf.zeros([4]), name='embed_b2')

        # lane embeding
        weights['lane_embed_w3'] = tf.Variable(
            tf.glorot_uniform_initializer()([8, 16]), name='lane_embed_w3')
        weights['lane_embed_b3'] = tf.Variable(tf.zeros([16]),
                                               name='lane_embed_b3')

        # relation embeding, one-hot
        weights['relation_embed_w4'] = tf.Variable(
            tf.random_uniform_initializer(minval=-0.05, maxval=0.05)([2, 4]),
            name='relation_embed_w4')
        #weights['relation_embed_b4'] = tf.Variable(tf.zeros([4]), name='relation_embed_b4')

        weights['feature_conv_w1'] = tf.Variable(
            tf.glorot_uniform_initializer()([1, 1, 32, 20]),
            name='feature_conv_w1')
        weights['feature_conv_b1'] = tf.Variable(tf.zeros([20]),
                                                 name='feature_conv_b1')

        weights['phase_conv_w1'] = tf.Variable(
            tf.glorot_uniform_initializer()([1, 1, 4, 20]),
            name='phase_conv_w1')
        weights['phase_conv_b1'] = tf.Variable(tf.zeros([20]),
                                               name='phase_conv_b1')

        weights['combine_conv_w1'] = tf.Variable(
            tf.glorot_uniform_initializer()([1, 1, 20, 20]),
            name='combine_conv_w1')
        weights['combine_conv_b1'] = tf.Variable(tf.zeros([20]),
                                                 name='combine_conv_b1')

        weights['final_conv_w1'] = tf.Variable(
            tf.glorot_uniform_initializer()([1, 1, 20, 1]),
            name='final_conv_w1')
        weights['final_conv_b1'] = tf.Variable(tf.zeros([1]),
                                               name='final_conv_b1')

        return weights

    def construct_forward(self,
                          inp,
                          reuse,
                          norm,
                          is_train,
                          scope,
                          prefix='fc'):
        # embedding; only supports 4- or 8-phase intersections, hard-coded for
        # lane_num_vehicle + cur_phase features
        with tf.variable_scope(scope):
            weights = self.construct_weights()
            dim = int(inp.shape[1].value / 2)
            num_veh = inp[:, :dim]
            num_veh = tf.reshape(num_veh, [-1, 1])

            phase = inp[:, dim:]
            phase = tf.cast(phase, tf.int32)
            phase = tf.one_hot(phase, 2)
            phase = tf.reshape(phase, [-1, 2])

            embed_num_veh = self.contruct_layer(
                tf.matmul(num_veh, weights['embed_w1']) + weights['embed_b1'],
                activation_fn=tf.nn.sigmoid,
                reuse=reuse,
                is_train=is_train,
                norm=norm,
                scope='num_veh_embed.' + prefix)
            embed_num_veh = tf.reshape(embed_num_veh, [-1, dim, 4])

            embed_phase = self.contruct_layer(tf.matmul(
                phase, weights['embed_w2']),
                                              activation_fn=tf.nn.sigmoid,
                                              reuse=reuse,
                                              is_train=is_train,
                                              norm=norm,
                                              scope='phase_embed.' + prefix)
            embed_phase = tf.reshape(embed_phase, [-1, dim, 4])

            dic_lane = {}
            for i, m in enumerate(self.dic_traffic_env_conf["LANE_PHASE_INFO"]
                                  ["start_lane"]):
                dic_lane[m] = tf.concat(
                    [embed_num_veh[:, i, :], embed_phase[:, i, :]], axis=-1)

            list_phase_pressure = []
            phase_startLane_mapping = self.dic_traffic_env_conf[
                "LANE_PHASE_INFO"]["phase_sameStartLane_mapping"]
            for phase in self.dic_traffic_env_conf["LANE_PHASE_INFO"]["phase"]:
                t1 = tf.Variable(tf.zeros(1))
                t2 = tf.Variable(tf.zeros(1))
                for lane in phase_startLane_mapping[phase][0]:
                    t1 += self.contruct_layer(
                        tf.matmul(dic_lane[lane], weights['lane_embed_w3']) +
                        weights['lane_embed_b3'],
                        activation_fn=self._activation_fn,
                        reuse=reuse,
                        is_train=is_train,
                        norm=norm,
                        scope='lane_embed.' + prefix)
                t1 /= len(phase_startLane_mapping[phase][0])

                if len(phase_startLane_mapping[phase]) >= 2:
                    for lane in phase_startLane_mapping[phase][1]:
                        t2 += self.contruct_layer(
                            tf.matmul(dic_lane[lane], weights['lane_embed_w3'])
                            + weights['lane_embed_b3'],
                            activation_fn=self._activation_fn,
                            reuse=reuse,
                            is_train=is_train,
                            norm=norm,
                            scope='lane_embed.' + prefix)
                    t2 /= len(phase_startLane_mapping[phase][1])

                list_phase_pressure.append(t1 + t2)
                # TODO check batch_size here
            constant = relation(self.dic_traffic_env_conf["LANE_PHASE_INFO"])

            constant = tf.one_hot(constant, 2)
            s1, s2 = constant.shape[1:3]
            constant = tf.reshape(constant, (-1, 2))
            relation_embedding = tf.matmul(constant,
                                           weights['relation_embed_w4'])
            relation_embedding = tf.reshape(relation_embedding,
                                            (-1, s1, s2, 4))

            list_phase_pressure_recomb = []
            num_phase = len(list_phase_pressure)

            for i in range(num_phase):
                for j in range(num_phase):
                    if i != j:
                        list_phase_pressure_recomb.append(
                            tf.concat([
                                list_phase_pressure[i], list_phase_pressure[j]
                            ],
                                      axis=-1,
                                      name="concat_compete_phase_%d_%d" %
                                      (i, j)))

            list_phase_pressure_recomb = tf.concat(list_phase_pressure_recomb,
                                                   axis=-1,
                                                   name="concat_all")
            feature_map = tf.reshape(list_phase_pressure_recomb,
                                     (-1, num_phase, num_phase - 1, 32))
            #if num_phase == 8:
            #    feature_map = tf.reshape(list_phase_pressure_recomb, (-1, 8, 7, 32))
            #else:
            #    feature_map = tf.reshape(list_phase_pressure_recomb, (-1, 4, 3, 32))

            lane_conv = tf.nn.conv2d(
                feature_map,
                weights['feature_conv_w1'], [1, 1, 1, 1],
                'VALID',
                name='feature_conv') + weights['feature_conv_b1']
            lane_conv = tf.nn.leaky_relu(lane_conv, name='feature_activation')

            # relation conv layer
            relation_conv = tf.nn.conv2d(
                relation_embedding,
                weights['phase_conv_w1'], [1, 1, 1, 1],
                'VALID',
                name='phase_conv') + weights['phase_conv_b1']
            relation_conv = tf.nn.leaky_relu(relation_conv,
                                             name='phase_activation')
            combine_feature = tf.multiply(lane_conv,
                                          relation_conv,
                                          name="combine_feature")

            # second conv layer
            hidden_layer = tf.nn.conv2d(combine_feature, weights['combine_conv_w1'], [1, 1, 1, 1], 'VALID', name='combine_conv') + \
                        weights['combine_conv_b1']
            hidden_layer = tf.nn.leaky_relu(hidden_layer,
                                            name='combine_activation')

            before_merge = tf.nn.conv2d(hidden_layer, weights['final_conv_w1'], [1, 1, 1, 1], 'VALID',
                                        name='final_conv') + \
                           weights['final_conv_b1']

            before_merge = tf.nn.leaky_relu(before_merge,
                                            name='final_activation')

            #if self.num_actions == 8:
            #    _shape = (-1, 8, 7)
            #else:
            #    _shape = (-1, 4, 3)
            _shape = (-1, 8, 7)
            before_merge = tf.reshape(before_merge, _shape)
            out = tf.reduce_sum(before_merge, axis=2)

        return out
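
Both examples rely on helpers that are not shown here: the lazy_property decorator, the prioritized-replay Memory class, and (in the first example) gumbel_softmax and relation. Below is a minimal sketch of a typical lazy_property decorator, assuming the usual cache-on-first-access behaviour; the projects above may define it differently.

import functools


def lazy_property(func):
    """Evaluate the decorated method once, cache the result, expose it as a property."""
    attribute = '_cache_' + func.__name__

    @property
    @functools.wraps(func)
    def wrapper(self):
        if not hasattr(self, attribute):
            setattr(self, attribute, func(self))
        return getattr(self, attribute)

    return wrapper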
Example #2
class DQfD:
    def __init__(self, env, config, demo_transitions=None):
        self.sess = tf.InteractiveSession()
        self.config = config
        # replay_memory stores both demo data and generated data, while demo_memory only stores demo data
        self.replay_memory = Memory(capacity=self.config.replay_buffer_size,
                                    permanent_data=len(demo_transitions))
        self.demo_memory = Memory(capacity=self.config.demo_buffer_size,
                                  permanent_data=self.config.demo_buffer_size)
        self.add_demo_to_memory(
            demo_transitions=demo_transitions
        )  # add demo data to both demo_memory & replay_memory
        self.time_step = 0
        self.epsilon = self.config.INITIAL_EPSILON
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.action_batch = tf.placeholder(tf.int32, [None])
        self.y_input = tf.placeholder(tf.float32, [None, self.action_dim])
        self.ISWeights = tf.placeholder(tf.float32, [None, 1])
        self.n_step_y_input = tf.placeholder(
            tf.float32, [None, self.action_dim])  # for n-step reward
        self.isdemo = tf.placeholder(tf.float32, [None])
        self.eval_input = tf.placeholder(tf.float32, [None, self.state_dim])
        self.select_input = tf.placeholder(tf.float32, [None, self.state_dim])
        self.select_input = tf.placeholder("float", [None, self.state_dim])

        # Touch the lazy properties so their TensorFlow ops are built now.
        self.Q_eval
        self.Q_select

        self.loss
        self.optimize
        self.update_target_net
        self.abs_errors

        self.saver = tf.train.Saver()

        self.sess.run(tf.global_variables_initializer())

        self.save_model()
        self.restore_model()

    def add_demo_to_memory(self, demo_transitions):
        # add demo data to both demo_memory & replay_memory
        for t in demo_transitions:
            self.demo_memory.store(np.array(t, dtype=object))
            self.replay_memory.store(np.array(t, dtype=object))
            assert len(t) == 10

    # use the expert-demo-data to pretrain
    def pre_train(self):
        print('Pre-training ...')
        for i in range(self.config.PRETRAIN_STEPS):
            self.train_Q_network(pre_train=True)
            if i % 200 == 0 and i > 0:
                print('{}th pre-train step finished ...'.format(i))
        self.time_step = 0
        print('Pre-training finished.')

    # TODO: How can the variables created by tf.layers.dense be added to a custom collection?
    # def build_layers(self, state, collections, units_1, units_2, w_i, b_i, regularizer=None):
    #     with tf.variable_scope('dese1'):
    #         dense1 = tf.layers.dense(tf.contrib.layers.flatten(state), activation=tf.nn.relu, units=units_1,
    #                                  kernel_initializer=w_i, bias_initializer=b_i,
    #                                  kernel_regularizer=regularizer, bias_regularizer=regularizer)
    #     with tf.variable_scope('dens2'):
    #         dense2 = tf.layers.dense(dense1, activation=tf.nn.relu, units=units_2,
    #                                  kernel_initializer=w_i, bias_initializer=b_i,
    #                                  kernel_regularizer=regularizer, bias_regularizer=regularizer)
    #     with tf.variable_scope('dene3'):
    #         dense3 = tf.layers.dense(dense2, activation=tf.nn.relu, units=self.action_dim,
    #                                  kernel_initializer=w_i, bias_initializer=b_i,
    #                                  kernel_regularizer=regularizer, bias_regularizer=regularizer)
    #     return dense3

    def build_layers(self,
                     state,
                     c_names,
                     units_1,
                     units_2,
                     w_i,
                     b_i,
                     reg=None):
        a_d = self.action_dim
        with tf.variable_scope('l1'):
            w1 = tf.get_variable('w1', [self.state_dim, units_1],
                                 initializer=w_i,
                                 collections=c_names,
                                 regularizer=reg)
            b1 = tf.get_variable('b1', [1, units_1],
                                 initializer=b_i,
                                 collections=c_names,
                                 regularizer=reg)
            dense1 = tf.nn.relu(tf.matmul(state, w1) + b1)
        with tf.variable_scope('l2'):
            w2 = tf.get_variable('w2', [units_1, units_2],
                                 initializer=w_i,
                                 collections=c_names,
                                 regularizer=reg)
            b2 = tf.get_variable('b2', [1, units_2],
                                 initializer=b_i,
                                 collections=c_names,
                                 regularizer=reg)
            dense2 = tf.nn.relu(tf.matmul(dense1, w2) + b2)
        with tf.variable_scope('l3'):
            w3 = tf.get_variable('w3', [units_2, a_d],
                                 initializer=w_i,
                                 collections=c_names,
                                 regularizer=reg)
            b3 = tf.get_variable('b3', [1, a_d],
                                 initializer=b_i,
                                 collections=c_names,
                                 regularizer=reg)
            dense3 = tf.matmul(dense2, w3) + b3
        return dense3

    @lazy_property
    def Q_select(self):
        with tf.variable_scope('select_net') as scope:
            c_names = ['select_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            w_i = tf.random_uniform_initializer(-0.1, 0.1)
            b_i = tf.constant_initializer(0.1)
            reg = tf.contrib.layers.l2_regularizer(
                scale=0.2)  # Note: only parameters in select-net need L2
            return self.build_layers(self.select_input, c_names, 24, 24, w_i,
                                     b_i, reg)

    @lazy_property
    def Q_eval(self):
        with tf.variable_scope('eval_net') as scope:
            c_names = ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            w_i = tf.random_uniform_initializer(-0.1, 0.1)
            b_i = tf.constant_initializer(0.1)
            return self.build_layers(self.eval_input, c_names, 24, 24, w_i,
                                     b_i)

    def loss_l(self, ae, a):
        return 0.0 if ae == a else 0.8

    def loss_jeq(self, Q_select):
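        # Large-margin supervised loss (DQfD):
        #   J_E(Q) = max_a [Q(s, a) + l(a_E, a)] - Q(s, a_E)
        # where l(a_E, a) is 0 for the expert action a_E and a positive margin (0.8) otherwise.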
        jeq = 0.0
        for i in range(self.config.BATCH_SIZE):
            ae = self.action_batch[i]
            max_value = float("-inf")
            for a in range(self.action_dim):
                max_value = tf.maximum(Q_select[i][a] + self.loss_l(ae, a),
                                       max_value)
            jeq += self.isdemo[i] * (max_value - Q_select[i][ae])
        return jeq

    @lazy_property
    def loss(self):
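        # Combined DQfD loss: 1-step Q loss + n-step Q loss + supervised
        # margin loss + L2 term, weighted by config.LAMBDA and scaled by the
        # prioritized-replay importance-sampling weights (ISWeights).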
        l_dq = tf.reduce_mean(
            tf.squared_difference(self.Q_select, self.y_input))
        l_n_dq = tf.reduce_mean(
            tf.squared_difference(self.Q_select, self.n_step_y_input))
        # l_n_step_dq = self.loss_n_step_dq(self.Q_select, self.n_step_y_input)
        l_jeq = self.loss_jeq(self.Q_select)
        l_l2 = tf.reduce_sum([
            tf.reduce_mean(reg_l)
            for reg_l in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        ])
        return self.ISWeights * tf.reduce_sum([
            l * λ
            for l, λ in zip([l_dq, l_n_dq, l_jeq, l_l2], self.config.LAMBDA)
        ])

    @lazy_property
    def abs_errors(self):
        return tf.reduce_sum(tf.abs(self.y_input - self.Q_select),
                             axis=1)  # only use 1-step R to compute abs_errors

    @lazy_property
    def optimize(self):
        optimizer = tf.train.AdamOptimizer(self.config.LEARNING_RATE)
        return optimizer.minimize(
            self.loss)  # only the select-net parameters are optimized here

    @lazy_property
    def update_target_net(self):
        select_params = tf.get_collection('select_net_params')
        eval_params = tf.get_collection('eval_net_params')
        return [tf.assign(e, s) for e, s in zip(eval_params, select_params)]

    def save_model(self):
        print("Model saved in : {}".format(
            self.saver.save(self.sess, self.config.MODEL_PATH)))

    def restore_model(self):
        self.saver.restore(self.sess, self.config.MODEL_PATH)
        print("Model restored.")

    def perceive(self, transition):
        self.replay_memory.store(np.array(transition, dtype=object))
        # epsilon->FINAL_EPSILON(min_epsilon)
        if self.replay_memory.full():
            self.epsilon = max(self.config.FINAL_EPSILON,
                               self.epsilon * self.config.EPSILIN_DECAY)

    def train_Q_network(self, pre_train=False, update=True):
        """
        :param pre_train: True means should sample from demo_buffer instead of replay_buffer
        :param update: True means the action "update_target_net" executes outside, and can be ignored in the function
        """
        if not pre_train and not self.replay_memory.full(
        ):  # sampling should be executed AFTER replay_memory filled
            return
        self.time_step += 1

        assert self.replay_memory.full() or pre_train

        actual_memory = self.demo_memory if pre_train else self.replay_memory
        tree_idxes, minibatch, ISWeights = actual_memory.sample(
            self.config.BATCH_SIZE)

        np.random.shuffle(minibatch)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]
        done_batch = [data[4] for data in minibatch]
        demo_data = [data[5] for data in minibatch]
        n_step_reward_batch = [data[6] for data in minibatch]
        n_step_state_batch = [data[7] for data in minibatch]
        n_step_done_batch = [data[8] for data in minibatch]
        actual_n = [data[9] for data in minibatch]

        # precompute the values needed to build the placeholder feeds
        Q_select = self.Q_select.eval(
            feed_dict={self.select_input: next_state_batch})
        Q_eval = self.Q_eval.eval(
            feed_dict={self.eval_input: next_state_batch})
        n_step_Q_select = self.Q_select.eval(
            feed_dict={self.select_input: n_step_state_batch})
        n_step_Q_eval = self.Q_eval.eval(
            feed_dict={self.eval_input: n_step_state_batch})

        y_batch = np.zeros((self.config.BATCH_SIZE, self.action_dim))
        n_step_y_batch = np.zeros((self.config.BATCH_SIZE, self.action_dim))
        for i in range(self.config.BATCH_SIZE):
            # state, action, reward, next_state, done, demo_data, n_step_reward, n_step_state, n_step_done = t
            temp = self.Q_select.eval(
                feed_dict={
                    self.select_input: state_batch[i].reshape((-1,
                                                               self.state_dim))
                })[0]
            temp_0 = np.copy(temp)
            # add 1-step reward
            action = np.argmax(Q_select[i])
            temp[action_batch[i]] = reward_batch[i] + (
                1 - int(done_batch[i])) * self.config.GAMMA * Q_eval[i][action]
            y_batch[i] = temp
            # add n-step reward
            action = np.argmax(n_step_Q_select[i])
            q_n_step = (
                1 - int(n_step_done_batch[i])
            ) * self.config.GAMMA**actual_n[i] * n_step_Q_eval[i][action]
            temp_0[action_batch[i]] = n_step_reward_batch[i] + q_n_step
            n_step_y_batch[i] = temp_0

        _, abs_errors = self.sess.run(
            [self.optimize, self.abs_errors],
            feed_dict={
                self.y_input: y_batch,
                self.n_step_y_input: n_step_y_batch,
                self.select_input: state_batch,
                self.action_batch: action_batch,
                self.isdemo: demo_data,
                self.ISWeights: ISWeights
            })

        self.replay_memory.batch_update(
            tree_idxes, abs_errors)  # update priorities for data in memory

        # In this example an episode has a finite number of steps, so the target
        # network can also be updated externally once an episode ends; when
        # update is False, the update is controlled outside this function.
        if update and self.time_step % self.config.UPDATE_TARGET_NET == 0:
            self.sess.run(self.update_target_net)

    def egreedy_action(self, state):
        if random.random() <= self.epsilon:
            return random.randint(0, self.action_dim - 1)
        return np.argmax(
            self.Q_select.eval(feed_dict={self.select_input: [state]})[0])
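
For reference, both classes assume each transition (demo or generated) is a 10-element tuple in the order unpacked inside train_Q_network. Below is a minimal sketch of packing demo transitions before handing them to either class; the helper name make_transition and the toy 4-dimensional states are illustrative assumptions, not part of the examples.

import numpy as np


def make_transition(state, action, reward, next_state, done,
                    n_step_reward, n_step_state, n_step_done, actual_n,
                    is_demo=1.0):
    # Order matches the unpacking in train_Q_network: state, action, reward,
    # next_state, done, demo_data, n_step_reward, n_step_state, n_step_done, actual_n
    return (state, action, reward, next_state, done,
            is_demo, n_step_reward, n_step_state, n_step_done, actual_n)


demo_transitions = [
    make_transition(np.zeros(4), 0, 1.0, np.zeros(4), False,
                    1.0, np.zeros(4), False, 1)
]
assert all(len(t) == 10 for t in demo_transitions)  # same check as add_demo_to_memory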