targets_mb = np.array(target_Qs_batch)

                _, loss, absolute_errors = sess.run(
                    [
                        DQNetwork.optimizer, DQNetwork.loss,
                        DQNetwork.absolute_errors
                    ],
                    feed_dict={
                        DQNetwork.inputs_: states_mb,
                        DQNetwork.target_Q: targets_mb,
                        DQNetwork.actions_: actions_mb,
                        DQNetwork.ISWeights_: ISWeights_mb
                    })

                # Update priority
                memory.batch_update(tree_idx, absolute_errors)

                # Write TF Summaries
                summary = sess.run(write_op,
                                   feed_dict={
                                       DQNetwork.inputs_: states_mb,
                                       DQNetwork.target_Q: targets_mb,
                                       DQNetwork.actions_: actions_mb,
                                       DQNetwork.ISWeights_: ISWeights_mb
                                   })
                writer.add_summary(summary, episode)
                writer.flush()

                if tau > max_tau:
                    # Update the parameters of our TargetNetwork with DQN_weights
                    update_target = update_target_graph()
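# Every example on this page relies on a prioritized-replay `Memory` built on a
# SumTree, with the interface store(experience), sample(batch_size) ->
# (tree_idx, batch, ISWeights), and batch_update(tree_idx, abs_errors).
# The class below is only a minimal, simplified sketch of that assumed
# interface (proportional prioritization over a flat numpy array instead of a
# real SumTree); the class name and the PER hyper-parameters are illustrative
# assumptions, not the original implementation. As in several examples here,
# each sampled entry is wrapped in a length-1 list, which is why calling code
# indexes batch items as each[0][...].
import numpy as np


class SimpleProportionalMemory:
    """Sketch of the Memory API used by the examples in this section."""

    def __init__(self, capacity, per_e=0.01, per_a=0.6, per_b=0.4):
        self.capacity = capacity
        self.per_e = per_e  # small constant so no priority is exactly zero
        self.per_a = per_a  # how strongly priorities bias sampling
        self.per_b = per_b  # importance-sampling correction exponent
        self.data = [None] * capacity
        self.priorities = np.zeros(capacity, dtype=np.float64)
        self.write = 0  # next slot to overwrite
        self.size = 0

    def store(self, experience):
        # New transitions get the current maximum priority so they are sampled at least once.
        max_p = self.priorities[:self.size].max() if self.size else 1.0
        self.data[self.write] = experience
        self.priorities[self.write] = max_p
        self.write = (self.write + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def sample(self, batch_size):
        # Sample indices proportionally to priority**a and return
        # (tree_idx, batch, ISWeights) like the Memory used above.
        scaled = self.priorities[:self.size] ** self.per_a
        probs = scaled / scaled.sum()
        idx = np.random.choice(self.size, batch_size, p=probs)
        batch = [[self.data[i]] for i in idx]  # wrapped entries, matching each[0][...] indexing
        is_weights = (self.size * probs[idx]) ** (-self.per_b)
        is_weights = (is_weights / is_weights.max()).reshape(-1, 1)
        return idx, batch, is_weights

    def batch_update(self, idx, abs_errors):
        # New priorities are the absolute TD errors plus a small epsilon.
        self.priorities[idx] = np.abs(abs_errors).ravel() + self.per_e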
Example #2
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = Memory(BUFFER_SIZE)
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # Tree indices of the most recently sampled batch; stored so that the
        # priorities in the replay memory can be updated after each learning step.
        self.tree_idx = None

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory

        e = self.experience(state, action, reward, next_state, done)
        self.memory.store(e)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY

        if self.t_step == 0:

            # Obtain random mini-batch from memory
            self.tree_idx, batch, ISWeights_mb = self.memory.sample(BATCH_SIZE)

            states = torch.from_numpy(np.vstack([each[0][0] for each in batch
                                                 ])).float().to(device)
            actions = torch.from_numpy(
                np.vstack([each[0][1] for each in batch])).long().to(device)
            rewards = torch.from_numpy(
                np.stack([[each[0][2]] for each in batch])).float().to(device)
            next_states = torch.from_numpy(
                np.vstack([each[0][3] for each in batch])).float().to(device)
            dones = torch.from_numpy(
                np.stack([[each[0][4]] for each in batch
                          ]).astype(np.uint8)).float().to(device)

            experiences = (states, actions, rewards, next_states, dones)

            self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Choose actions according to local network

        next_actions = self.qnetwork_local(next_states).argmax(dim=1)

        # Choose values from target network
        Q_targets_next = self.qnetwork_target(next_states).detach()[
            np.arange(BATCH_SIZE), next_actions].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update memory having the batch loss as priority value
        batch_loss = np.ones(BATCH_SIZE) * loss.data.cpu().numpy()

        self.memory.batch_update(self.tree_idx, batch_loss)

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
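# A minimal sketch of how an Agent like the one above might be driven in a
# training loop. The gym environment name, episode counts, and epsilon schedule
# are illustrative assumptions, not part of the original example; it also
# assumes the module-level constants (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR,
# UPDATE_EVERY, device) and the QNetwork/Memory classes used above are defined,
# and uses the classic gym API where env.step returns a 4-tuple.
import gym


def train(n_episodes=500, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    env = gym.make('CartPole-v1')
    agent = Agent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.n,
                  seed=0)
    eps = eps_start
    scores = []
    for episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            # step() stores the transition and learns every UPDATE_EVERY steps
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)  # decay exploration over episodes
    return scores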
Example #3
class DQN(object):
    '''
    DQN structure; the input is the machine's RAM values at each time step.
    '''
    def __init__(
        self,
        n_actions,  # number of actions
        n_features,  # number of observations per state
        learning_rate=0.005,
        reward_decay=0.9,  # gamma, the reward discount factor
        e_greedy=0.9,  # greedy threshold: whether to act greedily or randomly
        replace_target_iter=500,  # how often (in steps) to update Target_Net
        memory_size=10000,  # replay memory size
        batch_size=32,
        e_greedy_increment=None,
        output_graph=False,
        prioritized=True,  # whether to use prioritized replay
        sess=None,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.target_net_update_period = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        self.prioritized = prioritized  # whether prioritized replay is used

        self.global_step_counter = 0

        self.build_net()

        t_params = tf.get_collection('target_net_params')
        q_params = tf.get_collection('q_net_params')
        self.update_target_net = [
            tf.assign(t, e) for t, e in zip(t_params, q_params)
        ]

        if self.prioritized:  # use a SumTree
            self.memory = Memory(
                capacity=memory_size)  # replay memory with capacity memory_size
        else:  # without prioritized replay, represent the memory as a plain numpy array
            self.memory = np.zeros((self.memory_size, n_features * 2 + 2))

        if sess is None:
            self.sess = tf.Session()
            self.sess.run(tf.global_variables_initializer())
        else:
            self.sess = sess

        if output_graph:
            tf.summary.FileWriter("logs/", self.sess.graph)

        self.cost_his = []

    def build_net(self):
        '''
        Build the two networks (Q_Net and Target_Net).
        '''
        self.input_state = tf.placeholder(tf.float32, [None, self.n_features],
                                          name='input_state')
        self.output_target = tf.placeholder(tf.float32, [None, self.n_actions],
                                            name='output_target')
        self.input_weights = tf.placeholder(
            tf.float32, [None, 1], name='IS_weights')  # per-sample importance-sampling weight in the loss

        # Build Q_Net
        with tf.variable_scope('q_net'):
            c_names = ['q_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            self.q_eval = self.build_layers(self.input_state, c_names, True)
        # Build the training loss for Q_Net

        with tf.variable_scope('loss'):
            self.abs_errors = tf.reduce_sum(tf.abs(self.output_target -
                                                   self.q_eval),
                                            axis=1)
            self.loss = tf.reduce_mean(
                self.input_weights *
                tf.squared_difference(self.output_target, self.q_eval))
        # Build the training op for Q_Net

        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(
                self.loss)

        # Initialize and build Target_Net
        self.input_state_ = tf.placeholder(tf.float32, [None, self.n_features],
                                           name='s_')
        with tf.variable_scope('target_net'):  # build Target_Net
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            self.q_next = self.build_layers(self.input_state_, c_names, False)

    def build_layers(self, s, c_names, trainable):
        '''Build a small fully connected network with one hidden layer and a linear output layer.'''
        w_initializer, b_initializer = tf.random_normal_initializer(
            0., 0.3), tf.constant_initializer(0.1)
        with tf.variable_scope('l1'):
            w1 = tf.get_variable('w1', [self.n_features, hidden_size],
                                 initializer=w_initializer,
                                 collections=c_names,
                                 trainable=trainable)
            b1 = tf.get_variable('b1', [1, hidden_size],
                                 initializer=b_initializer,
                                 collections=c_names,
                                 trainable=trainable)
            l1 = tf.nn.relu(tf.matmul(s, w1) + b1)

        with tf.variable_scope('l2'):
            w2 = tf.get_variable('w2', [hidden_size, self.n_actions],
                                 initializer=w_initializer,
                                 collections=c_names,
                                 trainable=trainable)
            b2 = tf.get_variable('b2', [1, self.n_actions],
                                 initializer=b_initializer,
                                 collections=c_names,
                                 trainable=trainable)
            out = tf.matmul(l1, w2) + b2
        return out

    # Store a transition collected from the environment into the DQN's replay memory
    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, [a, r], s_))  # pack the transition into a single array
        self.memory.store(transition)

    def choose_action(self, observation):
        '''Choose an action for the given state: the greedy action with probability epsilon (90% by default), a random action otherwise.'''
        observation = observation[np.newaxis, :]
        if np.random.uniform() < self.epsilon:
            actions_value = self.sess.run(
                self.q_eval, feed_dict={self.input_state: observation})
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def learn(self):
        if self.global_step_counter % self.target_net_update_period == 0:
            self.sess.run(self.update_target_net)

        tree_idx, batch_memory, memory_weights = self.memory.sample(
            self.batch_size)

        feed = {
            self.input_state_: batch_memory[:, -self.n_features:],
            self.input_state: batch_memory[:, :self.n_features]
        }
        q_next, q_eval = self.sess.run([self.q_next, self.q_eval],
                                       feed_dict=feed)  # forward pass through Q_Net and Target_Net

        # Only the selected action contributes to the loss; the loss for every other action is zero
        output_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)
        reward = batch_memory[:, self.n_features + 1]
        output_target[
            batch_index,
            eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)

        # Get this step's errors and use them to update the priorities in the SumTree
        feed = {
            self.input_state: batch_memory[:, :self.n_features],
            self.output_target: output_target,
            self.input_weights: memory_weights
        }
        _, abs_errors, self.cost = self.sess.run(
            [self._train_op, self.abs_errors, self.loss], feed_dict=feed)
        self.memory.batch_update(tree_idx, abs_errors)  # update the priorities of the sampled transitions in the SumTree

        self.cost_his.append(self.cost)

        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.global_step_counter += 1
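# A rough sketch of how the DQN class above might be wired to an environment.
# The environment object, its RAM-style observation API, the hidden_size
# global expected by build_layers, and the warm-up threshold are assumptions
# made for illustration only.
def run_dqn(env, n_episodes=100, learn_start=200):
    # Assumes env.reset() returns an observation vector of length n_features and
    # env.step(action) returns (next_observation, reward, done, info).
    agent = DQN(n_actions=env.action_space.n,
                n_features=env.observation_space.shape[0],
                e_greedy_increment=0.001)
    total_steps = 0
    for episode in range(n_episodes):
        observation = env.reset()
        done = False
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, _ = env.step(action)
            agent.store_transition(observation, action, reward, observation_)
            if total_steps > learn_start:  # let the replay memory fill up first
                agent.learn()
            observation = observation_
            total_steps += 1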
Example #4
class DQN2(object):
    '''
    DQN variant whose input is a (210, 160, 3) image; uses a CNN.
    '''
    def __init__(
        self,
        n_actions,  # number of actions (9)
        image_shape,  # input image shape, (210, 160, 3)
        learning_rate=0.005,
        reward_decay=0.9,  # gamma, the reward discount factor
        e_greedy=0.9,  # greedy threshold: whether to act greedily or randomly
        replace_target_iter=500,  # how often (in steps) to update Target_Net
        memory_size=10000,  # replay memory size
        batch_size=32,
        e_greedy_increment=None,
        output_graph=False,
        prioritized=True,  # whether to use prioritized replay
        sess=None,
    ):
        self.n_actions = n_actions
        self.image_shape = [None] + list(image_shape)  # prepend the batch dimension
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.target_net_update_period = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        self.prioritized = prioritized  # whether prioritized replay is used

        self.global_step_counter = 0

        self.build_net()

        t_params = tf.get_collection('target_net_params')
        q_params = tf.get_collection('q_net_params')
        self.update_target_net = [
            tf.assign(t, e) for t, e in zip(t_params, q_params)
        ]

        if self.prioritized:  # use a SumTree
            self.memory = Memory(
                capacity=memory_size)  # replay memory with capacity memory_size
        else:  # without prioritized replay, represent the memory as a plain numpy array
            self.memory = np.zeros(
                (self.memory_size, int(np.prod(image_shape)) * 2 + 2))

        if sess is None:
            self.sess = tf.Session()
            self.sess.run(tf.global_variables_initializer())
        else:
            self.sess = sess

        if output_graph:
            tf.summary.FileWriter("logs/", self.sess.graph)

        self.cost_his = []

    def build_net(self):
        '''
        Build the two networks (Q_Net and Target_Net).
        '''
        self.input_state = tf.placeholder(tf.float32,
                                          self.image_shape,
                                          name='input_state')
        self.output_target = tf.placeholder(tf.float32, [None, self.n_actions],
                                            name='output_target')
        self.input_weights = tf.placeholder(
            tf.float32, [None, 1], name='IS_weights')  # per-sample importance-sampling weight in the loss

        # Build Q_Net
        with tf.variable_scope('q_net'):
            c_names = ['q_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            self.q_eval = self.build_layers(self.input_state, c_names, True)
        # Build the training loss for Q_Net
        with tf.variable_scope('loss'):
            self.abs_errors = tf.reduce_sum(tf.abs(self.output_target -
                                                   self.q_eval),
                                            axis=1)
            self.loss = tf.reduce_mean(
                self.input_weights *
                tf.squared_difference(self.output_target, self.q_eval))
        # Build the training op for Q_Net
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(
                self.loss)

        # Initialize and build Target_Net
        self.input_state_ = tf.placeholder(tf.float32,
                                           self.image_shape,
                                           name='s_')
        with tf.variable_scope('target_net'):  # build Target_Net
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            self.q_next = self.build_layers(self.input_state_, c_names, False)

    def build_layers(self, s, c_names, trainable):
        '''
        Build a CNN with two conv layers, two max-pooling layers, and two fully connected layers.
        '''
        # TODO: adjust the shape of each weight tensor
        w_initializer = tf.random_normal_initializer(0., 0.3)
        b_initializer = tf.constant_initializer(0.1)
        weights = {
            'conv1':
            tf.get_variable('conv_w1',
                            shape=[4, 4, 3, 6],
                            initializer=w_initializer,
                            collections=c_names,
                            trainable=trainable),
            'conv2':
            tf.get_variable('conv_w2',
                            shape=[4, 4, 6, 12],
                            initializer=w_initializer,
                            collections=c_names,
                            trainable=trainable),
            'h1':
            tf.get_variable('h_w1',
                            shape=hidden_size,
                            initializer=w_initializer,
                            collections=c_names,
                            trainable=trainable),
            'h2':
            tf.get_variable('h_w2',
                            shape=hidden_size,
                            initializer=w_initializer,
                            collections=c_names,
                            trainable=trainable)
        }
        biases = {
            'conv1':
            tf.get_variable('conv_b1',
                            shape=[6],  # one bias per conv1 output channel
                            initializer=b_initializer,
                            collections=c_names,
                            trainable=trainable),
            'conv2':
            tf.get_variable('conv_b2',
                            shape=[12],  # one bias per conv2 output channel
                            initializer=b_initializer,
                            collections=c_names,
                            trainable=trainable),
            'h1':
            tf.get_variable('h_b1',
                            shape=hidden_size,
                            initializer=b_initializer,
                            collections=c_names,
                            trainable=trainable),
            'h2':
            tf.get_variable('h_b2',
                            shape=hidden_size,
                            initializer=b_initializer,
                            collections=c_names,
                            trainable=trainable)
        }
        with tf.variable_scope('conv_1'):
            conv1_layer = tf.nn.conv2d(s,
                                       weights['conv1'],
                                       strides=[1, 1, 1, 1],
                                       padding='SAME')
            pool1_layer = tf.nn.max_pool(conv1_layer,
                                         ksize=[1, 2, 2, 1],
                                         strides=[1, 2, 2, 1],
                                         padding='SAME')
            relu1_layer = tf.nn.relu(pool1_layer + biases['conv1'])

        with tf.variable_scope('conv_2'):
            conv2_layer = tf.nn.conv2d(relu1_layer,
                                       weights['conv2'],
                                       strides=[1, 1, 1, 1],
                                       padding='SAME')
            pool2_layer = tf.nn.max_pool(conv2_layer,
                                         ksize=[1, 2, 2, 1],
                                         strides=[1, 2, 2, 1],
                                         padding='SAME')
            relu2_layer = tf.nn.relu(pool2_layer + biases['conv2'])
        with tf.variable_scope('hidden_1'):
            padding_layer = tf.reshape(relu2_layer,
                                       shape=[self.batch_size, -1])
            h1_layer = tf.matmul(padding_layer, weights['h1']) + biases['h1']
            h1_layer = tf.nn.relu(h1_layer)
        with tf.variable_scope('hidden_2'):
            out = tf.matmul(h1_layer, weights['h2']) + biases['h2']
            return out

    # Store a transition collected from the environment into the DQN's replay memory
    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, [a, r], s_))  # pack the transition into a single array
        self.memory.store(transition)

    def choose_action(self, observation):
        '''Choose an action for the given state: the greedy action with probability epsilon (90% by default), a random action otherwise.'''
        observation = observation[np.newaxis, :]
        if np.random.uniform() < self.epsilon:
            actions_value = self.sess.run(
                self.q_eval, feed_dict={self.input_state: observation})
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def learn(self):
        if self.global_step_counter % self.target_net_update_period == 0:
            self.sess.run(self.update_target_net)

        tree_idx, batch_memory, memory_weights = self.memory.sample(
            self.batch_size)

        feed = {
            self.input_state_: batch_memory[:, -self.image_shape:],
            self.input_state: batch_memory[:, :self.image_shape]
        }
        q_next, q_eval = self.sess.run([self.q_next, self.q_eval],
                                       feed_dict=feed)  # forward pass through Q_Net and Target_Net

        # Only the selected action contributes to the loss; the loss for every other action is zero
        output_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.image_shape].astype(int)
        reward = batch_memory[:, self.image_shape + 1]
        output_target[
            batch_index,
            eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)

        # Get this step's errors and use them to update the priorities in the SumTree
        feed = {
            self.input_state: batch_memory[:, :self.image_shape],
            self.output_target: output_target,
            self.input_weights: memory_weights
        }
        _, abs_errors, self.cost = self.sess.run(
            [self._train_op, self.abs_errors, self.loss], feed_dict=feed)
        self.memory.batch_update(tree_idx, abs_errors)  # update the priorities of the sampled transitions in the SumTree

        self.cost_his.append(self.cost)

        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.global_step_counter += 1
Example #5
class Agent:
    
    def __init__(self, input_dim, output_dim, lr, gamma, tau, buffer_size, l1_units, l2_units, l3_units, rnd_seed):

        self.buffer_size = buffer_size
        self.memory = Memory(self.buffer_size)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.actions = range(output_dim)  
        self.gamma = gamma
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = lr
        self.tau = tau
        self.l1_units = l1_units
        self.l2_units = l2_units
        self.l3_units = l3_units
        random.seed(rnd_seed)

        self.model, self.init_weights = self.create_model()
        self.target_model, self.target_init_weights = self.create_model()

    def xplr(self):
        # Linear epsilon schedule; assumes self.epsilon_const (schedule length)
        # and self.epi (episode counter) are initialized by the caller.
        self.epsilon = (self.epsilon_min - 1) / self.epsilon_const * self.epi + 1
        self.epsilon = max(self.epsilon, self.epsilon_min)
        self.epi += 1
    
    def create_model(self):
        model   = Sequential()
        model.add(Dense(self.l1_units, input_dim = self.input_dim, activation="relu"))
        model.add(Dense(self.l2_units, activation="relu"))
        model.add(Dense(self.l3_units, activation="relu"))
        model.add(Dense(self.output_dim))
        model.compile(loss=huber_loss, optimizer=Adam(lr=self.learning_rate))
        init_weights = model.get_weights()
        return model, init_weights

    def act(self, state):
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return np.random.choice(self.actions)
        return np.argmax(self.model.predict(state)[0])  # greedy action index

    def remember(self, state, action, reward, new_state, done):
        experience = state, action, reward, new_state, done
        self.memory.store(experience)

    def replay(self):
        batch_size = 32
        states = []
        targets = []
        TD_errors = []
        tree_idx, batch, ISWeights_mb = self.memory.sample(batch_size)
        states_mb = np.array([each[0][0] for each in batch])
        actions_mb = np.array([each[0][1] for each in batch])
        rewards_mb = np.array([each[0][2] for each in batch]) 
        next_states_mb = np.array([each[0][3] for each in batch])
        dones_mb = np.array([each[0][4] for each in batch])
        for q in range(batch_size):
            state, action, reward, next_state, done = states_mb[q], actions_mb[q], rewards_mb[q], next_states_mb[q], dones_mb[q]
            target = self.target_model.predict(state)
            if done:
                TD_target = reward/100
            else:
                Q_future = max(self.target_model.predict(next_state)[0])
                TD_target = np.clip(reward/100 + Q_future * self.gamma, -1, 0)
            TD_error = TD_target - target[0][action]
            TD_errors.append(TD_error)
            target[0][action] = TD_target
            states.append(state[0])
            targets.append(target[0])
        states = np.array(states)
        targets = np.array(targets)
        self.model.fit(states, targets, epochs=1, verbose=0)
        self.memory.batch_update(tree_idx, np.abs(TD_errors))

    def target_train(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau)
        self.target_model.set_weights(target_weights)
        
    def reset_weights(self, reset_weights):
        if reset_weights:
            self.model.set_weights(self.init_weights)
            self.target_model.set_weights(self.target_init_weights)
        self.memory = Memory(self.buffer_size)
        self.epsilon = 1.0
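# A minimal sketch of a training loop for the Keras Agent above. The
# environment, the reshaping of observations into a (1, input_dim) batch, the
# warm-up threshold, and calling target_train() once per episode are
# illustrative assumptions; the reward scaling inside replay() suggests a
# specific environment that is not shown here.
def run_agent(env, agent, n_episodes=200, warmup=500):
    step_count = 0
    for episode in range(n_episodes):
        state = env.reset().reshape(1, -1)  # act()/replay() expect a batched state
        done = False
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            next_state = next_state.reshape(1, -1)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            step_count += 1
            if step_count > warmup:  # wait until the memory holds enough samples
                agent.replay()
        agent.target_train()  # Polyak-average the online weights into the target model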
Example #6
class Agent():
    def __init__(self,
                 sess,
                 n_features,
                 config,
                 dic_traffic_env_conf,
                 demo=None,
                 lr=0.01):
        self.sess = sess
        self.config = config
        self._activation_fn = tf.nn.leaky_relu
        self.dic_traffic_env_conf = dic_traffic_env_conf

        # replay buffer
        self.replay_memory = Memory(capacity=self.config.replay_buffer_size,
                                    permanent_data=len(demo))
        # self.replay_memory = None
        self.demo_memory = Memory(capacity=self.config.demo_buffer_size,
                                  permanent_data=self.config.demo_buffer_size)
        self.add_demo_to_memory(demo_transitions=demo)
        self.state_dim = 16
        self.action_dim = 8

        self.s = tf.placeholder(tf.float32, [None, n_features], "state")
        self.v_ = tf.placeholder(tf.float32, [None, 1], "v_next")
        self.q_a_ = tf.placeholder(tf.float32, [None, 1], "q_next")
        self.r = tf.placeholder(tf.float32, [None, 1], 'r')
        self.a = tf.placeholder(tf.int32, [None, 1], 'act')
        self.act_probs = tf.placeholder(tf.float32, [None, 8], 'act_probs')

        self.action_batch = tf.placeholder("int32", [None])
        self.y_input = tf.placeholder("float", [None, self.action_dim])
        self.ISWeights = tf.placeholder("float", [None, 1])
        self.n_step_y_input = tf.placeholder(
            "float", [None, self.action_dim])  # for n-step reward
        self.isdemo = tf.placeholder("float", [None])

        self.td = tf.placeholder(tf.float32, [None, 1], "td_error")  # TD_error
        self.expert_action = tf.placeholder(tf.float32, [None, 8],
                                            "expert_action")

        self.hidden = self.construct_forward(self.s,
                                             True,
                                             'None',
                                             True,
                                             "hidden",
                                             prefix='fc')

        with tf.variable_scope('Q-Value'):
            self.q = tf.layers.dense(
                inputs=self.hidden,
                units=8,  # number of hidden units
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0.,
                                                                .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='Q')

        with tf.variable_scope('Q-Target'):
            self.q_target = tf.layers.dense(
                inputs=self.hidden,
                units=8,  # number of hidden units
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0.,
                                                                .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='Q-Target')

        with tf.variable_scope('Actor'):
            self.probs = tf.layers.dense(
                inputs=self.hidden,
                units=8,  # output units
                activation=tf.nn.softmax,  # get action probabilities
                kernel_initializer=tf.random_normal_initializer(0.,
                                                                .1),  # weights
                bias_initializer=tf.constant_initializer(0.1),  # biases
                name='acts_prob')

        # self.v = self.build_net("Value")
        # self.q = self.construct_forward(self.s, True, 'None', True, "Q-Value", prefix='fc')
        # self.q_target = self.construct_forward(self.s, True, 'None', True, "Q-Target", prefix='fc')
        # self.q = self.build_q_net("Q-Value")
        # self.q_target = self.build_q_net("Q-Target")
        self.q_a = tf.batch_gather(self.q, self.a)
        self.v = tf.reduce_sum(self.q * self.act_probs, axis=1, keepdims=True)
        # self.v_target = self.build_net("Target")
        self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='Q-Target')
        self.params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope='Q-Value')
        self.replace_target_op = [
            tf.assign(t, p) for t, p in zip(self.t_params, self.params)
        ]

        # Touch the lazy properties so their graph ops are created now
        self.loss
        self.optimize
        self.update_target_net
        self.abs_errors
        self.time_step = 0

        with tf.variable_scope('squared_TD_error'):
            # self.td_error = self.r + 0.8 * self.v_ - self.v
            self.td_error = self.q_a - self.v
            q_loss = tf.reduce_mean(
                tf.squared_difference(self.q_a, self.r + 0.8 * self.q_a_))
            # v_loss = tf.reduce_mean(tf.squared_difference(self.v, self.r + 0.8 * self.v_))
            # self.loss = tf.square(self.td_error)  # TD_error = (r+gamma*V_next) - V_eval
            # self.loss = q_loss + v_loss
            self.los = q_loss
        with tf.variable_scope('train-c'):
            self.train_op_critic = tf.train.AdamOptimizer(lr).minimize(
                self.los)

        with tf.variable_scope('exp_v'):
            # log_prob = tf.log(self.acts_prob[0, self.a])
            log_prob = tf.log(tf.batch_gather(self.probs, self.a))
            self.exp_v = tf.reduce_mean(
                log_prob * self.td)  # advantage (TD_error) guided loss

        self.action = gumbel_softmax(logits=self.probs,
                                     temperature=1,
                                     hard=False)

        with tf.variable_scope('train-a'):
            self.train_op_actor = tf.train.AdamOptimizer(lr).minimize(
                -self.exp_v)  # minimize(-exp_v) = maximize(exp_v)

        self.pretrain_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.action, labels=self.expert_action)
        self.pretrain_op = tf.train.AdamOptimizer(lr).minimize(
            self.pretrain_loss)

    def add_demo_to_memory(self, demo_transitions):
        # add demo data to both demo_memory & replay_memory
        for t in demo_transitions:
            self.demo_memory.store(np.array(t, dtype=object))
            self.replay_memory.store(np.array(t, dtype=object))
            assert len(t) == 10

    # use the expert-demo-data to pretrain
    def pre_train(self):
        print('Pre-training ...')
        for i in range(self.config.PRETRAIN_STEPS):
            self.train_Q_network(pre_train=True)
            if i % 200 == 0 and i > 0:
                print('{} th step of pre-train finish ...'.format(i))
        self.time_step = 0
        print('All pre-train finish.')

    @lazy_property
    def abs_errors(self):
        return tf.reduce_sum(tf.abs(self.y_input - self.q),
                             axis=1)  # only use 1-step R to compute abs_errors

    @lazy_property
    def optimize(self):
        optimizer = tf.train.AdamOptimizer(self.config.LEARNING_RATE)
        return optimizer.minimize(
            self.loss)  # only parameters in select-net is optimized here

    @lazy_property
    def update_target_net(self):
        select_params = tf.get_collection('Q-Value')
        eval_params = tf.get_collection('Q-Target')
        return [tf.assign(e, s) for e, s in zip(eval_params, select_params)]

    def learn_critic(self, s, r, s_, a, next_a, probs):
        s, s_, r = s[np.newaxis, :], s_[np.newaxis, :], r[np.newaxis, :]
        a, next_a = a[np.newaxis, :], next_a[np.newaxis, :]

        # v_ = self.sess.run(self.v, {self.s: s_})
        q_a_ = self.sess.run(self.q_a, {self.s: s_, self.a: next_a})
        td_error, _ = self.sess.run(
            [self.td_error, self.train_op_critic], {
                self.s: s,
                self.r: r,
                self.act_probs: probs,
                self.q_a_: q_a_,
                self.a: a
            })
        return td_error

    def loss_l(self, ae, a):
        return 0.0 if ae == a else 0.8

    def loss_jeq(self, q):
        jeq = 0.0
        for i in range(self.config.BATCH_SIZE):
            ae = self.action_batch[i]
            max_value = float("-inf")
            for a in range(self.action_dim):
                max_value = tf.maximum(q[i][a] + self.loss_l(ae, a), max_value)
            jeq += self.isdemo[i] * (max_value - q[i][ae])
        return jeq

    @lazy_property
    def loss(self):
        l_dq = tf.reduce_mean(tf.squared_difference(self.q, self.y_input))
        l_n_dq = tf.reduce_mean(
            tf.squared_difference(self.q, self.n_step_y_input))
        l_jeq = self.loss_jeq(self.q)
        l_l2 = tf.reduce_sum([
            tf.reduce_mean(reg_l)
            for reg_l in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        ])
        return self.ISWeights * tf.reduce_sum([
            l * λ
            for l, λ in zip([l_dq, l_n_dq, l_jeq, l_l2], self.config.LAMBDA)
        ])

    def train_Q_network(self, pre_train=False, update=True):
        """
        :param pre_train: True means should sample from demo_buffer instead of replay_buffer
        :param update: True means the action "update_target_net" executes outside, and can be ignored in the function
        """
        if not pre_train and not self.replay_memory.full(
        ):  # sampling should be executed AFTER replay_memory filled
            return
        self.time_step += 1

        assert self.replay_memory.full() or pre_train

        actual_memory = self.demo_memory if pre_train else self.replay_memory
        tree_idxes, minibatch, ISWeights = actual_memory.sample(
            self.config.BATCH_SIZE)

        np.random.shuffle(minibatch)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]
        done_batch = [data[4] for data in minibatch]
        demo_data = [data[5] for data in minibatch]
        n_step_reward_batch = [data[6] for data in minibatch]
        n_step_state_batch = [data[7] for data in minibatch]
        n_step_done_batch = [data[8] for data in minibatch]
        actual_n = [data[9] for data in minibatch]

        # compute the quantities fed into the placeholders first
        q_next = self.q.eval(feed_dict={self.s: next_state_batch})
        q_target_next = self.q_target.eval(
            feed_dict={self.s: next_state_batch})

        n_step_q_next = self.q.eval(feed_dict={self.s: n_step_state_batch})
        n_step_q_target_next = self.q_target.eval(
            feed_dict={self.s: n_step_state_batch})

        y_batch = np.zeros((self.config.BATCH_SIZE, self.action_dim))
        n_step_y_batch = np.zeros((self.config.BATCH_SIZE, self.action_dim))
        # td_error_batch = np.zeros((self.config.BATCH_SIZE, 1))
        for i in range(self.config.BATCH_SIZE):
            # state, action, reward, next_state, done, demo_data, n_step_reward, n_step_state, n_step_done = t
            temp = self.q.eval(
                feed_dict={
                    self.s: state_batch[i].reshape((-1, self.state_dim))
                })[0]
            # v = np.sum(temp, action_prob_batch[i])
            # td_error_batch[i] = temp[action_batch[i]] - v
            temp_0 = np.copy(temp)
            # add 1-step reward
            action = np.argmax(q_next[i])
            # action = next_action_batch[i]
            temp[action_batch[i]] = reward_batch[i] + (1 - int(
                done_batch[i])) * self.config.GAMMA * q_target_next[i][action]
            y_batch[i] = temp
            # add n-step reward
            action = np.argmax(n_step_q_next[i])
            q_n_step = (
                1 - int(n_step_done_batch[i])) * self.config.GAMMA**actual_n[
                    i] * n_step_q_target_next[i][action]
            temp_0[action_batch[i]] = n_step_reward_batch[i] + q_n_step
            n_step_y_batch[i] = temp_0

        _, abs_errors = self.sess.run(
            [self.optimize, self.abs_errors],
            feed_dict={
                self.y_input: y_batch,
                self.n_step_y_input: n_step_y_batch,
                self.s: state_batch,
                self.action_batch: action_batch,
                self.isdemo: demo_data,
                self.ISWeights: ISWeights
            })

        self.replay_memory.batch_update(
            tree_idxes, abs_errors)  # update priorities for data in memory

        # In this example an episode has a limited number of steps, so the target-net
        # update can be driven externally after an episode ends; when update is False
        # it is controlled outside this function.
        if update and self.time_step % self.config.UPDATE_TARGET_NET == 0:
            self.sess.run(self.update_target_net)

        return state_batch, action_batch

    def learn_actor(self, s, a, td):
        s = s[np.newaxis, :]
        a = a[np.newaxis, :]
        # td = td[np.newaxis, :]
        feed_dict = {self.s: s, self.a: a, self.td: td}
        _, exp_v = self.sess.run([self.train_op_actor, self.exp_v], feed_dict)
        return exp_v

    def choose_action(self, s):
        s = s[np.newaxis, :]
        probs = self.sess.run(self.probs, {self.s: s})  # probabilities of all actions
        return np.random.choice(np.arange(probs.shape[1]),
                                p=probs.ravel()), probs  # return an int action and the action probabilities

    def pretrain(self, state, action):
        print("Pre-training for Actor!")
        expert_action_batch = np.zeros((self.config.BATCH_SIZE, 8))
        for i, a in enumerate(action):
            expert_action_batch[i, a] = 1
        self.sess.run(self.pretrain_op, {
            self.s: state,
            self.expert_action: expert_action_batch
        })

    def contruct_layer(self, inp, activation_fn, reuse, norm, is_train, scope):
        if norm == 'batch_norm':
            out = tf.contrib.layers.batch_norm(inp,
                                               activation_fn=activation_fn,
                                               reuse=reuse,
                                               is_training=is_train,
                                               scope=scope)
        elif norm == 'None':
            out = activation_fn(inp)
        else:
            raise ValueError('Cannot recognize {}'.format(norm))
        return out

    def construct_weights(self):
        weights = {}

        weights['embed_w1'] = tf.Variable(
            tf.glorot_uniform_initializer()([1, 4]), name='embed_w1')
        weights['embed_b1'] = tf.Variable(tf.zeros([4]), name='embed_b1')

        # for phase, one-hot
        weights['embed_w2'] = tf.Variable(tf.random_uniform_initializer(
            minval=-0.05, maxval=0.05)([2, 4]),
                                          name='embed_w2')
        #weights['embed_b2'] = tf.Variable(tf.zeros([4]), name='embed_b2')

        # lane embeding
        weights['lane_embed_w3'] = tf.Variable(
            tf.glorot_uniform_initializer()([8, 16]), name='lane_embed_w3')
        weights['lane_embed_b3'] = tf.Variable(tf.zeros([16]),
                                               name='lane_embed_b3')

        # relation embeding, one-hot
        weights['relation_embed_w4'] = tf.Variable(
            tf.random_uniform_initializer(minval=-0.05, maxval=0.05)([2, 4]),
            name='relation_embed_w4')
        #weights['relation_embed_b4'] = tf.Variable(tf.zeros([4]), name='relation_embed_b4')

        weights['feature_conv_w1'] = tf.Variable(
            tf.glorot_uniform_initializer()([1, 1, 32, 20]),
            name='feature_conv_w1')
        weights['feature_conv_b1'] = tf.Variable(tf.zeros([20]),
                                                 name='feature_conv_b1')

        weights['phase_conv_w1'] = tf.Variable(
            tf.glorot_uniform_initializer()([1, 1, 4, 20]),
            name='phase_conv_w1')
        weights['phase_conv_b1'] = tf.Variable(tf.zeros([20]),
                                               name='phase_conv_b1')

        weights['combine_conv_w1'] = tf.Variable(
            tf.glorot_uniform_initializer()([1, 1, 20, 20]),
            name='combine_conv_w1')
        weights['combine_conv_b1'] = tf.Variable(tf.zeros([20]),
                                                 name='combine_conv_b1')

        weights['final_conv_w1'] = tf.Variable(
            tf.glorot_uniform_initializer()([1, 1, 20, 1]),
            name='final_conv_w1')
        weights['final_conv_b1'] = tf.Variable(tf.zeros([1]),
                                               name='final_conv_b1')

        return weights

    def construct_forward(self,
                          inp,
                          reuse,
                          norm,
                          is_train,
                          scope,
                          prefix='fc'):
        # embedding, only for 4 or 8 phase, hard code for lane_num_vehicle + cur_phase
        with tf.variable_scope(scope):
            weights = self.construct_weights()
            dim = int(inp.shape[1].value / 2)
            num_veh = inp[:, :dim]
            num_veh = tf.reshape(num_veh, [-1, 1])

            phase = inp[:, dim:]
            phase = tf.cast(phase, tf.int32)
            phase = tf.one_hot(phase, 2)
            phase = tf.reshape(phase, [-1, 2])

            embed_num_veh = self.contruct_layer(
                tf.matmul(num_veh, weights['embed_w1']) + weights['embed_b1'],
                activation_fn=tf.nn.sigmoid,
                reuse=reuse,
                is_train=is_train,
                norm=norm,
                scope='num_veh_embed.' + prefix)
            embed_num_veh = tf.reshape(embed_num_veh, [-1, dim, 4])

            embed_phase = self.contruct_layer(tf.matmul(
                phase, weights['embed_w2']),
                                              activation_fn=tf.nn.sigmoid,
                                              reuse=reuse,
                                              is_train=is_train,
                                              norm=norm,
                                              scope='phase_embed.' + prefix)
            embed_phase = tf.reshape(embed_phase, [-1, dim, 4])

            dic_lane = {}
            for i, m in enumerate(self.dic_traffic_env_conf["LANE_PHASE_INFO"]
                                  ["start_lane"]):
                dic_lane[m] = tf.concat(
                    [embed_num_veh[:, i, :], embed_phase[:, i, :]], axis=-1)

            list_phase_pressure = []
            phase_startLane_mapping = self.dic_traffic_env_conf[
                "LANE_PHASE_INFO"]["phase_sameStartLane_mapping"]
            for phase in self.dic_traffic_env_conf["LANE_PHASE_INFO"]["phase"]:
                t1 = tf.Variable(tf.zeros(1))
                t2 = tf.Variable(tf.zeros(1))
                for lane in phase_startLane_mapping[phase][0]:
                    t1 += self.contruct_layer(
                        tf.matmul(dic_lane[lane], weights['lane_embed_w3']) +
                        weights['lane_embed_b3'],
                        activation_fn=self._activation_fn,
                        reuse=reuse,
                        is_train=is_train,
                        norm=norm,
                        scope='lane_embed.' + prefix)
                t1 /= len(phase_startLane_mapping[phase][0])

                if len(phase_startLane_mapping[phase]) >= 2:
                    for lane in phase_startLane_mapping[phase][1]:
                        t2 += self.contruct_layer(
                            tf.matmul(dic_lane[lane], weights['lane_embed_w3'])
                            + weights['lane_embed_b3'],
                            activation_fn=self._activation_fn,
                            reuse=reuse,
                            is_train=is_train,
                            norm=norm,
                            scope='lane_embed.' + prefix)
                    t2 /= len(phase_startLane_mapping[phase][1])

                list_phase_pressure.append(t1 + t2)
                # TODO check batch_size here
            constant = relation(self.dic_traffic_env_conf["LANE_PHASE_INFO"])

            constant = tf.one_hot(constant, 2)
            s1, s2 = constant.shape[1:3]
            constant = tf.reshape(constant, (-1, 2))
            relation_embedding = tf.matmul(constant,
                                           weights['relation_embed_w4'])
            relation_embedding = tf.reshape(relation_embedding,
                                            (-1, s1, s2, 4))

            list_phase_pressure_recomb = []
            num_phase = len(list_phase_pressure)

            for i in range(num_phase):
                for j in range(num_phase):
                    if i != j:
                        list_phase_pressure_recomb.append(
                            tf.concat([
                                list_phase_pressure[i], list_phase_pressure[j]
                            ],
                                      axis=-1,
                                      name="concat_compete_phase_%d_%d" %
                                      (i, j)))

            list_phase_pressure_recomb = tf.concat(list_phase_pressure_recomb,
                                                   axis=-1,
                                                   name="concat_all")
            feature_map = tf.reshape(list_phase_pressure_recomb,
                                     (-1, num_phase, num_phase - 1, 32))
            #if num_phase == 8:
            #    feature_map = tf.reshape(list_phase_pressure_recomb, (-1, 8, 7, 32))
            #else:
            #    feature_map = tf.reshape(list_phase_pressure_recomb, (-1, 4, 3, 32))

            lane_conv = tf.nn.conv2d(
                feature_map,
                weights['feature_conv_w1'], [1, 1, 1, 1],
                'VALID',
                name='feature_conv') + weights['feature_conv_b1']
            lane_conv = tf.nn.leaky_relu(lane_conv, name='feature_activation')

            # relation conv layer
            relation_conv = tf.nn.conv2d(
                relation_embedding,
                weights['phase_conv_w1'], [1, 1, 1, 1],
                'VALID',
                name='phase_conv') + weights['phase_conv_b1']
            relation_conv = tf.nn.leaky_relu(relation_conv,
                                             name='phase_activation')
            combine_feature = tf.multiply(lane_conv,
                                          relation_conv,
                                          name="combine_feature")

            # second conv layer
            hidden_layer = tf.nn.conv2d(combine_feature, weights['combine_conv_w1'], [1, 1, 1, 1], 'VALID', name='combine_conv') + \
                        weights['combine_conv_b1']
            hidden_layer = tf.nn.leaky_relu(hidden_layer,
                                            name='combine_activation')

            before_merge = tf.nn.conv2d(hidden_layer, weights['final_conv_w1'], [1, 1, 1, 1], 'VALID',
                                        name='final_conv') + \
                           weights['final_conv_b1']

            before_merge = tf.nn.leaky_relu(before_merge,
                                            name='combine_activation')

            #if self.num_actions == 8:
            #    _shape = (-1, 8, 7)
            #else:
            #    _shape = (-1, 4, 3)
            _shape = (-1, 8, 7)
            before_merge = tf.reshape(before_merge, _shape)
            out = tf.reduce_sum(before_merge, axis=2)

        return out
Example #7
class DQfD:
    def __init__(self, env, config, demo_transitions=None):
        self.sess = tf.InteractiveSession()
        self.config = config
        # replay_memory stores both demo data and generated data, while demo_memory only store demo data
        self.replay_memory = Memory(capacity=self.config.replay_buffer_size,
                                    permanent_data=len(demo_transitions))
        self.demo_memory = Memory(capacity=self.config.demo_buffer_size,
                                  permanent_data=self.config.demo_buffer_size)
        self.add_demo_to_memory(
            demo_transitions=demo_transitions
        )  # add demo data to both demo_memory & replay_memory
        self.time_step = 0
        self.epsilon = self.config.INITIAL_EPSILON
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.action_batch = tf.placeholder("int32", [None])
        self.y_input = tf.placeholder("float", [None, self.action_dim])
        self.ISWeights = tf.placeholder("float", [None, 1])
        self.n_step_y_input = tf.placeholder(
            "float", [None, self.action_dim])  # for n-step reward
        self.isdemo = tf.placeholder("float", [None])
        self.eval_input = tf.placeholder("float", [None, self.state_dim])
        self.select_input = tf.placeholder("float", [None, self.state_dim])

        # Touch the lazy properties so their graph ops are created before initialization
        self.Q_eval
        self.Q_select

        self.loss
        self.optimize
        self.update_target_net
        self.abs_errors

        self.saver = tf.train.Saver()

        self.sess.run(tf.global_variables_initializer())

        self.save_model()
        self.restore_model()

    def add_demo_to_memory(self, demo_transitions):
        # add demo data to both demo_memory & replay_memory
        for t in demo_transitions:
            self.demo_memory.store(np.array(t, dtype=object))
            self.replay_memory.store(np.array(t, dtype=object))
            assert len(t) == 10

    # use the expert-demo-data to pretrain
    def pre_train(self):
        print('Pre-training ...')
        for i in range(self.config.PRETRAIN_STEPS):
            self.train_Q_network(pre_train=True)
            if i % 200 == 0 and i > 0:
                print('{} th step of pre-train finish ...'.format(i))
        self.time_step = 0
        print('All pre-train finish.')

    # TODO: How to add the variables created by tf.layers.dense to a custom collection?
    # def build_layers(self, state, collections, units_1, units_2, w_i, b_i, regularizer=None):
    #     with tf.variable_scope('dese1'):
    #         dense1 = tf.layers.dense(tf.contrib.layers.flatten(state), activation=tf.nn.relu, units=units_1,
    #                                  kernel_initializer=w_i, bias_initializer=b_i,
    #                                  kernel_regularizer=regularizer, bias_regularizer=regularizer)
    #     with tf.variable_scope('dens2'):
    #         dense2 = tf.layers.dense(dense1, activation=tf.nn.relu, units=units_2,
    #                                  kernel_initializer=w_i, bias_initializer=b_i,
    #                                  kernel_regularizer=regularizer, bias_regularizer=regularizer)
    #     with tf.variable_scope('dene3'):
    #         dense3 = tf.layers.dense(dense2, activation=tf.nn.relu, units=self.action_dim,
    #                                  kernel_initializer=w_i, bias_initializer=b_i,
    #                                  kernel_regularizer=regularizer, bias_regularizer=regularizer)
    #     return dense3

    def build_layers(self,
                     state,
                     c_names,
                     units_1,
                     units_2,
                     w_i,
                     b_i,
                     reg=None):
        a_d = self.action_dim
        with tf.variable_scope('l1'):
            w1 = tf.get_variable('w1', [self.state_dim, units_1],
                                 initializer=w_i,
                                 collections=c_names,
                                 regularizer=reg)
            b1 = tf.get_variable('b1', [1, units_1],
                                 initializer=b_i,
                                 collections=c_names,
                                 regularizer=reg)
            dense1 = tf.nn.relu(tf.matmul(state, w1) + b1)
        with tf.variable_scope('l2'):
            w2 = tf.get_variable('w2', [units_1, units_2],
                                 initializer=w_i,
                                 collections=c_names,
                                 regularizer=reg)
            b2 = tf.get_variable('b2', [1, units_2],
                                 initializer=b_i,
                                 collections=c_names,
                                 regularizer=reg)
            dense2 = tf.nn.relu(tf.matmul(dense1, w2) + b2)
        with tf.variable_scope('l3'):
            w3 = tf.get_variable('w3', [units_2, a_d],
                                 initializer=w_i,
                                 collections=c_names,
                                 regularizer=reg)
            b3 = tf.get_variable('b3', [1, a_d],
                                 initializer=b_i,
                                 collections=c_names,
                                 regularizer=reg)
            dense3 = tf.matmul(dense2, w3) + b3
        return dense3

    @lazy_property
    def Q_select(self):
        with tf.variable_scope('select_net') as scope:
            c_names = ['select_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            w_i = tf.random_uniform_initializer(-0.1, 0.1)
            b_i = tf.constant_initializer(0.1)
            reg = tf.contrib.layers.l2_regularizer(
                scale=0.2)  # Note: only parameters in select-net need L2
            return self.build_layers(self.select_input, c_names, 24, 24, w_i,
                                     b_i, reg)

    @lazy_property
    def Q_eval(self):
        with tf.variable_scope('eval_net') as scope:
            c_names = ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            w_i = tf.random_uniform_initializer(-0.1, 0.1)
            b_i = tf.constant_initializer(0.1)
            return self.build_layers(self.eval_input, c_names, 24, 24, w_i,
                                     b_i)

    def loss_l(self, ae, a):
        return 0.0 if ae == a else 0.8

    def loss_jeq(self, Q_select):
        jeq = 0.0
        for i in range(self.config.BATCH_SIZE):
            ae = self.action_batch[i]
            max_value = float("-inf")
            for a in range(self.action_dim):
                max_value = tf.maximum(Q_select[i][a] + self.loss_l(ae, a),
                                       max_value)
            jeq += self.isdemo[i] * (max_value - Q_select[i][ae])
        return jeq

    @lazy_property
    def loss(self):
        l_dq = tf.reduce_mean(
            tf.squared_difference(self.Q_select, self.y_input))
        l_n_dq = tf.reduce_mean(
            tf.squared_difference(self.Q_select, self.n_step_y_input))
        # l_n_step_dq = self.loss_n_step_dq(self.Q_select, self.n_step_y_input)
        l_jeq = self.loss_jeq(self.Q_select)
        l_l2 = tf.reduce_sum([
            tf.reduce_mean(reg_l)
            for reg_l in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        ])
        return self.ISWeights * tf.reduce_sum([
            l * λ
            for l, λ in zip([l_dq, l_n_dq, l_jeq, l_l2], self.config.LAMBDA)
        ])

    @lazy_property
    def abs_errors(self):
        return tf.reduce_sum(tf.abs(self.y_input - self.Q_select),
                             axis=1)  # only use 1-step R to compute abs_errors

    @lazy_property
    def optimize(self):
        optimizer = tf.train.AdamOptimizer(self.config.LEARNING_RATE)
        return optimizer.minimize(
            self.loss)  # only parameters in select-net is optimized here

    @lazy_property
    def update_target_net(self):
        select_params = tf.get_collection('select_net_params')
        eval_params = tf.get_collection('eval_net_params')
        return [tf.assign(e, s) for e, s in zip(eval_params, select_params)]

    def save_model(self):
        print("Model saved in : {}".format(
            self.saver.save(self.sess, self.config.MODEL_PATH)))

    def restore_model(self):
        self.saver.restore(self.sess, self.config.MODEL_PATH)
        print("Model restored.")

    def perceive(self, transition):
        self.replay_memory.store(np.array(transition))
        # epsilon->FINAL_EPSILON(min_epsilon)
        if self.replay_memory.full():
            self.epsilon = max(self.config.FINAL_EPSILON,
                               self.epsilon * self.config.EPSILIN_DECAY)

    def train_Q_network(self, pre_train=False, update=True):
        """
        :param pre_train: True means should sample from demo_buffer instead of replay_buffer
        :param update: True means the action "update_target_net" executes outside, and can be ignored in the function
        """
        if not pre_train and not self.replay_memory.full(
        ):  # sampling should be executed AFTER replay_memory filled
            return
        self.time_step += 1

        assert self.replay_memory.full() or pre_train

        actual_memory = self.demo_memory if pre_train else self.replay_memory
        tree_idxes, minibatch, ISWeights = actual_memory.sample(
            self.config.BATCH_SIZE)

        np.random.shuffle(minibatch)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]
        done_batch = [data[4] for data in minibatch]
        demo_data = [data[5] for data in minibatch]
        n_step_reward_batch = [data[6] for data in minibatch]
        n_step_state_batch = [data[7] for data in minibatch]
        n_step_done_batch = [data[8] for data in minibatch]
        actual_n = [data[9] for data in minibatch]

        # compute the quantities fed into the placeholders first
        Q_select = self.Q_select.eval(
            feed_dict={self.select_input: next_state_batch})
        Q_eval = self.Q_eval.eval(
            feed_dict={self.eval_input: next_state_batch})
        n_step_Q_select = self.Q_select.eval(
            feed_dict={self.select_input: n_step_state_batch})
        n_step_Q_eval = self.Q_eval.eval(
            feed_dict={self.eval_input: n_step_state_batch})

        y_batch = np.zeros((self.config.BATCH_SIZE, self.action_dim))
        n_step_y_batch = np.zeros((self.config.BATCH_SIZE, self.action_dim))
        for i in range(self.config.BATCH_SIZE):
            # state, action, reward, next_state, done, demo_data, n_step_reward, n_step_state, n_step_done = t
            temp = self.Q_select.eval(
                feed_dict={
                    self.select_input: state_batch[i].reshape((-1,
                                                               self.state_dim))
                })[0]
            temp_0 = np.copy(temp)
            # add 1-step reward
            action = np.argmax(Q_select[i])
            temp[action_batch[i]] = reward_batch[i] + (
                1 - int(done_batch[i])) * self.config.GAMMA * Q_eval[i][action]
            y_batch[i] = temp
            # add n-step reward
            action = np.argmax(n_step_Q_select[i])
            q_n_step = (
                1 - int(n_step_done_batch[i])
            ) * self.config.GAMMA**actual_n[i] * n_step_Q_eval[i][action]
            temp_0[action_batch[i]] = n_step_reward_batch[i] + q_n_step
            n_step_y_batch[i] = temp_0

        _, abs_errors = self.sess.run(
            [self.optimize, self.abs_errors],
            feed_dict={
                self.y_input: y_batch,
                self.n_step_y_input: n_step_y_batch,
                self.select_input: state_batch,
                self.action_batch: action_batch,
                self.isdemo: demo_data,
                self.ISWeights: ISWeights
            })

        self.replay_memory.batch_update(
            tree_idxes, abs_errors)  # update priorities for data in memory

        # In this example an episode has a limited number of steps, so the target-net
        # update can be driven externally after an episode ends; when update is False
        # it is controlled outside this function.
        if update and self.time_step % self.config.UPDATE_TARGET_NET == 0:
            self.sess.run(self.update_target_net)

    def egreedy_action(self, state):
        if random.random() <= self.epsilon:
            return random.randint(0, self.action_dim - 1)
        return np.argmax(
            self.Q_select.eval(feed_dict={self.select_input: [state]})[0])
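# The DQfD class above samples 10-element transitions in the order
# (state, action, reward, next_state, done, is_demo, n_step_reward,
# n_step_state, n_step_done, actual_n), matching what train_Q_network()
# unpacks. The helper below is a sketch of how such a transition might be
# assembled from an episode buffer; the function name, the n_step value and
# the trajectory format are assumptions for illustration.
import numpy as np


def make_transition(trajectory, t, n_step, gamma, is_demo):
    # trajectory: list of (state, action, reward, next_state, done) tuples
    state, action, reward, next_state, done = trajectory[t]
    # Accumulate the discounted reward over at most n_step following steps,
    # stopping early if the episode ends.
    n_step_reward, actual_n = 0.0, 0
    for k in range(n_step):
        if t + k >= len(trajectory):
            break
        n_step_reward += (gamma ** k) * trajectory[t + k][2]
        actual_n = k + 1
        if trajectory[t + k][4]:  # done flag
            break
    last = trajectory[t + actual_n - 1]
    n_step_state, n_step_done = last[3], last[4]  # state reached after actual_n steps
    return np.array([state, action, reward, next_state, done,
                     float(is_demo), n_step_reward, n_step_state,
                     n_step_done, actual_n], dtype=object)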