Example #1
class Agent(object):
    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.__policy = DQN(action_dim, device).to(device)
        self.__target = DQN(action_dim, device).to(device)
        if restore is None:
            self.__policy.apply(DQN.init_weights)
        else:
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()

    def run(self,
            state: TensorStack4,
            training: bool = False,
            preaction: int = 0) -> int:
        """run suggests an action for the given state."""
        if training:
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:
            with torch.no_grad():
                return self.__policy(state).max(1).indices.item()
        if self.__r.random() > 0.15:
            return preaction
        return self.__r.randint(0, self.__action_dim - 1)

    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        state_batch, action_batch, reward_batch, next_batch, done_batch = \
            memory.sample(batch_size)

        values = self.__policy(state_batch.float()).gather(1, action_batch)
        values_next = self.__target(next_batch.float()).max(1).values.detach()
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch
        loss = F.smooth_l1_loss(values, expected)

        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target
        network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
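None of the examples show how this Agent is driven. A minimal training-loop sketch, assuming an Atari-style env with the classic reset/step API, a make_state helper that stacks frames into a TensorStack4, and a ReplayMemory with a push method; every constant is illustrative:

import torch

# Hypothetical driver; env, make_state and ReplayMemory.push are assumptions.
agent = Agent(action_dim=4, device=torch.device("cuda"), gamma=0.99, seed=0,
              eps_start=1.0, eps_final=0.1, eps_decay=1_000_000)
memory = ReplayMemory(capacity=100_000)                   # assumed constructor
state = make_state(env.reset())                           # assumed: returns a TensorStack4

for step in range(10_000_000):
    action = agent.run(state, training=True)
    obs, reward, done, _ = env.step(action)
    next_state = make_state(obs)
    memory.push(state, action, reward, next_state, done)  # assumed signature
    state = make_state(env.reset()) if done else next_state

    if step % 4 == 0 and step > 50_000:                   # warm-up before learning
        agent.learn(memory, batch_size=32)
    if step % 10_000 == 0:
        agent.sync()                                       # refresh the target network
    if step % 1_000_000 == 0:
        agent.save(f"dqn_{step}.pth")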
Example #2
class Agent(object):  # the agent for the bouncing-paddle (Breakout-style) game
    def __init__(
            self,
            action_dim: int,
            device: TorchDevice,
            gamma: float,
            seed: int,
            eps_start: float,
            eps_final: float,
            eps_decay: float,
            restore: Optional[str] = None,  # restore defaults to None
    ) -> None:
        self.__action_dim = action_dim  # set the model's initial parameters
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.__policy = DQN(action_dim, device).to(device)  # can be placed on either CPU or GPU
        self.__target = DQN(action_dim, device).to(device)
        if restore is None:
            self.__policy.apply(DQN.init_weights)  # use DQN's default weight initialization
        else:
            self.__policy.load_state_dict(
                torch.load(restore))  # otherwise load the weights saved in restore
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(  # optimizer
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()

    # choose actions with an epsilon-greedy policy
    def run(self, state: TensorStack4, training: bool = False) -> int:
        """run suggests an action for the given state."""
        if training:
            # linearly anneal epsilon from eps_start towards eps_final
            # (this is the exploration schedule, not a learning-rate adjustment)
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        # with probability 1 - eps, pick the action with the highest Q-value from the policy network
        if self.__r.random() > self.__eps:
            with torch.no_grad():
                return self.__policy(state).max(1).indices.item()
        # otherwise pick a random action
        return self.__r.randint(0, self.__action_dim - 1)

    # sample state/action/reward/next transitions from memory to train the network
    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        # sample a minibatch from the replay buffer to update the policy network
        state_batch, action_batch, reward_batch, next_batch, done_batch = \
            memory.sample(batch_size)   # all variables come back as tensors
        # compute the value function Q_j with the behavior (policy) network
        values = self.__policy(state_batch.float()).gather(
            1, action_batch)  # Q_j for each (state, action) pair in the batch

        # compute Q_{j+1} with the target network: expected = r_{j+1} + gamma * max_a' Q_{j+1}
        # (1 - done_batch) handles terminal transitions, where expected degenerates to r_{j+1}
        # this is the Q-learning update target, evaluated with the target network
        values_next = self.__target(next_batch.float()).max(1).values.detach()
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch

        # gradient step on the discrepancy between Q_j and expected (smooth L1 loss)
        loss = F.smooth_l1_loss(values, expected)

        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    # synchronize the target network with the policy (behavior) network
    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target
        network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    # save the policy network
    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
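The decrement inside run() is a linear anneal of the exploration rate, not a learning-rate schedule: each training call subtracts (eps_start - eps_final) / eps_decay, so epsilon reaches eps_final after roughly eps_decay calls. The same schedule as a standalone function, for reference:

def linear_eps(step: int, eps_start: float = 1.0,
               eps_final: float = 0.1, eps_decay: float = 1_000_000) -> float:
    """Exploration rate after `step` training calls, matching Agent.run above."""
    eps = eps_start - step * (eps_start - eps_final) / eps_decay
    return max(eps, eps_final)

# linear_eps(0) -> 1.0, linear_eps(500_000) -> about 0.55, linear_eps(2_000_000) -> 0.1 (floor)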
Example #3
class Agent(object):
    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.__policy = DQN(action_dim, device).to(device)
        self.__target = DQN(action_dim, device).to(device)
        if restore is None:
            self.__policy.apply(DQN.init_weights)
        else:
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()

    def run(self, state: TensorStack4,
            training: bool = False) -> Tuple[int, torch.Tensor]:
        """run suggests an action for the given state."""
        if training:
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:
            with torch.no_grad():
                action = self.__policy(state).max(1).indices.item()
        else:
            action = self.__r.randint(0, self.__action_dim - 1)
        value_this = self.__policy(state)[0][action]
        return action, value_this  # the chosen action and its Q-value

    def get_target_value(self, state):
        """get_target_value returns max_a Q_target(state, a)."""
        with torch.no_grad():
            return self.__target(state).max(1).values.item()

    def learn(self, memory, batch_size: int, c: int) -> float:
        """learn trains the value network via TD-learning; c selects plain (0)
        or prioritized (non-zero) replay."""
        if c == 0:
            state_batch, action_batch, reward_batch, next_batch, done_batch = \
                memory.sample(batch_size)
        else:  # the prioritized memory also returns the sampled indices
            state_batch, action_batch, reward_batch, next_batch, done_batch, idx_batch = \
                memory.sample(batch_size)

        values = self.__policy(state_batch.float()).gather(
            1, action_batch)  # pick the Q-value of the taken action in each row
        values_next = self.__target(
            next_batch.float()).max(1).values.detach()  # max_a Q_target(s', a)
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch
        # TD loss between values and expected
        if c == 0:
            loss = F.smooth_l1_loss(values, expected)
        else:
            loss_batch = F.smooth_l1_loss(values, expected,
                                          reduction='none')  # per-sample TD error
            loss = torch.mean(loss_batch, dim=0)
            memory.update(loss_batch.detach(), idx_batch)  # refresh the sampled priorities

        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():  # clamp each gradient to [-1, 1] in place
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target
        network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
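The c != 0 branch above (like the choice branch in Example #5 further down) expects a prioritized replay memory whose sample() also returns the sampled indices and which exposes update() for refreshing priorities. That class is not shown anywhere in these examples; below is a minimal proportional-prioritization sketch of the assumed interface (no sum-tree, no importance-sampling weights, and the transition layout of five tensors per entry is an assumption):

import numpy as np
import torch

class NaivePERMemory:
    """Toy proportional PER exposing the sample()/update() interface used above.
    Each stored transition is a (state, action, reward, next_state, done) tuple of tensors."""

    def __init__(self, capacity: int, eps: float = 1e-2, alpha: float = 0.6):
        self.capacity, self.eps, self.alpha = capacity, eps, alpha
        self.data, self.priorities = [], []

    def push(self, transition) -> None:
        if len(self.data) >= self.capacity:                 # drop the oldest entry
            self.data.pop(0)
            self.priorities.pop(0)
        self.data.append(transition)
        self.priorities.append(max(self.priorities, default=1.0))  # new samples get max priority

    def sample(self, batch_size: int):
        probs = np.asarray(self.priorities) ** self.alpha
        probs /= probs.sum()
        idx = np.random.choice(len(self.data), batch_size, p=probs)
        columns = list(zip(*(self.data[i] for i in idx)))   # transpose the batch
        state, action, reward, nxt, done = (torch.stack(c) for c in columns)
        return state, action, reward, nxt, done, torch.as_tensor(idx)

    def update(self, td_errors: torch.Tensor, idx_batch: torch.Tensor) -> None:
        for err, i in zip(td_errors.flatten().tolist(), idx_batch.tolist()):
            self.priorities[int(i)] = abs(err) + self.eps   # keep every priority non-zero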
Example #4
class Agent(object):  # agent configuration
    def __init__(
        self,
        action_dim: int,  # 3
        device: TorchDevice,  # cuda
        gamma: float,  # 0.99
        seed: int,
        eps_start: float,  # 1
        eps_final: float,  # 0.1
        eps_decay: float,  # 10000
        restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim  # 3
        self.__device = device
        self.__gamma = gamma  # 0.99

        self.__eps_start = eps_start  # 1
        self.__eps_final = eps_final  # 0.1
        self.__eps_decay = eps_decay  # 1e6

        self.__eps = eps_start  # 1
        self.__r = random.Random()
        self.__r.seed(seed)

        self.__policy = DQN(action_dim, device).to(device)  # policy DQN
        # target DQN, which decorrelates the learning target from the current estimate
        self.__target = DQN(action_dim, device).to(device)

        if restore is None: self.__policy.apply(DQN.init_weights)
        else: self.__policy.load_state_dict(torch.load(restore))

        self.__target.load_state_dict(
            self.__policy.state_dict())  # copy the policy weights into the target network
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )  # optimizer
        self.__target.eval()  # evaluation mode

    def run(self, state: TensorStack4, training: bool = False) -> int:
        """run suggests an action for the given state."""
        if training:  # gradually decay eps
            self.__eps -= (self.__eps_start -
                           self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:  # with probability 1 - eps, take the greedy action
            with torch.no_grad():
                return self.__policy(state).max(1).indices.item()
        return self.__r.randint(0, self.__action_dim - 1)  # with probability eps, act randomly

    def learn(self, memory: ReplayMemory, batch_size: int) -> float:  # training step
        """learn trains the value network via TD-learning."""
        state_batch, action_batch, reward_batch, next_batch, done_batch = memory.sample(
            batch_size)  # sample a minibatch of transitions from the replay buffer
        values = self.__policy(state_batch.float()).gather(
            1, action_batch)  # Q(S, A) from the policy network
        values_next = self.__target(
            next_batch.float()).max(1).values.detach()  # max_a Q(S', a)
        expected = (self.__gamma * values_next.unsqueeze(1)) * (
            1. - done_batch) + reward_batch  # Q-learning target for Q(S, A)
        loss = F.smooth_l1_loss(values, expected)  # compute the loss
        # the usual three steps to update the network parameters
        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()  # return the loss

    def sync(self) -> None:  # copy the policy weights into the target network
        """sync synchronizes the weights from the policy network to the target
        network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    def save(self, path: str) -> None:  # save the model
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
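The expected tensor in learn() is the one-step TD target y = r + gamma * max_a Q_target(S', a), with (1 - done_batch) zeroing the bootstrap term on terminal transitions. A tiny numeric check of that masking (the values are made up):

import torch

gamma = 0.99
reward = torch.tensor([[1.0], [1.0]])
values_next = torch.tensor([5.0, 5.0])       # max_a Q_target(S', a) per sample
done = torch.tensor([[0.0], [1.0]])          # the second transition is terminal

expected = gamma * values_next.unsqueeze(1) * (1.0 - done) + reward
print(expected)  # roughly [[5.95], [1.00]]: non-terminal bootstraps, terminal keeps only r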
Example #5
class Agent(object):  # agent with optional dueling network and prioritized replay

    def __init__(
            self,
            action_dim: int,
            device: TorchDevice,
            gamma: float,
            seed: int,

            eps_start: float,
            eps_final: float,
            eps_decay: float,

            isdueling: bool = False,
            restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim  # action dimension
        self.__device = device  # device
        self.__gamma = gamma    # discount factor

        self.__eps_start = eps_start    # initial value of the eps-greedy parameter
        self.__eps_final = eps_final    # final value of the eps-greedy parameter
        self.__eps_decay = eps_decay    # decay rate of the eps-greedy parameter

        self.__eps = eps_start
        self.__r = random.Random()      # random number generator
        self.__r.seed(seed)     # random seed

        # --- modification: optional Dueling DQN ---
        if isdueling:   # use the DuelingDQN network
            self.__policy = DuelingDQN(action_dim, device).to(device)  # value network
            self.__target = DuelingDQN(action_dim, device).to(device)  # target network
        else:
            self.__policy = DQN(action_dim, device).to(device)  # value network
            self.__target = DQN(action_dim, device).to(device)  # target network

        if restore is None:
            if isdueling:
                self.__policy.apply(DuelingDQN.init_weights)  # initialize weights
            else:
                self.__policy.apply(DQN.init_weights)  # initialize weights
        # --- end modification ---
        else:
            self.__policy.load_state_dict(torch.load(restore))    # load the saved weights from restore

        self.__target.load_state_dict(self.__policy.state_dict())  # copy policy weights into target so both networks start identical
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()

    def run(self, state: TensorStack4,
            training: bool = False) -> Tuple[int, torch.Tensor]:  # epsilon-greedy action selection
        """run suggests an action for the given state."""
        if training:        # linearly decay eps
            self.__eps -= (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:      # if the random draw exceeds eps, take the greedy (max-Q) action
            with torch.no_grad():
                action = self.__policy(state).max(1).indices.item()
        else:
            action = self.__r.randint(0, self.__action_dim - 1)
        value_this = self.__policy(state)[0][action]
        return action, value_this  # the chosen action and its Q-value

    def get_target_value(self, state):
        """get_target_value returns max_a Q_target(state, a)."""
        with torch.no_grad():
            return self.__target(state).max(1).values.item()

    def learn(self, memory: ReplayMemory, batch_size: int, choice: int) -> float:
        """learn trains the value network via TD-learning."""

        # --- modification: optional prioritized replay ---
        if choice == 0:   # plain replay memory
            state_batch, action_batch, reward_batch, next_batch, done_batch = memory.sample(batch_size)
        else:       # the prioritized memory also returns the sampled indices
            state_batch, action_batch, reward_batch, next_batch, done_batch, idx_batch = \
                memory.sample(batch_size)

        values = self.__policy(state_batch.float()).gather(1, action_batch)  # pick the Q-value of the taken action in each row
        values_next = self.__target(next_batch.float()).max(1).values.detach()  # max_a Q_target(s', a)
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
                   (1. - done_batch) + reward_batch
        # TD loss between values and expected
        if choice == 0:
            loss = F.smooth_l1_loss(values, expected)
        else:       # prioritized replay
            loss_batch = F.smooth_l1_loss(values, expected, reduction='none')  # per-sample TD error
            loss = torch.mean(loss_batch, dim=0)
            memory.update(loss_batch.detach(), idx_batch)  # refresh the sampled priorities
        # --- end modification ---

        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():  # clamp each gradient to [-1, 1] in place
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target
        network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
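How the isdueling constructor flag and the choice argument of learn() are meant to be driven is not shown. One plausible wiring, where PERMemory and ReplayMemory stand in for the (unshown) replay classes, state is a stacked-frame tensor as in the other examples, and every constant is illustrative:

import torch

use_dueling, use_per = True, True
agent = Agent(action_dim=3, device=torch.device("cuda"), gamma=0.99, seed=0,
              eps_start=1.0, eps_final=0.1, eps_decay=1_000_000,
              isdueling=use_dueling)
memory = PERMemory(100_000) if use_per else ReplayMemory(100_000)  # assumed constructors

action, q_value = agent.run(state, training=True)   # run() returns the action and its Q-value
loss = agent.learn(memory, batch_size=32, choice=1 if use_per else 0)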
Example #6
class Agent(object):
    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        # move the networks to the chosen device (e.g. the GPU); all subsequent computation happens there
        self.__policy = DQN(action_dim, device).to(device)  # policy network
        self.__target = DQN(action_dim, device).to(device)  # target network
        if restore is None:
            self.__policy.apply(DQN.init_weights)  # custom weight initialization for the policy network
        else:
            self.__policy.load_state_dict(
                torch.load(restore))  # load previously learned weights into the policy network
        self.__target.load_state_dict(
            self.__policy.state_dict())  # the target network copies the policy's weights
        self.__optimizer = optim.Adam(  # Adam optimizer
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()  # evaluation mode, so BatchNorm/Dropout do not disturb the target values

    # epsilon-greedy
    def run(self, state: TensorStack4, training: bool = False) -> int:
        """run suggests an action for the given state."""
        if training:
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:
            with torch.no_grad():
                return self.__policy(state).max(1).indices.item()
        return self.__r.randint(0, self.__action_dim - 1)

    def learn(self, memory: Experience, step: int) -> float:
        """learn trains the value network via TD-learning."""
        state_batch, action_batch, reward_batch, next_batch, done_batch, w, rank_e_id = \
            memory.sample(step)  # sample from the prioritized memory; state is the first 4 of 5 stacked frames, next is the last 4

        values = self.__policy(state_batch.float()).gather(1, action_batch)
        # Nature DQN target (two networks, but not Double DQN):
        # values_next = self.__target(next_batch.float()).max(1).values.detach()
        values_next = self.__target(next_batch.float()).gather(
            1,
            self.__policy(next_batch.float()).max(1).indices.unsqueeze(
                1)).detach()  # Double DQN: the policy net selects, the target net evaluates
        reward_batch[action_batch == 0] += 0.1  # reward shaping: small bonus whenever action 0 was taken ("stable reward")
        expected = (self.__gamma * values_next) * \
                   (1. - done_batch) + reward_batch  # if done, the target is just r (no t+1 step); otherwise r + gamma * Q

        td_error = (expected - values).detach()
        memory.update_priority(rank_e_id, td_error.cpu().numpy())

        values = values.mul(w)      # apply the per-sample weights w returned by memory.sample
        expected = expected.mul(w)
        loss = F.smooth_l1_loss(values, expected)  # smooth L1 loss

        self.__optimizer.zero_grad()  # zero the parameter gradients
        loss.backward()  # backpropagate; gradients are stored on the policy parameters
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)  # clamp all gradients to [-1, 1]
        self.__optimizer.step()  # take one optimization step

        return loss.item()

    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target
        network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
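Example #6 changes the target from Nature DQN (the target network both selects and evaluates the next action) to Double DQN (the policy network selects, the target network evaluates), which reduces the over-estimation bias of the max operator. The two targets side by side, where policy, target and next_batch stand for self.__policy, self.__target and the sampled batch in the code above:

import torch

with torch.no_grad():
    # Nature DQN: max over the target network's own estimates
    nature_next = target(next_batch.float()).max(1).values.unsqueeze(1)
    # Double DQN: argmax from the policy network, value read from the target network
    best_actions = policy(next_batch.float()).max(1).indices.unsqueeze(1)
    double_next = target(next_batch.float()).gather(1, best_actions)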
Example #7
class Agent(object):
    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim  # number of available actions
        self.__device = device  # device
        self.__gamma = gamma  # discount factor gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.__policy = DQN(action_dim, device).to(device)  # policy network
        self.__target = DQN(action_dim, device).to(device)  # target network

        if restore is None:
            self.__policy.apply(DQN.init_weights)
        else:
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()

    def run(self,
            state: TensorStack4,
            training: bool = False) -> int:  # returns an action
        """run suggests an action for the given state."""
        if training:
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:
            with torch.no_grad():
                return self.__policy(state).max(1).indices.item()
        return self.__r.randint(0, self.__action_dim - 1)

    def learn(self, memory: ReplayMemory, batch_size: int) -> float:  # returns the loss value
        """learn trains the value network via TD-learning."""
        state_batch, action_batch, reward_batch, next_batch, done_batch = \
            memory.sample(batch_size)  # sample a random minibatch
        # SGD assumes (roughly) i.i.d. training data, but consecutive transitions from the
        # environment are highly correlated, so the replay buffer is sampled at random
        values = self.__policy(state_batch.float()).gather(
            1, action_batch)  # Q(s, a) for the taken actions

        # Dueling DQN idea: split Q(s, a) = V(s) + A(s, a), where V(s) is the value of the
        # state itself and A(s, a) the advantage of each action; V(s) is a scalar per state
        # and broadcasts to A's shape when the two are added.

        values_next = self.__target(
            next_batch.float()).max(1).values.detach()  # max_a Q_target(s', a)
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch  # if done, y_j = r_j; otherwise y_j = r_j + gamma * max_a Q_target (as in the DQN paper)
        loss = F.smooth_l1_loss(values, expected)  # loss function

        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target
        network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
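The comments in learn() describe the dueling decomposition Q(s, a) = V(s) + A(s, a), but no DuelingDQN class appears in these examples. A minimal sketch of such a head (layer sizes are assumptions; subtracting the mean advantage is the usual way to make the split identifiable):

import torch
import torch.nn as nn

class DuelingHead(nn.Module):
    """Maps a feature vector to Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)."""
    def __init__(self, feature_dim: int, action_dim: int):
        super().__init__()
        self.value = nn.Sequential(nn.Linear(feature_dim, 512), nn.ReLU(),
                                   nn.Linear(512, 1))
        self.advantage = nn.Sequential(nn.Linear(feature_dim, 512), nn.ReLU(),
                                       nn.Linear(512, action_dim))

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        v = self.value(features)                       # [B, 1], broadcasts over actions
        a = self.advantage(features)                   # [B, action_dim]
        return v + a - a.mean(dim=1, keepdim=True)     # identifiable decomposition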
Example #8
class Agent(object):

    def __init__(
            self,
            action_dim: int,
            device: TorchDevice,
            gamma: float,
            seed: int,

            eps_start: float,
            eps_final: float,
            eps_decay: float,

            dueling: bool,
            restore: Optional[str] = None,
            stable_arg = 0.1,
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)
        
        self.__stable_arg = stable_arg

        if dueling:
            self.__policy = DuelingDQN(action_dim, device).to(device)
            self.__target = DuelingDQN(action_dim, device).to(device)
        else:
            self.__policy = DQN(action_dim, device).to(device)
            self.__target = DQN(action_dim, device).to(device)
        if restore is None:
            self.__policy.apply(DQN.init_weights)
        else:
            #if dueling:
            #self.__policy.Convs_load(restore)
            #else:
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()

    def run(self, state: TensorStack4,
            training: bool = False) -> Tuple[int, torch.Tensor]:  # epsilon-greedy policy
        """run suggests an action for the given state."""
        if training:
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:
            with torch.no_grad():
                action = self.__policy(state).max(1).indices.item()
        else:
            action = self.__r.randint(0, self.__action_dim - 1)
        value_this = self.__policy(state)[0][action]
        return action, value_this

    def get_target_value(self, state):
        """get_target_value returns max_a Q_target(state, a)."""
        with torch.no_grad():
            return self.__target(state).max(1).values.item()

    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value (Q) network via TD-learning."""
        state_batch, action_batch, reward_batch, \
            next_batch, done_batch, idx_batch = memory.sample(batch_size)

        values = self.__policy(state_batch.float()).gather(1, action_batch)
        values_next = self.__target(next_batch.float()).max(1).values.detach()
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch                                        # TD target
        loss_batch = F.smooth_l1_loss(values, expected, reduction='none')           # per-sample TD error
        loss = torch.mean(loss_batch, dim=0)
        memory.update(loss_batch.detach(), idx_batch)  # refresh the sampled priorities
        
        self.__optimizer.zero_grad()
        loss.backward()                                                             #backward
        for param in self.__policy.parameters():
            if param.grad is not None:  # clamp gradients to (-1, 1)
                param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()                                                     #update
        
        return loss.item()

    def stable_learn(self, folded_state, action, reward, done):
        """stable_learn combines the TD error with a penalty on action oscillation."""
        state = folded_state[:4]        # first 4 of the 5 stacked frames
        state_next = folded_state[1:]   # last 4 of the 5 stacked frames
        value_next = self.get_target_value(state_next)
        value_now = self.__policy(state.float()).gather(1, action)
        td_target = self.__gamma * value_next * (1. - done) + reward
        td_error = F.smooth_l1_loss(value_now, td_target)
        # count adjacent action pairs whose product is 2 (a 1<->2 reversal)
        stable_loss = torch.zeros(1).float()
        for i in [1, 2]:
            for j in [i - 1, i + 1]:
                stable_loss += 1 * (action[j] * action[i] == 2)
        stable_loss -= 1 * (action[1] * action[2] == 2)  # the middle pair was counted twice
        loss = td_error + self.__stable_arg * stable_loss
        self.__optimizer.zero_grad()
        loss.backward()                                                             # backward
        for param in self.__policy.parameters():
            if param.grad is not None:  # clamp gradients to (-1, 1)
                param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()
        return loss.item()
    
    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target
        network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)

    def load(self, path: str) -> None:
        """load restores the state dict of the policy and target networks."""
        state_dict = torch.load(path)
        self.__policy.load_state_dict(state_dict)
        self.__target.load_state_dict(state_dict)
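The stable_loss term in stable_learn counts, over a window of four consecutive actions, adjacent pairs whose product equals 2, i.e. a 1<->2 reversal if actions 1 and 2 are the two movement directions (that encoding is an assumption); the middle pair is subtracted because the double loop counts it twice. A small check of the counting rule:

import torch

def reversal_count(action: torch.Tensor) -> torch.Tensor:
    """Counts adjacent pairs whose product is 2, mirroring stable_learn above."""
    count = torch.zeros(1)
    for i in (1, 2):
        for j in (i - 1, i + 1):
            count += 1 * (action[j] * action[i] == 2)
    count -= 1 * (action[1] * action[2] == 2)   # the middle pair was counted twice
    return count

print(reversal_count(torch.tensor([1, 2, 1, 2])))  # tensor([3.]) -> constant reversals
print(reversal_count(torch.tensor([1, 1, 1, 1])))  # tensor([0.]) -> no reversals

Since the penalty is built from integer comparisons it carries no gradient: it raises the reported loss for oscillating behavior, but the parameter updates still come entirely from the TD-error term.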