def __init__(
    self,
    action_dim: int,
    device: TorchDevice,
    gamma: float,
    seed: int,
    eps_start: float,
    eps_final: float,
    eps_decay: float,
    restore: Optional[str] = None,
    use_dueling=False,
    use_DDQN=False,
    use_PR=False,  # Prioritized Experience Replay
) -> None:
    self.__action_dim = action_dim
    self.__device = device
    self.__gamma = gamma

    self.__eps_start = eps_start
    self.__eps_final = eps_final
    self.__eps_decay = eps_decay

    self.__eps = eps_start
    self.__r = random.Random()
    self.__r.seed(seed)

    self.use_dueling = use_dueling
    self.use_DDQN = use_DDQN
    self.use_PR = use_PR

    if not use_dueling:
        self.__policy = DQN(action_dim, device).to(device)
        self.__target = DQN(action_dim, device).to(device)
    else:
        self.__policy = Dueling_DQN(action_dim, device).to(device)
        self.__target = Dueling_DQN(action_dim, device).to(device)

    if restore is None:
        self.__policy.apply(self.__policy.init_weights)
    else:
        self.__policy.load_state_dict(torch.load(restore))
    self.__target.load_state_dict(self.__policy.state_dict())
    self.__optimizer = optim.Adam(
        self.__policy.parameters(),
        lr=0.0000625,
        eps=1.5e-4,
    )
    self.__target.eval()
def __init__(
    self,
    action_dim: int,
    device: TorchDevice,
    gamma: float,
    seed: int,
    eps_start: float,
    eps_final: float,
    eps_decay: float,
    restore: Optional[str] = None,
    rlmodel: Optional[str] = None,
) -> None:
    self.__action_dim = action_dim
    self.__device = device
    self.__gamma = gamma

    self.__eps_start = eps_start
    self.__eps_final = eps_final
    self.__eps_decay = eps_decay

    self.__eps = eps_start
    self.__r = random.Random()
    self.__r.seed(seed)

    if rlmodel is None or rlmodel == "DQN":
        self.__policy = DQN(action_dim, device).to(device)
        self.__target = DQN(action_dim, device).to(device)
    else:
        print("rlmodel %s is not supported" % rlmodel)
        exit(-1)

    if restore is None:
        if rlmodel is None or rlmodel == "DQN":
            self.__policy.apply(DQN.init_weights)
    else:
        self.__policy.load_state_dict(torch.load(restore))
    self.__target.load_state_dict(self.__policy.state_dict())
    self.__optimizer = optim.Adam(
        self.__policy.parameters(),
        lr=0.0000625,
        eps=1.5e-4,
    )
    self.__target.eval()
def __init__(
    self,
    action_dim: int,
    device: TorchDevice,
    gamma: float,
    seed: int,
    eps_start: float,
    eps_final: float,
    eps_decay: float,
    restore: Optional[str] = None,
) -> None:
    self.__action_dim = action_dim
    self.__device = device
    self.__gamma = gamma

    self.__eps_start = eps_start
    self.__eps_final = eps_final
    self.__eps_decay = eps_decay

    self.__eps = eps_start
    self.__r = random.Random()
    self.__r.seed(seed)

    self.__policy = DQN(action_dim, device).to(device)
    self.__target = DQN(action_dim, device).to(device)
    if restore is None:
        self.__policy.apply(DQN.init_weights)
    else:
        self.__policy.load_state_dict(torch.load(restore))
    self.__target.load_state_dict(self.__policy.state_dict())
    self.__optimizer = optim.Adam(
        self.__policy.parameters(),
        lr=0.0000625 * 1.25**10,
        eps=1.5e-4,
    )
    self.__scheduler = optim.lr_scheduler.StepLR(
        self.__optimizer, step_size=100_000, gamma=0.8)
    self.__target.eval()
def __init__(
    self,
    action_dim: int,
    device: TorchDevice,
    gamma: float,
    seed: int,
    eps_start: float,
    eps_final: float,
    eps_decay: float,
    restore: Optional[str] = None,
) -> None:
    self.__action_dim = action_dim  # number of actions
    self.__device = device          # device to run on
    self.__gamma = gamma            # discount factor for future rewards

    self.__eps_start = eps_start    # initial epsilon value
    self.__eps_final = eps_final    # final epsilon value
    self.__eps_decay = eps_decay

    self.__eps = eps_start
    self.__r = random.Random()
    self.__r.seed(seed)

    self.__policy = DQN(action_dim, device).to(device)  # the online Q-network
    self.__target = DQN(action_dim, device).to(device)  # a frozen Q-network used as the target
    if restore is None:
        self.__policy.apply(DQN.init_weights)  # initialize the weights
    else:
        self.__policy.load_state_dict(torch.load(restore))  # load previously trained weights
    self.__target.load_state_dict(self.__policy.state_dict())
    self.__optimizer = optim.Adam(
        self.__policy.parameters(),
        lr=0.0000625,
        eps=1.5e-4,
    )
    self.__target.eval()
class Agent(object):  # the agent controlling the paddle
    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        restore: Optional[str] = None,  # restore defaults to None
    ) -> None:
        # store the basic hyper-parameters
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        # the networks run on either CPU or GPU, depending on `device`
        self.__policy = DQN(action_dim, device).to(device)
        self.__target = DQN(action_dim, device).to(device)
        if restore is None:
            self.__policy.apply(DQN.init_weights)  # use the DQN weight initialization
        else:
            self.__policy.load_state_dict(
                torch.load(restore))  # otherwise load the weights from `restore`
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(  # optimizer
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()

    # choose an action with an epsilon-greedy policy
    def run(self, state: TensorStack4, training: bool = False) -> int:
        """run suggests an action for the given state."""
        if training:
            # decay epsilon linearly from eps_start towards eps_final (exploration schedule)
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        # with probability 1 - eps, take the greedy action from the policy network
        if self.__r.random() > self.__eps:
            with torch.no_grad():
                return self.__policy(state).max(1).indices.item()
        # otherwise take a random action
        return self.__r.randint(0, self.__action_dim - 1)

    # sample (state, action, reward, next_state, done) transitions from memory to train the network
    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        # sample a minibatch from the replay buffer to update the policy network
        state_batch, action_batch, reward_batch, next_batch, done_batch = \
            memory.sample(batch_size)

        # Q_j for the taken actions, computed with the policy (online) network
        values = self.__policy(state_batch.float()).gather(1, action_batch)
        # TD target: expected = r_{j+1} + gamma * max_a' Q_target(s_{j+1}, a');
        # (1 - done_batch) masks out the bootstrap term for terminal states,
        # in which case the target reduces to r_{j+1}
        values_next = self.__target(next_batch.float()).max(1).values.detach()
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch
        # gradient step on the smooth-L1 penalty between Q_j and the target
        loss = F.smooth_l1_loss(values, expected)
        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    # synchronize the target network with the policy (online) network
    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    # save the policy network
    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
class Agent(object):  # agent configuration
    def __init__(
        self,
        action_dim: int,       # 3
        device: TorchDevice,   # cuda
        gamma: float,          # 0.99
        seed: int,
        eps_start: float,      # 1
        eps_final: float,      # 0.1
        eps_decay: float,      # 10000
        restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim  # 3
        self.__device = device
        self.__gamma = gamma            # 0.99

        self.__eps_start = eps_start    # 1
        self.__eps_final = eps_final    # 0.1
        self.__eps_decay = eps_decay    # 1e6

        self.__eps = eps_start          # 1
        self.__r = random.Random()
        self.__r.seed(seed)

        self.__policy = DQN(action_dim, device).to(device)  # policy DQN
        self.__target = DQN(action_dim, device).to(device)  # target DQN; decorrelates the TD target from the current estimate
        if restore is None:
            self.__policy.apply(DQN.init_weights)
        else:
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(
            self.__policy.state_dict())  # copy the policy weights into the target network
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )  # optimizer
        self.__target.eval()  # evaluation mode

    def run(self, state: TensorStack4, training: bool = False) -> int:
        """run suggests an action for the given state."""
        if training:
            # decay eps gradually
            self.__eps -= (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:  # greedy action with probability 1 - eps
            with torch.no_grad():
                return self.__policy(state).max(1).indices.item()
        return self.__r.randint(0, self.__action_dim - 1)  # random action with probability eps

    def learn(self, memory: ReplayMemory, batch_size: int) -> float:  # one training step
        """learn trains the value network via TD-learning."""
        state_batch, action_batch, reward_batch, next_batch, done_batch = memory.sample(
            batch_size)  # sample a minibatch of transitions from the replay buffer

        values = self.__policy(state_batch.float()).gather(
            1, action_batch)  # current estimate Q(S, A) from the policy network
        values_next = self.__target(
            next_batch.float()).max(1).values.detach()  # max_a Q_target(S', a)
        expected = (self.__gamma * values_next.unsqueeze(1)) * (
            1. - done_batch) + reward_batch  # Q-learning target y
        loss = F.smooth_l1_loss(values, expected)  # TD error loss
        # the usual three-step parameter update
        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()  # return the loss value

    def sync(self) -> None:  # copy the policy-network weights into the target network
        """sync synchronizes the weights from the policy network to the target network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    def save(self, path: str) -> None:  # save the model
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
class Agent(object):  # the agent
    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        isdueling: bool = False,
        restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim  # action dimension
        self.__device = device          # device
        self.__gamma = gamma            # discount factor

        self.__eps_start = eps_start    # initial value of the eps-greedy parameter
        self.__eps_final = eps_final    # final value of the eps-greedy parameter
        self.__eps_decay = eps_decay    # decay rate of the eps-greedy parameter

        self.__eps = eps_start
        self.__r = random.Random()      # random number generator
        self.__r.seed(seed)             # random seed

        ### modification
        if isdueling:  # use the Dueling DQN network
            self.__policy = DuelingDQN(action_dim, device).to(device)  # value network
            self.__target = DuelingDQN(action_dim, device).to(device)  # target network
        else:
            self.__policy = DQN(action_dim, device).to(device)  # value network
            self.__target = DQN(action_dim, device).to(device)  # target network
        if restore is None:
            if isdueling:
                self.__policy.apply(DuelingDQN.init_weights)  # initialize the weights
            else:
                self.__policy.apply(DQN.init_weights)  # initialize the weights
        ### modification
        else:
            self.__policy.load_state_dict(torch.load(restore))  # load the restored weights
        self.__target.load_state_dict(
            self.__policy.state_dict())  # copy the policy parameters into target; the two networks now match
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()

    def run(self, state: TensorStack4, training: bool = False):  # eps-greedy action selection
        """run suggests an action for the given state and returns its Q-value."""
        if training:
            # decay eps linearly
            self.__eps -= (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:  # if the random draw exceeds eps, act greedily
            with torch.no_grad():
                action = self.__policy(state).max(1).indices.item()
        else:
            action = self.__r.randint(0, self.__action_dim - 1)
        value_this = self.__policy(state)[0][action]
        return action, value_this  # the action and its Q-value

    def get_target_value(self, state):
        value_next = self.__target(state).max(1).values.item()  # max Q-value (not the argmax index) from the target network
        return value_next

    def learn(self, memory: ReplayMemory, batch_size: int, choice: int) -> float:
        """learn trains the value network via TD-learning."""
        ## modification
        if choice == 0:  # plain replay memory
            state_batch, action_batch, reward_batch, next_batch, done_batch = memory.sample(batch_size)
        else:  # prioritized replay memory (PER)
            state_batch, action_batch, reward_batch, next_batch, done_batch, idx_batch = \
                memory.sample(batch_size)
        ####
        values = self.__policy(state_batch.float()).gather(
            1, action_batch)  # pick the Q-value of the taken action in each row
        values_next = self.__target(
            next_batch.float()).max(1).values.detach()  # the maximum is used as the next state's value
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch  # the loss is built from values and expected
        if choice == 0:  #####
            loss = F.smooth_l1_loss(values, expected)
        else:  # PER: keep the per-sample TD errors
            loss_batch = F.smooth_l1_loss(values, expected, reduction='none')  # TD error
            loss = torch.mean(loss_batch, dim=0)
            # loss.requires_grad = True
            memory.update(loss_batch.detach(), idx_batch)  # update priorities with the TD errors
        ## modification
        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():  # clamp gradients to [-1, 1] in place
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
class Agent(object):
    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        # move the networks to the device (GPU) so that all later computation runs there
        self.__policy = DQN(action_dim, device).to(device)  # policy network
        self.__target = DQN(action_dim, device).to(device)  # target network
        if restore is None:
            self.__policy.apply(DQN.init_weights)  # custom weight initialization for the policy network
        else:
            self.__policy.load_state_dict(
                torch.load(restore))  # load previously learned policy weights
        self.__target.load_state_dict(
            self.__policy.state_dict())  # copy the policy weights into the target network
        self.__optimizer = optim.Adam(  # Adam optimizer
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()  # evaluation mode, so BN and Dropout do not interfere

    # epsilon-greedy
    def run(self, state: TensorStack4, training: bool = False) -> int:
        """run suggests an action for the given state."""
        if training:
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:
            with torch.no_grad():
                return self.__policy(state).max(1).indices.item()
        return self.__r.randint(0, self.__action_dim - 1)

    def learn(self, memory: Experience, step: int) -> float:
        """learn trains the value network via TD-learning."""
        state_batch, action_batch, reward_batch, next_batch, done_batch, w, rank_e_id = \
            memory.sample(step)  # prioritized sampling; state is the first 4 of the 5 stacked frames, next is the last 4

        values = self.__policy(state_batch.float()).gather(1, action_batch)
        # values_next = self.__target(next_batch.float()).max(1).values.detach()  # Nature DQN target (two networks, but not Double DQN)
        values_next = self.__target(next_batch.float()).gather(
            1, self.__policy(next_batch.float()).max(1).indices.unsqueeze(
                1)).detach()  # Double DQN: the policy network selects the action, the target network evaluates it
        reward_batch[action_batch == 0] += 0.1  # "stable" reward bonus for action 0
        expected = (self.__gamma * values_next) * \
            (1. - done_batch) + reward_batch
        # the target is r if step t is terminal (no step t+1), otherwise r + gamma * max Q
        td_error = (expected - values).detach()
        memory.update_priority(rank_e_id, td_error.cpu().numpy())
        values = values.mul(w)       # apply the importance-sampling weights
        expected = expected.mul(w)
        loss = F.smooth_l1_loss(values, expected)  # smooth L1 loss

        self.__optimizer.zero_grad()  # reset the parameter gradients to zero
        loss.backward()               # compute gradients into __policy.parameters().grad
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)  # clamp all gradients to [-1, 1]
        self.__optimizer.step()       # take one optimization step

        return loss.item()

    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
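# The learn() above replaces the Nature DQN target with the Double DQN target.
# Below is a small standalone sketch contrasting the two computations, assuming
# `q_next_online` and `q_next_target` are the (batch, action_dim) outputs of the
# online and target networks for the next states; the function and argument names
# here are illustrative, not part of this repository.
import torch


def td_targets(reward, done, gamma, q_next_online, q_next_target, double=True):
    if double:
        # Double DQN: the online network picks the action, the target network scores it
        best_action = q_next_online.max(1).indices.unsqueeze(1)   # (batch, 1)
        next_value = q_next_target.gather(1, best_action)         # (batch, 1)
    else:
        # Nature DQN: the target network both picks and scores the action
        next_value = q_next_target.max(1).values.unsqueeze(1)     # (batch, 1)
    return reward + gamma * next_value.detach() * (1. - done)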
class Agent(object):
    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim  # number of available actions
        self.__device = device          # device
        self.__gamma = gamma            # discount factor gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.__policy = DQN(action_dim, device).to(device)  # policy network
        self.__target = DQN(action_dim, device).to(device)  # target network
        if restore is None:
            self.__policy.apply(DQN.init_weights)
        else:
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()

    def run(self, state: TensorStack4, training: bool = False) -> int:  # returns an action
        """run suggests an action for the given state."""
        if training:
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:
            with torch.no_grad():
                return self.__policy(state).max(1).indices.item()
        return self.__r.randint(0, self.__action_dim - 1)

    def learn(self, memory: ReplayMemory, batch_size: int) -> float:  # returns the loss value
        """learn trains the value network via TD-learning."""
        # One of the basic requirements of SGD is that the training data be i.i.d.
        # Consecutive transitions generated by the agent are highly correlated,
        # so we sample a random minibatch from the replay buffer instead.
        state_batch, action_batch, reward_batch, next_batch, done_batch = \
            memory.sample(batch_size)

        # Q-table lookup: value = Q(s, a) for the taken actions
        values = self.__policy(state_batch.float()).gather(1, action_batch)
        # Dueling DQN idea: decompose Q(s, a) = V(s) + A(s, a), where V(s) is the value of
        # state s itself and A(s, a) is the advantage of action a. V(s) is a scalar and
        # A(s, a) is a vector; broadcasting expands V(s) to the shape of A(s, a) when adding.
        values_next = self.__target(
            next_batch.float()).max(1).values.detach()  # max Q-value from the target network
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch
        # if done: y_j = r_j; otherwise y_j = r_j + gamma * max_a' Q_target(s', a') (see the DQN paper)
        loss = F.smooth_l1_loss(values, expected)  # loss function
        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
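# The comment in learn() above sketches the dueling decomposition Q(s, a) = V(s) + A(s, a).
# Below is a minimal, self-contained sketch of a dueling head in that spirit; the class
# name, layer sizes, and feature dimension are assumptions for illustration, not the
# repository's actual DuelingDQN definition. Subtracting the mean advantage keeps V and A
# identifiable, which is the standard formulation of the dueling architecture.
import torch
import torch.nn as nn


class DuelingHeadSketch(nn.Module):
    def __init__(self, feature_dim: int, action_dim: int) -> None:
        super().__init__()
        self.value = nn.Sequential(      # V(s): one scalar per state
            nn.Linear(feature_dim, 512), nn.ReLU(), nn.Linear(512, 1))
        self.advantage = nn.Sequential(  # A(s, a): one score per action
            nn.Linear(feature_dim, 512), nn.ReLU(), nn.Linear(512, action_dim))

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        v = self.value(features)                     # shape (batch, 1)
        a = self.advantage(features)                 # shape (batch, action_dim)
        # broadcast V over the action dimension and centre the advantages
        return v + a - a.mean(dim=1, keepdim=True)   # Q(s, a)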
class Agent(object):
    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.__policy = DQN(action_dim, device).to(device)
        self.__target = DQN(action_dim, device).to(device)
        if restore is None:
            self.__policy.apply(DQN.init_weights)
        else:
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()

    def run(self, state: TensorStack4, training: bool = False):
        """run suggests an action for the given state and returns its Q-value."""
        if training:
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:
            with torch.no_grad():
                action = self.__policy(state).max(1).indices.item()
        else:
            action = self.__r.randint(0, self.__action_dim - 1)
        value_this = self.__policy(state)[0][action]
        return action, value_this  # the action and its Q-value
        # return self.__r.randint(0, self.__action_dim - 1)

    def get_target_value(self, state):
        value_next = self.__target(state).max(1).values.item()  # max Q-value (not the argmax index) from the target network
        return value_next

    def learn(self, memory, batch_size: int, c) -> float:  #####
        """learn trains the value network via TD-learning."""
        if c == 0:  # plain replay memory
            state_batch, action_batch, reward_batch, next_batch, done_batch = memory.sample(
                batch_size)
        else:  # prioritized replay memory
            state_batch, action_batch, reward_batch, next_batch, done_batch, idx_batch = \
                memory.sample(batch_size)
        ####
        values = self.__policy(state_batch.float()).gather(
            1, action_batch)  # pick the Q-value of the taken action in each row
        values_next = self.__target(
            next_batch.float()).max(1).values.detach()  # the maximum is used as the next state's value
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch  # the loss is built from values and expected
        if c == 0:  #####
            loss = F.smooth_l1_loss(values, expected)
        else:
            loss_batch = F.smooth_l1_loss(values, expected, reduction='none')  # per-sample TD error
            loss = torch.mean(loss_batch, dim=0)
            # loss.requires_grad = True
            memory.update(loss_batch.detach(), idx_batch)  # update priorities with the TD errors
        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():  # clamp gradients to [-1, 1] in place
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
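# The PER branches above assume a prioritized replay memory whose sample() also returns
# the sampled indices and whose update() refreshes priorities from the per-sample TD
# errors. Below is a minimal proportional-prioritization sketch of such a buffer; the
# class name, storage layout, and alpha default are assumptions for illustration, not
# the repository's actual ReplayMemory implementation.
import torch


class PrioritizedReplaySketch:
    """Proportional prioritized replay with the sample()/update() interface used above."""

    def __init__(self, capacity: int, alpha: float = 0.6) -> None:
        self.capacity = capacity
        self.alpha = alpha
        self.data = []        # list of (state, action, reward, next_state, done) tensor tuples
        self.priorities = []  # one priority per stored transition
        self.pos = 0

    def push(self, transition) -> None:
        max_p = max(self.priorities, default=1.0)  # new samples get the current max priority
        if len(self.data) < self.capacity:
            self.data.append(transition)
            self.priorities.append(max_p)
        else:
            self.data[self.pos] = transition
            self.priorities[self.pos] = max_p
            self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size: int):
        probs = torch.tensor(self.priorities) ** self.alpha
        probs /= probs.sum()
        idx = torch.multinomial(probs, batch_size, replacement=True)
        batch = [self.data[i] for i in idx]
        states, actions, rewards, nexts, dones = map(torch.stack, zip(*batch))
        return states, actions, rewards, nexts, dones, idx

    def update(self, td_errors: torch.Tensor, idx: torch.Tensor) -> None:
        # refresh the priorities of the sampled transitions with their new TD errors
        for e, i in zip(td_errors.flatten().tolist(), idx.tolist()):
            self.priorities[i] = abs(e) + 1e-6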
class Agent(object):
    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        restore: Optional[str] = None,
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)

        self.__policy = DQN(action_dim, device).to(device)
        self.__target = DQN(action_dim, device).to(device)
        if restore is None:
            self.__policy.apply(DQN.init_weights)
        else:
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()

    def run(self, state: TensorStack4, training: bool = False,
            preaction: int = 0) -> int:
        """run suggests an action for the given state."""
        if training:
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:
            with torch.no_grad():
                return self.__policy(state).max(1).indices.item()
        if self.__r.random() > 0.15:
            # when exploring, repeat the previous action most of the time
            return preaction
        return self.__r.randint(0, self.__action_dim - 1)

    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the value network via TD-learning."""
        state_batch, action_batch, reward_batch, next_batch, done_batch = \
            memory.sample(batch_size)

        values = self.__policy(state_batch.float()).gather(1, action_batch)
        values_next = self.__target(next_batch.float()).max(1).values.detach()
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch
        loss = F.smooth_l1_loss(values, expected)
        self.__optimizer.zero_grad()
        loss.backward()
        for param in self.__policy.parameters():
            param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)
class Agent(object):
    def __init__(
        self,
        action_dim: int,
        device: TorchDevice,
        gamma: float,
        seed: int,
        eps_start: float,
        eps_final: float,
        eps_decay: float,
        dueling: bool,
        restore: Optional[str] = None,
        stable_arg=0.1,
    ) -> None:
        self.__action_dim = action_dim
        self.__device = device
        self.__gamma = gamma

        self.__eps_start = eps_start
        self.__eps_final = eps_final
        self.__eps_decay = eps_decay

        self.__eps = eps_start
        self.__r = random.Random()
        self.__r.seed(seed)
        self.__stable_arg = stable_arg

        if dueling:
            self.__policy = DuelingDQN(action_dim, device).to(device)
            self.__target = DuelingDQN(action_dim, device).to(device)
        else:
            self.__policy = DQN(action_dim, device).to(device)
            self.__target = DQN(action_dim, device).to(device)
        if restore is None:
            self.__policy.apply(DQN.init_weights)
        else:
            # if dueling:
            #     self.__policy.Convs_load(restore)
            # else:
            self.__policy.load_state_dict(torch.load(restore))
        self.__target.load_state_dict(self.__policy.state_dict())
        self.__optimizer = optim.Adam(
            self.__policy.parameters(),
            lr=0.0000625,
            eps=1.5e-4,
        )
        self.__target.eval()

    def run(self, state: TensorStack4, training: bool = False):  # epsilon-greedy policy
        """run suggests an action for the given state and returns its Q-value."""
        if training:
            self.__eps -= \
                (self.__eps_start - self.__eps_final) / self.__eps_decay
            self.__eps = max(self.__eps, self.__eps_final)

        if self.__r.random() > self.__eps:
            with torch.no_grad():
                action = self.__policy(state).max(1).indices.item()
        else:
            action = self.__r.randint(0, self.__action_dim - 1)
        value_this = self.__policy(state)[0][action]
        return action, value_this

    def get_target_value(self, state):
        # max Q-value of the given state, estimated by the target network
        value_next = self.__target(state).max(1).values.detach()
        return value_next

    def learn(self, memory: ReplayMemory, batch_size: int) -> float:
        """learn trains the Q network via TD-learning."""
        state_batch, action_batch, reward_batch, \
            next_batch, done_batch, idx_batch = memory.sample(batch_size)

        values = self.__policy(state_batch.float()).gather(1, action_batch)
        values_next = self.__target(next_batch.float()).max(1).values.detach()
        expected = (self.__gamma * values_next.unsqueeze(1)) * \
            (1. - done_batch) + reward_batch  # TD target
        loss_batch = F.smooth_l1_loss(values, expected, reduction='none')  # per-sample TD error
        loss = torch.mean(loss_batch, dim=0)
        memory.update(loss_batch.detach(), idx_batch)  # update priorities with the TD errors
        self.__optimizer.zero_grad()
        loss.backward()  # backward
        for param in self.__policy.parameters():
            if param.grad is not None:  # clamp gradients to (-1, 1)
                param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()  # update

        return loss.item()

    def stable_learn(self, folded_state, action, reward, done):
        """stable_learn combines the TD error with a penalty on jittery action sequences."""
        state = folded_state[:4]
        state_next = folded_state[1:]
        value_next = self.get_target_value(state_next)
        value_now = self.__policy(state.float()).gather(1, action)
        td_target = (self.__gamma * value_next.unsqueeze(1)) * \
            (1. - done) + reward
        td_error = F.smooth_l1_loss(value_now, td_target)
        stable_loss = torch.zeros(1).float()
        for i in [1, 2]:
            for j in [i - 1, i + 1]:
                stable_loss += 1 * (action[j] * action[i] == 2)
        stable_loss -= 1 * (action[1] * action[2] == 2)
        stable_loss.requires_grad = True
        loss = td_error + self.__stable_arg * stable_loss
        self.__optimizer.zero_grad()
        loss.backward()  # backward
        for param in self.__policy.parameters():
            if param.grad is not None:  # clamp gradients to (-1, 1)
                param.grad.data.clamp_(-1, 1)
        self.__optimizer.step()

        return loss.item()

    def sync(self) -> None:
        """sync synchronizes the weights from the policy network to the target network."""
        self.__target.load_state_dict(self.__policy.state_dict())

    def save(self, path: str) -> None:
        """save saves the state dict of the policy network."""
        torch.save(self.__policy.state_dict(), path)

    def load(self, path: str) -> None:
        """load restores the state dict of the policy and target networks from disk."""
        state_dict = torch.load(path)
        self.__policy.load_state_dict(state_dict)
        self.__target.load_state_dict(state_dict)
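# Across the variants above the Agent exposes the same small surface: run() to act,
# learn() for one gradient step, sync() for the periodic target-network update, and
# save() for checkpoints. Below is a hypothetical training-loop skeleton wiring them
# together; `env`, `obs_to_state`, `memory`, and every constant are placeholders for
# illustration, not values taken from this repository, and it assumes the variant whose
# run() returns a plain action.
def train_sketch(agent, env, memory, obs_to_state, device):
    batch_size, warmup, sync_every, save_every = 32, 50_000, 10_000, 100_000
    state = obs_to_state(env.reset(), device)
    for step in range(1, 1_000_001):
        action = agent.run(state, training=True)
        obs, reward, done, _ = env.step(action)
        next_state = obs_to_state(obs, device)
        memory.push(state, action, reward, next_state, done)
        state = obs_to_state(env.reset(), device) if done else next_state

        if step > warmup:
            agent.learn(memory, batch_size)   # one TD-learning update
        if step % sync_every == 0:
            agent.sync()                      # refresh the target network
        if step % save_every == 0:
            agent.save(f"model_{step:07d}.pth")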