class DDPG():
    """ Individual Agent. """

    def __init__(self, state_size, action_size, params):
        """ Build the models and the random process, and initialize them. """
        torch.manual_seed(params['SEED'])
        self.random_process = OUNoise(action_size, params)

        self.local_actor = Actor(state_size, action_size, params['SEED'], params['FC1'], params['FC2']).to(device)
        self.target_actor = Actor(state_size, action_size, params['SEED'], params['FC1'], params['FC2']).to(device)
        # Initialize target network weights with the local network weights
        self.hard_copy(self.local_actor, self.target_actor)
        # Optimizer for the local actor network
        self.actor_optimizer = torch.optim.Adam(self.local_actor.parameters(), params['LR_ACTOR'])

        self.local_critic = Critic(state_size, action_size, params['SEED'], params['FC1'], params['FC2']).to(device)
        self.target_critic = Critic(state_size, action_size, params['SEED'], params['FC1'], params['FC2']).to(device)
        # Initialize target network weights with the local network weights
        self.hard_copy(self.local_critic, self.target_critic)
        # Optimizer for the local critic network
        self.critic_optimizer = torch.optim.Adam(self.local_critic.parameters(), params['LR_CRITIC'])

    def reset_noise(self):
        """ Reset the noise state every episode. """
        self.random_process.reset()

    def hard_copy(self, local, target):
        """ Hard-copy the weights of the local network into the target network. """
        for local_param, target_param in zip(local.parameters(), target.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_copy(self, tau):
        """ Soft-update the target networks: θ_target = τ*θ_local + (1 - τ)*θ_target """
        for local_param, target_param in zip(self.local_actor.parameters(), self.target_actor.parameters()):
            target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)
        for local_param, target_param in zip(self.local_critic.parameters(), self.target_critic.parameters()):
            target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)
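# The agent above assumes an OUNoise process constructed as OUNoise(action_size, params)
# and exposing reset(). A minimal Ornstein-Uhlenbeck sketch is given below; the params keys
# 'MU', 'THETA' and 'SIGMA' and the sample() method are illustrative assumptions, not the
# original implementation.
import copy
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process (sketch)."""

    def __init__(self, size, params):
        self.mu = params.get('MU', 0.0) * np.ones(size)
        self.theta = params.get('THETA', 0.15)
        self.sigma = params.get('SIGMA', 0.2)
        np.random.seed(params['SEED'])
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state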
class IAC():
    def __init__(self, action_dim, state_dim):
        self.actor = Actor(action_dim, state_dim)
        self.critic = Critic(state_dim)
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.optimizerA = torch.optim.Adam(self.actor.parameters(), lr=0.001)
        self.optimizerC = torch.optim.Adam(self.critic.parameters(), lr=0.01)
        self.lr_scheduler = {
            "optA": torch.optim.lr_scheduler.StepLR(self.optimizerA, step_size=1000, gamma=0.9, last_epoch=-1),
            "optC": torch.optim.lr_scheduler.StepLR(self.optimizerC, step_size=1000, gamma=0.9, last_epoch=-1)
        }
        # self.act_prob
        # self.act_log_prob

    def choose_action(self, s):
        s = torch.Tensor(s).squeeze(0)
        self.act_prob = self.actor(s) + 0.00001
        m = torch.distributions.Categorical(self.act_prob)
        # self.act_log_prob = m.log_prob(m.sample())
        temp = m.sample()
        return temp

    def cal_tderr(self, s, r, s_):
        s = torch.Tensor(s).unsqueeze(0)
        s_ = torch.Tensor(s_).unsqueeze(0)
        v_ = self.critic(s_).detach()
        v = self.critic(s)
        return r + 0.9 * v_ - v

    def learnCritic(self, s, r, s_):
        td_err = self.cal_tderr(s, r, s_)
        loss = torch.mul(td_err, td_err)  # torch.square(td_err)
        self.optimizerC.zero_grad()
        loss.backward()
        self.optimizerC.step()
        self.lr_scheduler["optC"].step()

    def learnActor(self, s, r, s_, a):
        td_err = self.cal_tderr(s, r, s_)
        m = torch.log(self.act_prob[a])
        temp = m * td_err.detach()
        loss = -torch.mean(temp)
        self.optimizerA.zero_grad()
        loss.backward()
        self.optimizerA.step()
        self.lr_scheduler["optA"].step()

    def update(self, s, r, s_, a):
        self.learnCritic(s, r, s_)
        self.learnActor(s, r, s_, a)
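# A minimal usage sketch for the IAC agent above, assuming a discrete-action environment
# with the classic Gym API (env.reset() returns the observation, env.step() a 4-tuple).
# The environment id, episode count and loop structure are illustrative assumptions, not
# part of the original code.
import gym

env = gym.make('CartPole-v1')
agent = IAC(action_dim=env.action_space.n, state_dim=env.observation_space.shape[0])

for episode in range(500):
    s = env.reset()
    done = False
    while not done:
        a = agent.choose_action(s).item()
        s_, r, done, _ = env.step(a)
        # One actor-critic update per transition (online TD learning)
        agent.update(s, r, s_, a)
        s = s_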
class DDPGAgent:
    def __init__(self, action_size, action_type, state_size, hidden_in_size, hidden_out_size, num_atoms,
                 lr_actor, lr_critic, l2_decay, noise_type, OU_mu, OU_theta, OU_sigma):
        super(DDPGAgent, self).__init__()
        # Create actors, critics and their targets using the specified layer sizes.
        # Note that the critics assume 2 agents (they see both agents' states and actions).
        self.actor = Actor(action_size, state_size, hidden_in_size, hidden_out_size, action_type).to(device)
        self.critic = Critic(2 * action_size, 2 * state_size, hidden_in_size, hidden_out_size, num_atoms).to(device)
        self.target_actor = Actor(action_size, state_size, hidden_in_size, hidden_out_size, action_type).to(device)
        self.target_critic = Critic(2 * action_size, 2 * state_size, hidden_in_size, hidden_out_size, num_atoms).to(device)
        self.noise_type = noise_type
        self.action_type = action_type
        if noise_type == 'OUNoise':
            # OUNoise has to be initialised here because it is an autocorrelated process
            self.noise = OUNoise(action_size, OU_mu, OU_theta, OU_sigma)

        # Initialize targets with the same weights as the original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Initialize optimisers with the specified learning rates
        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor, weight_decay=l2_decay)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=l2_decay)

    def act(self, obs, noise_scale=0.0):
        obs = obs.to(device)
        action = self.actor(obs)
        if noise_scale == 0.0:
            # No noise requested: return the action as is
            pass
        elif self.noise_type == 'OUNoise':
            noise = 1.5 * self.noise.noise()
            action += noise_scale * (noise - action)
            action = torch.clamp(action, -1, 1)
        elif self.noise_type == 'BetaNoise':
            action = BetaNoise(action, noise_scale)
        elif self.noise_type == 'GaussNoise':
            action = GaussNoise(action, noise_scale)
        elif self.noise_type == 'WeightedNoise':
            action = WeightedNoise(action, noise_scale, self.action_type)
        return action

    # The target actor is only used for updates, not for exploration, so it adds no noise
    def target_act(self, obs):
        obs = obs.to(device)
        action = self.target_actor(obs)
        return action
class Predator:
    def __init__(self, s_dim, a_dim, num_agent, **kwargs):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.num_agent = num_agent

        self.actor = Actor(s_dim, a_dim)
        self.actor_target = Actor(s_dim, a_dim)
        self.critic = Critic(s_dim, a_dim, num_agent)
        self.critic_target = Critic(s_dim, a_dim, num_agent)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.c_lr)
        self.a_loss = 0
        self.c_loss = 0

        # Replay buffer sampled in get_batches(); expected to be filled externally,
        # mirroring the BiCNet agent below (the original snippet left these undefined)
        self.replay_buffer = list()
        self.batch_size = self.config.batch_size

        if self.config.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        hard_update(self.actor, self.actor_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(size=self.a_dim,
                                                       theta=self.config.ou_theta,
                                                       mu=self.config.ou_mu,
                                                       sigma=self.config.ou_sigma)

    def get_batches(self):
        experiences = random.sample(self.replay_buffer, self.batch_size)

        state_batches = np.array([_[0] for _ in experiences])
        action_batches = np.array([_[1] for _ in experiences])
        reward_batches = np.array([_[2] for _ in experiences])
        next_state_batches = np.array([_[3] for _ in experiences])
        done_batches = np.array([_[4] for _ in experiences])

        return state_batches, action_batches, reward_batches, next_state_batches, done_batches

    def random_action(self):
        action = np.random.uniform(low=-1., high=1., size=(self.num_agent, self.a_dim))
        return action

    def reset(self):
        self.random_process.reset_states()
class PPO(nn.Module): def __init__(self,state_dim,action_dim,hidden_dim = 64, learning_rate = 3e-4,entropy_coef = 1e-2,critic_coef =0.5, gamma = 0.99, lmbda =0.95,eps_clip= 0.2,K_epoch = 10,minibatch_size = 64,device = 'cpu'): super(PPO,self).__init__() self.entropy_coef = entropy_coef self.critic_coef = critic_coef self.gamma = gamma self.lmbda = lmbda self.eps_clip = eps_clip self.K_epoch = K_epoch self.minibatch_size = minibatch_size self.max_grad_norm = 0.5 self.data = Rollouts() self.actor = Actor(state_dim,action_dim,hidden_dim) self.critic = Critic(state_dim,hidden_dim) self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate) self.device = device def pi(self,x): mu,sigma = self.actor(x) return mu,sigma def v(self,x): return self.critic(x) def put_data(self,transition): self.data.append(transition) def train_net(self,n_epi,writer): s_, a_, r_, s_prime_, done_mask_, old_log_prob_ = self.data.make_batch(self.device) old_value_ = self.v(s_).detach() td_target = r_ + self.gamma * self.v(s_prime_) * done_mask_ delta = td_target - old_value_ delta = delta.detach().cpu().numpy() advantage_lst = [] advantage = 0.0 for idx in reversed(range(len(delta))): if done_mask_[idx] == 0: advantage = 0.0 advantage = self.gamma * self.lmbda * advantage + delta[idx][0] advantage_lst.append([advantage]) advantage_lst.reverse() advantage_ = torch.tensor(advantage_lst, dtype=torch.float).to(self.device) returns_ = advantage_ + old_value_ advantage_ = (advantage_ - advantage_.mean())/(advantage_.std()+1e-3) for i in range(self.K_epoch): for s,a,r,s_prime,done_mask,old_log_prob,advantage,return_,old_value in self.data.choose_mini_batch(\ self.minibatch_size ,s_, a_, r_, s_prime_, done_mask_, old_log_prob_,advantage_,returns_,old_value_): curr_mu,curr_sigma = self.pi(s) value = self.v(s).float() curr_dist = torch.distributions.Normal(curr_mu,curr_sigma) entropy = curr_dist.entropy() * self.entropy_coef curr_log_prob = curr_dist.log_prob(a).sum(1,keepdim = True) ratio = torch.exp(curr_log_prob - old_log_prob.detach()) surr1 = ratio * advantage surr2 = torch.clamp(ratio, 1-self.eps_clip, 1+self.eps_clip) * advantage actor_loss = (-torch.min(surr1, surr2) - entropy).mean() old_value_clipped = old_value + (value - old_value).clamp(-self.eps_clip,self.eps_clip) value_loss = (value - return_.detach().float()).pow(2) value_loss_clipped = (old_value_clipped - return_.detach().float()).pow(2) critic_loss = 0.5 * self.critic_coef * torch.max(value_loss,value_loss_clipped).mean() if writer != None: writer.add_scalar("loss/actor_loss", actor_loss.item(), n_epi) writer.add_scalar("loss/critic_loss", critic_loss.item(), n_epi) self.actor_optimizer.zero_grad() actor_loss.backward() nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) self.actor_optimizer.step() self.critic_optimizer.zero_grad() critic_loss.backward() nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) self.critic_optimizer.step()
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, aid=0, num_agents=2, seed=1234):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            aid (int): agent id
            num_agents (int): number of agents seen by the critic
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed=seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed=seed).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, num_agents=num_agents, seed=seed).to(device)
        self.critic_target = Critic(state_size, action_size, num_agents=num_agents, seed=seed).to(device)
        self.critic_optimizer = Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed=seed)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def update_targets(self, tau):
        soft_update(self.critic_local, self.critic_target, tau)
        soft_update(self.actor_local, self.actor_target, tau)

    def update_critic(self, states, actions, next_states, next_actions, rewards, dones):
        Q_targets_next = self.critic_target(next_states, next_actions)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

    def update_actor(self, states, actions):
        actor_loss = -self.critic_local(states, actions).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
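# update_targets() above relies on a module-level soft_update(local_model, target_model, tau)
# helper that is not shown in this snippet. A minimal sketch consistent with how it is called
# here (and with the soft-update rule θ_target = τ*θ_local + (1 - τ)*θ_target defined elsewhere
# in this collection) could look like this; treat it as an assumed helper, not the original one.
def soft_update(local_model, target_model, tau):
    """Interpolate the target network parameters towards the local network parameters."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)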
class MADDPG(object):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    def __init__(self, state_dim, action_dim, max_action, agent_n, logger):
        # Neural networks placed on the GPU when available
        self.actor = Actor(state_dim, action_dim, max_action).to(self.device)  # origin_network
        self.actor_target = Actor(state_dim, action_dim, max_action).to(self.device)  # target_network
        self.actor_target.load_state_dict(self.actor.state_dict())  # initialise actor_target with the actor's parameters
        # PyTorch tensors have requires_grad=False by default, i.e. they do not take part in gradient
        # propagation; the model parameters handed to the optimizer, however, are updated via gradients.
        self.actor_optimizer = optim.Adam(self.actor.parameters(), pdata.LEARNING_RATE)  # optimize the actor's parameters with learning rate pdata.LEARNING_RATE

        self.critic = CriticCentral(agent_n).to(self.device)
        self.critic_target = CriticCentral(agent_n).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), pdata.LEARNING_RATE)

        # self.replay_buffer removed: each agent no longer keeps its own replay buffer
        self.writer = SummaryWriter(pdata.DIRECTORY + 'runs')

        self.num_critic_update_iteration = 0
        self.num_actor_update_iteration = 0
        self.num_training = 0
        self.logger = logger

    def select_action(self, state):
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        # numpy.ndarray.flatten() returns a copy of the array collapsed into one dimension
        action = self.actor(state).cpu().data.numpy().flatten()
        return action  # not normalized

    def select_target_actions(self, states):
        # the input states must be a normalized [[...], [...]] np.ndarray
        united_state = torch.FloatTensor(states).to(self.device)
        action = self.actor_target(united_state).cpu().data.numpy()
        return action  # not normalized

    def select_current_actions(self, states):
        united_state = torch.FloatTensor(states).to(self.device)
        action = self.actor(united_state).cpu().data.numpy()
        return action  # not normalized

    # Update the parameters of the actor network and the critic network
    def update(self, central_replay_buffer_ma, agent_list, agent_idx):
        critic_loss_list = []
        actor_performance_list = []

        # Sample replay buffer:
        # [(united_normalized_states, united_normalized_next_states, united_normalized_action, [reward_1, ...], done), (...)]
        x, y, u, r, d = central_replay_buffer_ma.sample(pdata.BATCH_SIZE)  # sample a random minibatch of batch_size five-tuples

        # TODO: everything below should be reworked into the MADDPG paradigm
        next_actions = [
            agent_list[i].select_target_actions(
                y[:, i * pdata.STATE_DIMENSION:i * pdata.STATE_DIMENSION + pdata.STATE_DIMENSION])
            for i in range(len(agent_list))
        ]
        next_actions = np.concatenate(next_actions, axis=1)
        united_next_action_batch = torch.FloatTensor(next_actions).to(self.device)
        united_state_batch = torch.FloatTensor(x).to(self.device)
        united_action_batch = torch.FloatTensor(u).to(self.device)
        united_next_state_batch = torch.FloatTensor(y).to(self.device)
        done = torch.FloatTensor(d).to(self.device)
        reward_batch = torch.FloatTensor(r[:, agent_idx])  # torch.Size([64])
        reward_batch = reward_batch[:, None].to(self.device)  # torch.Size([64, 1])

        target_Q = self.critic_target(united_next_state_batch, united_next_action_batch)  # torch.Size([64, 1])
        target_Q = reward_batch + ((1 - done) * pdata.GAMMA * target_Q).detach()  # one value per batch element
        current_Q = self.critic(united_state_batch, united_action_batch)
        critic_loss = F.mse_loss(current_Q, target_Q)
        self.writer.add_scalar('critic_loss', critic_loss, global_step=self.num_critic_update_iteration)
        # self.logger.write_to_log('critic_loss:{loss}'.format(loss=critic_loss))
        # self.logger.add_to_critic_buffer(critic_loss.item())
        critic_loss_list.append(critic_loss.item())
        self.critic_optimizer.zero_grad()  # zero the gradient buffers
        critic_loss.backward()  # backpropagation on a dynamic graph
        self.critic_optimizer.step()

        current_actions = [
            agent_list[i].select_current_actions(
                x[:, i * pdata.STATE_DIMENSION:i * pdata.STATE_DIMENSION + pdata.STATE_DIMENSION])
            for i in range(len(agent_list))
        ]
        current_actions_batch = torch.FloatTensor(np.concatenate(current_actions, axis=1)).to(self.device)
        actor_loss = -self.critic(united_state_batch, current_actions_batch).mean()
        self.writer.add_scalar('actor_performance', actor_loss, global_step=self.num_actor_update_iteration)
        # self.logger.write_to_log('actor_loss:{loss}'.format(loss=actor_loss))
        # self.logger.add_to_actor_buffer(actor_loss.item())
        actor_performance_list.append(actor_loss.item())

        # Optimize the actor
        self.actor_optimizer.zero_grad()  # clear the gradients of all optimized tensors
        actor_loss.backward()
        self.actor_optimizer.step()  # perform a single optimization step

        # Soft update of the two target networks
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(pdata.TAU * param.data + (1 - pdata.TAU) * target_param.data)
        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
            target_param.data.copy_(pdata.TAU * param.data + (1 - pdata.TAU) * target_param.data)

        self.num_actor_update_iteration += 1
        self.num_critic_update_iteration += 1

        actor_performance = np.mean(np.array(actor_performance_list)).item()
        self.logger.add_to_actor_buffer(actor_performance)
        critic_loss = np.mean(critic_loss_list).item()
        self.logger.add_to_critic_buffer(critic_loss)

    def save(self, mark_str):
        torch.save(self.actor.state_dict(), pdata.DIRECTORY + mark_str + '_actor_ma.pth')
        torch.save(self.critic.state_dict(), pdata.DIRECTORY + mark_str + '_critic_ma.pth')

    def load(self, mark_str):
        file_actor = pdata.DIRECTORY + mark_str + '_actor_ma.pth'
        file_critic = pdata.DIRECTORY + mark_str + '_critic_ma.pth'
        if os.path.exists(file_actor) and os.path.exists(file_critic):
            self.actor.load_state_dict(torch.load(file_actor))
            self.critic.load_state_dict(torch.load(file_critic))
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) self.noise = OUNoise((action_size), random_seed) # Make sure target is initialized with the same weight as the source (found on slack to make big difference) self.hard_update(self.actor_target, self.actor_local) self.hard_update(self.critic_target, self.critic_local) def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" self.memory.add(states, actions, rewards, next_states, dones) if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, states, add_noise=True): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(device) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() self.actor_local.train() if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): """Noise reset.""" self.noise.reset() self.i_step = 0 def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def hard_update(self, target, source): for target_param, source_param in zip(target.parameters(), source.parameters()): target_param.data.copy_(source_param.data)
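# A minimal interaction-loop sketch for the Agent above, assuming a continuous-control
# task with actions in [-1, 1] and the classic Gym API (env.reset() returns the observation,
# env.step() a 4-tuple). The environment id, episode count and step limit are illustrative
# assumptions, not part of the original code.
import gym

env = gym.make('MountainCarContinuous-v0')
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0],
              random_seed=2)

for i_episode in range(300):
    state = env.reset()
    agent.reset()  # reset the OU noise at the start of every episode
    for t in range(1000):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        # Store the transition and learn from a sampled minibatch once enough data is available
        agent.step(state, action, reward, next_state, done)
        state = next_state
        if done:
            break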
class BiCNet(): def __init__(self, s_dim, a_dim, n_agents, **kwargs): self.s_dim = s_dim self.a_dim = a_dim self.config = kwargs['config'] self.n_agents = n_agents self.device = 'cuda' if self.config.use_cuda else 'cpu' # Networks self.policy = Actor(s_dim, a_dim, n_agents) self.policy_target = Actor(s_dim, a_dim, n_agents) self.critic = Critic(s_dim, a_dim, n_agents) self.critic_target = Critic(s_dim, a_dim, n_agents) if self.config.use_cuda: self.policy.cuda() self.policy_target.cuda() self.critic.cuda() self.critic_target.cuda() self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=self.config.a_lr) self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.c_lr) hard_update(self.policy, self.policy_target) hard_update(self.critic, self.critic_target) self.random_process = OrnsteinUhlenbeckProcess( size=self.a_dim, theta=self.config.ou_theta, mu=self.config.ou_mu, sigma=self.config.ou_sigma) self.replay_buffer = list() self.epsilon = 1. self.depsilon = self.epsilon / self.config.epsilon_decay self.c_loss = None self.a_loss = None self.action_log = list() def choose_action(self, obs, noisy=True): obs = torch.Tensor([obs]).to(self.device) action = self.policy(obs).cpu().detach().numpy()[0] self.action_log.append(action) if noisy: for agent_idx in range(self.n_agents): pass # action[agent_idx] += self.epsilon * self.random_process.sample() self.epsilon -= self.depsilon self.epsilon = max(self.epsilon, 0.001) np.clip(action, -1., 1.) return action def reset(self): self.random_process.reset_states() self.action_log.clear() def prep_train(self): self.policy.train() self.critic.train() self.policy_target.train() self.critic_target.train() def prep_eval(self): self.policy.eval() self.critic.eval() self.policy_target.eval() self.critic_target.eval() def random_action(self): return np.random.uniform(low=-1, high=1, size=(self.n_agents, 2)) def memory(self, s, a, r, s_, done): self.replay_buffer.append((s, a, r, s_, done)) if len(self.replay_buffer) >= self.config.memory_length: self.replay_buffer.pop(0) def get_batches(self): experiences = random.sample(self.replay_buffer, self.config.batch_size) state_batches = np.array([_[0] for _ in experiences]) action_batches = np.array([_[1] for _ in experiences]) reward_batches = np.array([_[2] for _ in experiences]) next_state_batches = np.array([_[3] for _ in experiences]) done_batches = np.array([_[4] for _ in experiences]) return state_batches, action_batches, reward_batches, next_state_batches, done_batches def train(self): state_batches, action_batches, reward_batches, next_state_batches, done_batches = self.get_batches( ) state_batches = torch.Tensor(state_batches).to(self.device) action_batches = torch.Tensor(action_batches).to(self.device) reward_batches = torch.Tensor(reward_batches).reshape( self.config.batch_size, self.n_agents, 1).to(self.device) next_state_batches = torch.Tensor(next_state_batches).to(self.device) done_batches = torch.Tensor( (done_batches == False) * 1).reshape(self.config.batch_size, self.n_agents, 1).to(self.device) target_next_actions = self.policy_target.forward(next_state_batches) target_next_q = self.critic_target.forward(next_state_batches, target_next_actions) main_q = self.critic(state_batches, action_batches) ''' How to concat each agent's Q value? 
''' #target_next_q = target_next_q #main_q = main_q.mean(dim=1) ''' Reward Norm ''' # reward_batches = (reward_batches - reward_batches.mean(dim=0)) / reward_batches.std(dim=0) / 1024 # Critic Loss self.critic.zero_grad() baselines = reward_batches + done_batches * self.config.gamma * target_next_q loss_critic = torch.nn.MSELoss()(main_q, baselines.detach()) loss_critic.backward() torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) self.critic_optimizer.step() # Actor Loss self.policy.zero_grad() clear_action_batches = self.policy.forward(state_batches) loss_actor = -self.critic.forward(state_batches, clear_action_batches).mean() loss_actor += (clear_action_batches**2).mean() * 1e-3 loss_actor.backward() torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 0.5) self.policy_optimizer.step() # This is for logging self.c_loss = loss_critic.item() self.a_loss = loss_actor.item() soft_update(self.policy, self.policy_target, self.config.tau) soft_update(self.critic, self.critic_target, self.config.tau) def get_loss(self): return self.c_loss, self.a_loss def get_action_std(self): return np.array(self.action_log).std(axis=-1).mean()
class Agent:
    def __init__(self, replay_buffer, noise, state_dim, action_dim, seed, fc1_units=256, fc2_units=128,
                 device="cpu", lr_actor=1e-4, lr_critic=1e-3, batch_size=128, discount=0.99, tau=1e-3):
        torch.manual_seed(seed)
        self.actor_local = Actor(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)
        self.critic_local = Critic(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)
        self.actor_optimizer = optim.Adam(params=self.actor_local.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(params=self.critic_local.parameters(), lr=lr_critic)
        self.actor_target = Actor(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)
        self.critic_target = Critic(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)
        self.buffer = replay_buffer
        self.noise = noise
        self.device = device
        self.batch_size = batch_size
        self.discount = discount
        self.tau = tau
        # Start the target networks with the same weights as the local networks
        Agent.hard_update(model_local=self.actor_local, model_target=self.actor_target)
        Agent.hard_update(model_local=self.critic_local, model_target=self.critic_target)

    def step(self, states, actions, rewards, next_states, dones):
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            self.buffer.add(state=state, action=action, reward=reward, next_state=next_state, done=done)
        if self.buffer.size() >= self.batch_size:
            experiences = self.buffer.sample(self.batch_size)
            self.learn(self.to_tensor(experiences))

    def to_tensor(self, experiences):
        states, actions, rewards, next_states, dones = experiences
        states = torch.from_numpy(states).float().to(self.device)
        actions = torch.from_numpy(actions).float().to(self.device)
        rewards = torch.from_numpy(rewards).float().to(self.device)
        next_states = torch.from_numpy(next_states).float().to(self.device)
        dones = torch.from_numpy(dones.astype(np.uint8)).float().to(self.device)
        return states, actions, rewards, next_states, dones

    def act(self, states, add_noise=True):
        states = torch.from_numpy(states).float().to(device=self.device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # Update critic
        next_actions = self.actor_target(next_states)
        q_target_next = self.critic_target(next_states, next_actions)
        q_target = rewards + self.discount * q_target_next * (1.0 - dones)
        q_local = self.critic_local(states, actions)
        critic_loss = F.mse_loss(input=q_local, target=q_target)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update actor (maximise the critic's value of the actor's actions)
        actor_objective = self.critic_local(states, self.actor_local(states)).mean()
        self.actor_optimizer.zero_grad()
        (-actor_objective).backward()
        self.actor_optimizer.step()

        Agent.soft_update(model_local=self.critic_local, model_target=self.critic_target, tau=self.tau)
        Agent.soft_update(model_local=self.actor_local, model_target=self.actor_target, tau=self.tau)

    @staticmethod
    def soft_update(model_local, model_target, tau):
        for local_param, target_param in zip(model_local.parameters(), model_target.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    @staticmethod
    def hard_update(model_local, model_target):
        Agent.soft_update(model_local=model_local, model_target=model_target, tau=1.0)

    def reset(self):
        self.noise.reset()
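# The Agent above is handed a replay buffer exposing add(...), size() and sample(batch_size)
# that returns numpy arrays (as consumed by to_tensor). A minimal in-memory sketch matching
# that interface is given below; it is an assumed helper for illustration, not the original buffer.
import random
from collections import deque

import numpy as np


class SimpleReplayBuffer:
    def __init__(self, max_size=int(1e6)):
        self.memory = deque(maxlen=max_size)

    def add(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def size(self):
        return len(self.memory)

    def sample(self, batch_size):
        batch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        # Rewards and dones get a trailing dimension so they broadcast against the critic output
        return (states.astype(np.float32), actions.astype(np.float32),
                rewards.reshape(-1, 1).astype(np.float32),
                next_states.astype(np.float32), dones.reshape(-1, 1))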
class DDPG: def __init__(self, beta, epsilon, learning_rate, gamma, tau, hidden_size_dim0, hidden_size_dim1, num_inputs, action_space, train_mode, alpha, replay_size, optimizer, two_player, normalize_obs=True, normalize_returns=False, critic_l2_reg=1e-2): if torch.cuda.is_available(): self.device = torch.device('cuda') torch.backends.cudnn.enabled = False self.Tensor = torch.cuda.FloatTensor else: self.device = torch.device('cpu') self.Tensor = torch.FloatTensor self.alpha = alpha self.train_mode = train_mode self.num_inputs = num_inputs self.action_space = action_space self.critic_l2_reg = critic_l2_reg self.actor = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) self.adversary = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) if self.train_mode: self.actor_target = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) self.actor_bar = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) self.actor_outer = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) if(optimizer == 'SGLD'): self.actor_optim = SGLD(self.actor.parameters(), lr=1e-4, noise=epsilon, alpha=0.999) elif(optimizer == 'RMSprop'): self.actor_optim = RMSprop(self.actor.parameters(), lr=1e-4, alpha=0.999) else: self.actor_optim = ExtraAdam(self.actor.parameters(), lr=1e-4) self.critic = Critic(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) self.critic_target = Critic(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) self.critic_optim = Adam(self.critic.parameters(), lr=1e-3, weight_decay=critic_l2_reg) self.adversary_target = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) self.adversary_bar = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) self.adversary_outer = Actor(hidden_size_dim0, hidden_size_dim1, self.num_inputs, self.action_space).to(self.device) if(optimizer == 'SGLD'): self.adversary_optim = SGLD(self.adversary.parameters(), lr=1e-4, noise=epsilon, alpha=0.999) elif(optimizer == 'RMSprop'): self.adversary_optim = RMSprop(self.adversary.parameters(), lr=1e-4, alpha=0.999) else: self.adversary_optim = ExtraAdam(self.adversary.parameters(), lr=1e-4) hard_update(self.adversary_target, self.adversary) # Make sure target is with the same weight hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) self.gamma = gamma self.tau = tau self.beta = beta self.epsilon = epsilon self.learning_rate = learning_rate self.normalize_observations = normalize_obs self.normalize_returns = normalize_returns self.optimizer = optimizer self.two_player = two_player if self.normalize_observations: self.obs_rms = RunningMeanStd(shape=num_inputs) else: self.obs_rms = None if self.normalize_returns: self.ret_rms = RunningMeanStd(shape=1) self.ret = 0 self.cliprew = 10.0 else: self.ret_rms = None self.memory = ReplayMemory(replay_size) def eval(self): self.actor.eval() self.adversary.eval() if self.train_mode: self.critic.eval() def train(self): self.actor.train() self.adversary.train() if self.train_mode: self.critic.train() def select_action(self, state, action_noise=None, param_noise=None, mdp_type='mdp'): state = normalize(Variable(state).to(self.device), self.obs_rms, self.device) if 
mdp_type != 'mdp': if(self.optimizer == 'SGLD' and self.two_player): mu = self.actor_outer(state) else: mu = self.actor(state) mu = mu.data if action_noise is not None: mu += self.Tensor(action_noise()).to(self.device) mu = mu.clamp(-1, 1) * (1 - self.alpha) if(self.optimizer == 'SGLD' and self.two_player): adv_mu = self.adversary_outer(state) else: adv_mu = self.adversary(state) adv_mu = adv_mu.data.clamp(-1, 1) * self.alpha mu += adv_mu else: if(self.optimizer == 'SGLD' and self.two_player): mu = self.actor_outer(state) else: mu = self.actor(state) mu = mu.data if action_noise is not None: mu += self.Tensor(action_noise()).to(self.device) mu = mu.clamp(-1, 1) return mu def update_robust_non_flip(self, state_batch, action_batch, reward_batch, mask_batch, next_state_batch, mdp_type, robust_update_type): # TRAIN CRITIC if robust_update_type == 'full': next_action_batch = (1 - self.alpha) * self.actor_target(next_state_batch) \ + self.alpha * self.adversary_target(next_state_batch) next_state_action_values = self.critic_target(next_state_batch, next_action_batch) expected_state_action_batch = reward_batch + self.gamma * mask_batch * next_state_action_values self.critic_optim.zero_grad() state_action_batch = self.critic(state_batch, action_batch) value_loss = F.mse_loss(state_action_batch, expected_state_action_batch) value_loss.backward() self.critic_optim.step() value_loss = value_loss.item() else: value_loss = 0 # TRAIN ADVERSARY self.adversary_optim.zero_grad() with torch.no_grad(): if(self.optimizer == 'SGLD' and self.two_player): real_action = self.actor_outer(next_state_batch) else: real_action = self.actor_target(next_state_batch) action = (1 - self.alpha) * real_action + self.alpha * self.adversary(next_state_batch) adversary_loss = self.critic(state_batch, action) adversary_loss = adversary_loss.mean() adversary_loss.backward() self.adversary_optim.step() adversary_loss = adversary_loss.item() # TRAIN ACTOR self.actor_optim.zero_grad() with torch.no_grad(): if(self.optimizer == 'SGLD' and self.two_player): adversary_action = self.adversary_outer(next_state_batch) else: adversary_action = self.adversary_target(next_state_batch) action = (1 - self.alpha) * self.actor(next_state_batch) + self.alpha * adversary_action policy_loss = -self.critic(state_batch, action) policy_loss = policy_loss.mean() policy_loss.backward() self.actor_optim.step() policy_loss = policy_loss.item() return value_loss, policy_loss, adversary_loss def update_robust_flip(self, state_batch, action_batch, reward_batch, mask_batch, next_state_batch, adversary_update, mdp_type, robust_update_type): # TRAIN CRITIC if robust_update_type == 'full': next_action_batch = (1 - self.alpha) * self.actor_target(next_state_batch) \ + self.alpha * self.adversary_target(next_state_batch) next_state_action_values = self.critic_target(next_state_batch, next_action_batch) expected_state_action_batch = reward_batch + self.gamma * mask_batch * next_state_action_values self.critic_optim.zero_grad() state_action_batch = self.critic(state_batch, action_batch) value_loss = F.mse_loss(state_action_batch, expected_state_action_batch) value_loss.backward() self.critic_optim.step() value_loss = value_loss.item() else: value_loss = 0 if adversary_update: # TRAIN ADVERSARY self.adversary_optim.zero_grad() with torch.no_grad(): real_action = self.actor_target(next_state_batch) action = (1 - self.alpha) * real_action + self.alpha * self.adversary(next_state_batch) adversary_loss = self.critic(state_batch, action) adversary_loss = 
adversary_loss.mean() adversary_loss.backward() self.adversary_optim.step() adversary_loss = adversary_loss.item() policy_loss = 0 else: # TRAIN ACTOR self.actor_optim.zero_grad() with torch.no_grad(): adversary_action = self.adversary_target(next_state_batch) action = (1 - self.alpha) * self.actor(next_state_batch) + self.alpha * adversary_action policy_loss = -self.critic(state_batch, action) policy_loss = policy_loss.mean() policy_loss.backward() self.actor_optim.step() policy_loss = policy_loss.item() adversary_loss = 0 return value_loss, policy_loss, adversary_loss def update_non_robust(self, state_batch, action_batch, reward_batch, mask_batch, next_state_batch): # TRAIN CRITIC next_action_batch = self.actor_target(next_state_batch) next_state_action_values = self.critic_target(next_state_batch, next_action_batch) expected_state_action_batch = reward_batch + self.gamma * mask_batch * next_state_action_values self.critic_optim.zero_grad() state_action_batch = self.critic(state_batch, action_batch) value_loss = F.mse_loss(state_action_batch, expected_state_action_batch) value_loss.backward() self.critic_optim.step() # TRAIN ACTOR self.actor_optim.zero_grad() action = self.actor(next_state_batch) policy_loss = -self.critic(state_batch, action) policy_loss = policy_loss.mean() policy_loss.backward() self.actor_optim.step() policy_loss = policy_loss.item() adversary_loss = 0 return value_loss.item(), policy_loss, adversary_loss def store_transition(self, state, action, mask, next_state, reward): B = state.shape[0] for b in range(B): self.memory.push(state[b], action[b], mask[b], next_state[b], reward[b]) if self.normalize_observations: self.obs_rms.update(state[b].cpu().numpy()) if self.normalize_returns: self.ret = self.ret * self.gamma + reward[b] self.ret_rms.update(np.array([self.ret])) if mask[b] == 0: # if terminal is True self.ret = 0 def update_parameters(self, batch_size, sgld_outer_update, mdp_type='mdp', exploration_method='mdp'): transitions = self.memory.sample(batch_size) batch = Transition(*zip(*transitions)) if mdp_type != 'mdp': robust_update_type = 'full' elif exploration_method != 'mdp': robust_update_type = 'adversary' else: robust_update_type = None state_batch = normalize(Variable(torch.stack(batch.state)).to(self.device), self.obs_rms, self.device) action_batch = Variable(torch.stack(batch.action)).to(self.device) reward_batch = normalize(Variable(torch.stack(batch.reward)).to(self.device).unsqueeze(1), self.ret_rms, self.device) mask_batch = Variable(torch.stack(batch.mask)).to(self.device).unsqueeze(1) next_state_batch = normalize(Variable(torch.stack(batch.next_state)).to(self.device), self.obs_rms, self.device) if self.normalize_returns: reward_batch = torch.clamp(reward_batch, -self.cliprew, self.cliprew) value_loss = 0 policy_loss = 0 adversary_loss = 0 if robust_update_type is not None: _value_loss, _policy_loss, _adversary_loss = self.update_robust_non_flip(state_batch, action_batch, reward_batch, mask_batch, next_state_batch, mdp_type, robust_update_type) value_loss += _value_loss policy_loss += _policy_loss adversary_loss += _adversary_loss if robust_update_type != 'full': _value_loss, _policy_loss, _adversary_loss = self.update_non_robust(state_batch, action_batch, reward_batch, mask_batch, next_state_batch) value_loss += _value_loss policy_loss += _policy_loss adversary_loss += _adversary_loss if(self.optimizer == 'SGLD' and self.two_player): self.sgld_inner_update() self.soft_update() if(sgld_outer_update and self.optimizer == 'SGLD' and 
self.two_player): self.sgld_outer_update() return value_loss, policy_loss, adversary_loss def initialize(self): hard_update(self.actor_bar, self.actor_outer) hard_update(self.adversary_bar, self.adversary_outer) hard_update(self.actor, self.actor_outer) hard_update(self.adversary, self.adversary_outer) def sgld_inner_update(self): #target source sgld_update(self.actor_bar, self.actor, self.beta) sgld_update(self.adversary_bar, self.adversary, self.beta) def sgld_outer_update(self): #target source sgld_update(self.actor_outer, self.actor_bar, self.beta) sgld_update(self.adversary_outer, self.adversary_bar, self.beta) def soft_update(self): soft_update(self.actor_target, self.actor, self.tau) soft_update(self.adversary_target, self.adversary, self.tau) soft_update(self.critic_target, self.critic, self.tau)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.t_step = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = self.t_step + 1
        if self.t_step % UPDATE_EVERY == 0:
            # If enough samples are available in memory, get random subsets and learn
            if len(self.memory) > BATCH_SIZE:
                for i in range(10):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += 0.4 * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class IAC(): def __init__(self,action_dim,state_dim,agentParam,useLaw,useCenCritc,num_agent,CNN=False, width=None, height=None, channel=None): self.CNN = CNN self.device = agentParam["device"] if CNN: self.CNN_preprocessA = CNN_preprocess(width,height,channel) self.CNN_preprocessC = CNN_preprocess(width,height,channel) state_dim = self.CNN_preprocessA.get_state_dim() #if agentParam["ifload"]: #self.actor = torch.load(agentParam["filename"]+"actor_"+agentParam["id"]+".pth",map_location = torch.device('cuda')) #self.critic = torch.load(agentParam["filename"]+"critic_"+agentParam["id"]+".pth",map_location = torch.device('cuda')) #else: if useLaw: self.actor = ActorLaw(action_dim,state_dim).to(self.device) else: self.actor = Actor(action_dim,state_dim).to(self.device) if useCenCritc: self.critic = Centralised_Critic(state_dim,num_agent).to(self.device) else: self.critic = Critic(state_dim).to(self.device) self.action_dim = action_dim self.state_dim = state_dim self.noise_epsilon = 0.99 self.constant_decay = 0.1 self.optimizerA = torch.optim.Adam(self.actor.parameters(), lr = 0.001) self.optimizerC = torch.optim.Adam(self.critic.parameters(), lr = 0.001) self.lr_scheduler = {"optA":torch.optim.lr_scheduler.StepLR(self.optimizerA,step_size=1000,gamma=0.9,last_epoch=-1), "optC":torch.optim.lr_scheduler.StepLR(self.optimizerC,step_size=1000,gamma=0.9,last_epoch=-1)} if CNN: # self.CNN_preprocessA = CNN_preprocess(width,height,channel) # self.CNN_preprocessC = CNN_preprocess self.optimizerA = torch.optim.Adam(itertools.chain(self.CNN_preprocessA.parameters(),self.actor.parameters()),lr=0.0001) self.optimizerC = torch.optim.Adam(itertools.chain(self.CNN_preprocessC.parameters(),self.critic.parameters()),lr=0.001) self.lr_scheduler = {"optA": torch.optim.lr_scheduler.StepLR(self.optimizerA, step_size=10000, gamma=0.9, last_epoch=-1), "optC": torch.optim.lr_scheduler.StepLR(self.optimizerC, step_size=10000, gamma=0.9, last_epoch=-1)} # self.act_prob # self.act_log_prob #@torchsnooper.snoop() def choose_action(self,s): s = torch.Tensor(s).unsqueeze(0).to(self.device) if self.CNN: s = self.CNN_preprocessA(s.reshape((1,3,15,15))) self.act_prob = self.actor(s) + torch.abs(torch.randn(self.action_dim)*0.05*self.constant_decay).to(self.device) self.constant_decay = self.constant_decay*self.noise_epsilon self.act_prob = self.act_prob/torch.sum(self.act_prob).detach() m = torch.distributions.Categorical(self.act_prob) # self.act_log_prob = m.log_prob(m.sample()) temp = m.sample() return temp def choose_act_prob(self,s): s = torch.Tensor(s).unsqueeze(0).to(self.device) self.act_prob = self.actor(s,[],False) return self.act_prob.detach() def choose_mask_action(self,s,pi): s = torch.Tensor(s).unsqueeze(0).to(self.device) if self.CNN: s = self.CNN_preprocessA(s.reshape((1,3,15,15))) self.act_prob = self.actor(s,pi,True) + torch.abs(torch.randn(self.action_dim)*0.05*self.constant_decay).to(self.device) self.constant_decay = self.constant_decay*self.noise_epsilon self.act_prob = self.act_prob/torch.sum(self.act_prob).detach() m = torch.distributions.Categorical(self.act_prob) # self.act_log_prob = m.log_prob(m.sample()) temp = m.sample() return temp def cal_tderr(self,s,r,s_,A_or_C=None): s = torch.Tensor(s).unsqueeze(0).to(self.device) s_ = torch.Tensor(s_).unsqueeze(0).to(self.device) if self.CNN: if A_or_C == 'A': s = self.CNN_preprocessA(s.reshape(1,3,15,15)) s_ = self.CNN_preprocessA(s_.reshape(1,3,15,15)) else: s = self.CNN_preprocessC(s.reshape(1,3,15,15)) s_ = self.CNN_preprocessC(s_.reshape(1,3,15,15)) v_ 
= self.critic(s_).detach() v = self.critic(s) return r + 0.9*v_ - v def td_err_sn(self, s_n, r, s_n_): s = torch.Tensor(s_n).reshape((1,-1)).unsqueeze(0).to(self.device) s_ = torch.Tensor(s_n_).reshape((1,-1)).unsqueeze(0).to(self.device) v = self.critic(s) v_ = self.critic(s_).detach() return r + 0.9*v_ - v def LearnCenCritic(self, s_n, r, s_n_): td_err = self.td_err_sn(s_n,r,s_n_) loss = torch.mul(td_err,td_err) self.optimizerC.zero_grad() loss.backward() self.optimizerC.step() self.lr_scheduler["optC"].step() def learnCenActor(self,s_n,r,s_n_,a): td_err = self.td_err_sn(s_n,r,s_n_) m = torch.log(self.act_prob[0][a]) temp = m*td_err.detach() loss = -torch.mean(temp) self.optimizerA.zero_grad() loss.backward() self.optimizerA.step() self.lr_scheduler["optA"].step() def learnCritic(self,s,r,s_): td_err = self.cal_tderr(s,r,s_) loss = torch.mul(td_err,td_err) self.optimizerC.zero_grad() loss.backward() self.optimizerC.step() self.lr_scheduler["optC"].step() #@torchsnooper.snoop() def learnActor(self,s,r,s_,a): td_err = self.cal_tderr(s,r,s_) m = torch.log(self.act_prob[0][a]) temp = m*td_err.detach() loss = -torch.mean(temp) self.optimizerA.zero_grad() loss.backward() self.optimizerA.step() self.lr_scheduler["optA"].step() def update_cent(self,s,r,s_,a,s_n,s_n_): self.LearnCenCritic(s_n,r,s_n_) self.learnCenActor(s_n,r,s_n_,a) def update(self,s,r,s_,a): self.learnCritic(s,r,s_) self.learnActor(s,r,s_,a)
class DDPG: def __init__(self, gamma, tau, hidden_size, num_inputs, action_space, train_mode, alpha, replay_size, normalize_obs=True, normalize_returns=False, critic_l2_reg=1e-2): if torch.cuda.is_available(): self.device = torch.device('cuda') torch.backends.cudnn.enabled = False self.Tensor = torch.cuda.FloatTensor else: self.device = torch.device('cpu') self.Tensor = torch.FloatTensor self.alpha = alpha self.train_mode = train_mode self.num_inputs = num_inputs self.action_space = action_space self.critic_l2_reg = critic_l2_reg self.actor = Actor(hidden_size, self.num_inputs, self.action_space).to(self.device) self.adversary = Actor(hidden_size, self.num_inputs, self.action_space).to(self.device) if self.train_mode: self.actor_target = Actor(hidden_size, self.num_inputs, self.action_space).to(self.device) self.actor_perturbed = Actor(hidden_size, self.num_inputs, self.action_space).to(self.device) self.actor_optim = Adam(self.actor.parameters(), lr=1e-4) self.critic = Critic(hidden_size, self.num_inputs, self.action_space).to(self.device) self.critic_target = Critic(hidden_size, self.num_inputs, self.action_space).to(self.device) self.critic_optim = Adam(self.critic.parameters(), lr=1e-3, weight_decay=critic_l2_reg) self.adversary_target = Actor(hidden_size, self.num_inputs, self.action_space).to(self.device) self.adversary_perturbed = Actor(hidden_size, self.num_inputs, self.action_space).to(self.device) self.adversary_optim = Adam(self.adversary.parameters(), lr=1e-4) hard_update( self.adversary_target, self.adversary) # Make sure target is with the same weight hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) self.gamma = gamma self.tau = tau self.normalize_observations = normalize_obs self.normalize_returns = normalize_returns if self.normalize_observations: self.obs_rms = RunningMeanStd(shape=num_inputs) else: self.obs_rms = None if self.normalize_returns: self.ret_rms = RunningMeanStd(shape=1) self.ret = 0 self.cliprew = 10.0 else: self.ret_rms = None self.memory = ReplayMemory(replay_size) def eval(self): self.actor.eval() self.adversary.eval() if self.train_mode: self.critic.eval() def train(self): self.actor.train() self.adversary.train() if self.train_mode: self.critic.train() def select_action(self, state, action_noise=None, param_noise=None, mdp_type='mdp'): state = normalize( Variable(state).to(self.device), self.obs_rms, self.device) if mdp_type != 'mdp': if mdp_type == 'nr_mdp': if param_noise is not None: mu = self.actor_perturbed(state) else: mu = self.actor(state) mu = mu.data if action_noise is not None: mu += self.Tensor(action_noise()).to(self.device) mu = mu.clamp(-1, 1) * (1 - self.alpha) if param_noise is not None: adv_mu = self.adversary_perturbed(state) else: adv_mu = self.adversary(state) adv_mu = adv_mu.data.clamp(-1, 1) * self.alpha mu += adv_mu else: # mdp_type == 'pr_mdp': if np.random.rand() < (1 - self.alpha): if param_noise is not None: mu = self.actor_perturbed(state) else: mu = self.actor(state) mu = mu.data if action_noise is not None: mu += self.Tensor(action_noise()).to(self.device) mu = mu.clamp(-1, 1) else: if param_noise is not None: mu = self.adversary_perturbed(state) else: mu = self.adversary(state) mu = mu.data.clamp(-1, 1) else: if param_noise is not None: mu = self.actor_perturbed(state) else: mu = self.actor(state) mu = mu.data if action_noise is not None: mu += self.Tensor(action_noise()).to(self.device) mu = mu.clamp(-1, 1) return mu def update_robust(self, 
state_batch, action_batch, reward_batch, mask_batch, next_state_batch, adversary_update, mdp_type, robust_update_type): # TRAIN CRITIC if robust_update_type == 'full': if mdp_type == 'nr_mdp': next_action_batch = (1 - self.alpha) * self.actor_target(next_state_batch) \ + self.alpha * self.adversary_target(next_state_batch) next_state_action_values = self.critic_target( next_state_batch, next_action_batch) else: # mdp_type == 'pr_mdp': next_action_actor_batch = self.actor_target(next_state_batch) next_action_adversary_batch = self.adversary_target( next_state_batch) next_state_action_values = self.critic_target(next_state_batch, next_action_actor_batch) * ( 1 - self.alpha) \ + self.critic_target(next_state_batch, next_action_adversary_batch) * self.alpha expected_state_action_batch = reward_batch + self.gamma * mask_batch * next_state_action_values self.critic_optim.zero_grad() state_action_batch = self.critic(state_batch, action_batch) value_loss = F.mse_loss(state_action_batch, expected_state_action_batch) value_loss.backward() self.critic_optim.step() value_loss = value_loss.item() else: value_loss = 0 if adversary_update: # TRAIN ADVERSARY self.adversary_optim.zero_grad() if mdp_type == 'nr_mdp': with torch.no_grad(): real_action = self.actor_target(next_state_batch) action = ( 1 - self.alpha ) * real_action + self.alpha * self.adversary(next_state_batch) adversary_loss = self.critic(state_batch, action) else: # mdp_type == 'pr_mdp' action = self.adversary(next_state_batch) adversary_loss = self.critic(state_batch, action) * self.alpha adversary_loss = adversary_loss.mean() adversary_loss.backward() self.adversary_optim.step() adversary_loss = adversary_loss.item() policy_loss = 0 else: if robust_update_type == 'full': # TRAIN ACTOR self.actor_optim.zero_grad() if mdp_type == 'nr_mdp': with torch.no_grad(): adversary_action = self.adversary_target( next_state_batch) action = (1 - self.alpha) * self.actor( next_state_batch) + self.alpha * adversary_action policy_loss = -self.critic(state_batch, action) else: # mdp_type == 'pr_mdp': action = self.actor(next_state_batch) policy_loss = -self.critic(state_batch, action) * ( 1 - self.alpha) policy_loss = policy_loss.mean() policy_loss.backward() self.actor_optim.step() policy_loss = policy_loss.item() adversary_loss = 0 else: policy_loss = 0 adversary_loss = 0 return value_loss, policy_loss, adversary_loss def update_non_robust(self, state_batch, action_batch, reward_batch, mask_batch, next_state_batch): # TRAIN CRITIC next_action_batch = self.actor_target(next_state_batch) next_state_action_values = self.critic_target(next_state_batch, next_action_batch) expected_state_action_batch = reward_batch + self.gamma * mask_batch * next_state_action_values self.critic_optim.zero_grad() state_action_batch = self.critic(state_batch, action_batch) value_loss = F.mse_loss(state_action_batch, expected_state_action_batch) value_loss.backward() self.critic_optim.step() # TRAIN ACTOR self.actor_optim.zero_grad() action = self.actor(next_state_batch) policy_loss = -self.critic(state_batch, action) policy_loss = policy_loss.mean() policy_loss.backward() self.actor_optim.step() policy_loss = policy_loss.item() adversary_loss = 0 return value_loss.item(), policy_loss, adversary_loss def store_transition(self, state, action, mask, next_state, reward): B = state.shape[0] for b in range(B): self.memory.push(state[b], action[b], mask[b], next_state[b], reward[b]) if self.normalize_observations: self.obs_rms.update(state[b].cpu().numpy()) if self.normalize_returns: 
self.ret = self.ret * self.gamma + reward[b] self.ret_rms.update(np.array([self.ret])) if mask[b] == 0: # if terminal is True self.ret = 0 def update_parameters(self, batch_size, mdp_type='mdp', adversary_update=False, exploration_method='mdp'): transitions = self.memory.sample(batch_size) batch = Transition(*zip(*transitions)) if mdp_type != 'mdp': robust_update_type = 'full' elif exploration_method != 'mdp': robust_update_type = 'adversary' else: robust_update_type = None state_batch = normalize( Variable(torch.stack(batch.state)).to(self.device), self.obs_rms, self.device) action_batch = Variable(torch.stack(batch.action)).to(self.device) reward_batch = normalize( Variable(torch.stack(batch.reward)).to(self.device).unsqueeze(1), self.ret_rms, self.device) mask_batch = Variable(torch.stack(batch.mask)).to( self.device).unsqueeze(1) next_state_batch = normalize( Variable(torch.stack(batch.next_state)).to(self.device), self.obs_rms, self.device) if self.normalize_returns: reward_batch = torch.clamp(reward_batch, -self.cliprew, self.cliprew) value_loss = 0 policy_loss = 0 adversary_loss = 0 if robust_update_type is not None: _value_loss, _policy_loss, _adversary_loss = self.update_robust( state_batch, action_batch, reward_batch, mask_batch, next_state_batch, adversary_update, mdp_type, robust_update_type) value_loss += _value_loss policy_loss += _policy_loss adversary_loss += _adversary_loss if robust_update_type != 'full': _value_loss, _policy_loss, _adversary_loss = self.update_non_robust( state_batch, action_batch, reward_batch, mask_batch, next_state_batch) value_loss += _value_loss policy_loss += _policy_loss adversary_loss += _adversary_loss self.soft_update() return value_loss, policy_loss, adversary_loss def soft_update(self): soft_update(self.actor_target, self.actor, self.tau) soft_update(self.adversary_target, self.adversary, self.tau) soft_update(self.critic_target, self.critic, self.tau) def perturb_actor_parameters(self, param_noise): """Apply parameter noise to actor model, for exploration""" hard_update(self.actor_perturbed, self.actor) params = self.actor_perturbed.state_dict() for name in params: if 'ln' in name: pass param = params[name] param += torch.randn(param.shape).to( self.device) * param_noise.current_stddev """Apply parameter noise to adversary model, for exploration""" hard_update(self.adversary_perturbed, self.adversary) params = self.adversary_perturbed.state_dict() for name in params: if 'ln' in name: pass param = params[name] param += torch.randn(param.shape).to( self.device) * param_noise.current_stddev
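# Hedged usage sketch (not from the source): one possible update schedule for the robust
# agent above, assuming a constructed `agent` whose replay memory has already been filled
# via store_transition(). `num_updates` and `adversary_ratio` are illustrative names.
num_updates = 100
adversary_ratio = 0.1  # fraction of robust updates that train the adversary instead of the actor
for i in range(num_updates):
    adversary_update = (i % 10) < int(10 * adversary_ratio)
    value_loss, policy_loss, adversary_loss = agent.update_parameters(
        batch_size=64,
        mdp_type='nr_mdp',            # noise-robust MDP; 'pr_mdp' mixes the critic targets instead of the actions
        adversary_update=adversary_update,
        exploration_method='mdp')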
class SAC: def __init__(self, n_states, n_actions): # hyper parameters self.replay_size = 1000000 self.experience_replay = deque(maxlen=self.replay_size) self.n_actions = n_actions self.n_states = n_states self.lr = 0.0003 self.batch_size = 128 self.gamma = 0.99 self.H = -2 self.Tau = 0.01 # actor network self.actor = Actor(n_states=n_states, n_actions=n_actions).to(DEVICE) # dual critic network, with corresponding targets self.critic = Critic(n_states=n_states, n_actions=n_actions).to(DEVICE) self.critic2 = Critic(n_states=n_states, n_actions=n_actions).to(DEVICE) self.target_critic = Critic(n_states=n_states, n_actions=n_actions).to(DEVICE) self.target_critic2 = Critic(n_states=n_states, n_actions=n_actions).to(DEVICE) # make the target critics start off same as the main networks for target_param, local_param in zip(self.target_critic.parameters(), self.critic.parameters()): target_param.data.copy_(local_param) for target_param, local_param in zip(self.target_critic2.parameters(), self.critic2.parameters()): target_param.data.copy_(local_param) # temperature variable self.log_alpha = torch.tensor(0.0, device=DEVICE, requires_grad=True) self.optim_alpha = Adam(params=[self.log_alpha], lr=self.lr) self.alpha = 0.2 self.optim_actor = Adam(params=self.actor.parameters(), lr=self.lr) self.optim_critic = Adam(params=self.critic.parameters(), lr=self.lr) self.optim_critic_2 = Adam(params=self.critic2.parameters(), lr=self.lr) def get_v(self, state_batch, action, log_action_probs): # TODO: move code from main train() function return def train_actor(self, s_currs): # TODO: move code from main train() function return def train_alpha(self, s_currs, log_action_probs): # TODO: move code from main train() function return def train_critic(self, s_currs, a_currs, r, s_nexts, dones, a_nexts, log_action_probs_next): # TODO: move code from main train() function return def process_batch(self, x_batch): s_currs = torch.zeros((self.batch_size, self.n_states)) a_currs = torch.zeros((self.batch_size, self.n_actions)) r = torch.zeros((self.batch_size, 1)) s_nexts = torch.zeros((self.batch_size, self.n_states)) dones = torch.zeros((self.batch_size, 1)) for batch in range(self.batch_size): s_currs[batch] = x_batch[batch].s_curr a_currs[batch] = x_batch[batch].a_curr r[batch] = x_batch[batch].reward s_nexts[batch] = x_batch[batch].s_next dones[batch] = x_batch[batch].done dones = dones.float() return s_currs.to(DEVICE), a_currs.to(DEVICE), r.to( DEVICE), s_nexts.to(DEVICE), dones.to(DEVICE) def train(self, x_batch): s_currs, a_currs, r, s_nexts, dones = self.process_batch( x_batch=x_batch) a_nexts, log_action_probs_next = self.actor.get_action(s_nexts, train=True) predicts = self.critic(s_currs, a_currs) # (batch, actions) predicts2 = self.critic2(s_currs, a_currs) q_values = self.target_critic(s_nexts, a_nexts).detach() # (batch, 1) q_values_2 = self.target_critic2(s_nexts, a_nexts).detach() value = torch.min(q_values, q_values_2) - self.alpha * log_action_probs_next target = r + ((1 - dones) * self.gamma * value.detach()) loss = mse_loss_function(predicts, target) self.optim_critic.zero_grad() loss.backward() self.optim_critic.step() loss2 = mse_loss_function(predicts2, target) self.optim_critic_2.zero_grad() loss2.backward() self.optim_critic_2.step() sample_action, log_action_probs = self.actor.get_action(state=s_currs, train=True) q_values_new = self.critic(s_currs, sample_action) q_values_new_2 = self.critic2(s_currs, sample_action) loss_actor = (self.alpha * log_action_probs) - torch.min( q_values_new, 
q_values_new_2) loss_actor = torch.mean(loss_actor) self.optim_actor.zero_grad() loss_actor.backward() self.optim_actor.step() alpha_loss = torch.mean((-1 * torch.exp(self.log_alpha)) * (log_action_probs.detach() + self.H)) self.optim_alpha.zero_grad() alpha_loss.backward() self.optim_alpha.step() self.alpha = torch.exp(self.log_alpha) self.update_weights() return def update_weights(self): for target_param, local_param in zip(self.target_critic.parameters(), self.critic.parameters()): target_param.data.copy_(self.Tau * local_param.data + (1.0 - self.Tau) * target_param.data) for target_param, local_param in zip(self.target_critic2.parameters(), self.critic2.parameters()): target_param.data.copy_(self.Tau * local_param.data + (1.0 - self.Tau) * target_param.data)
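# Hedged usage sketch (assumption, not from the source): train() above indexes the batch as
# x_batch[i].s_curr / .a_curr / .reward / .s_next / .done, so any transition container with
# those attributes works. A namedtuple plus the deque-based experience_replay is one option;
# `s, a, r, s_next, done` stand in for a transition collected from the environment, and the
# dimensions passed to SAC() are illustrative.
import random
from collections import namedtuple

Transition = namedtuple('Transition', ['s_curr', 'a_curr', 'reward', 's_next', 'done'])

agent = SAC(n_states=3, n_actions=1)
agent.experience_replay.append(
    Transition(torch.tensor(s, dtype=torch.float32),
               torch.tensor(a, dtype=torch.float32),
               torch.tensor([r], dtype=torch.float32),
               torch.tensor(s_next, dtype=torch.float32),
               torch.tensor([float(done)])))
if len(agent.experience_replay) >= agent.batch_size:
    agent.train(random.sample(agent.experience_replay, agent.batch_size))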
class DDPG(Policy): def __init__(self, gamma, tau, num_inputs, action_space, replay_size, normalize_obs=True, normalize_returns=False, critic_l2_reg=1e-2, num_outputs=1, entropy_coeff=0.1, action_coeff=0.1): super(DDPG, self).__init__(gamma=gamma, tau=tau, num_inputs=num_inputs, action_space=action_space, replay_size=replay_size, normalize_obs=normalize_obs, normalize_returns=normalize_returns) self.num_outputs = num_outputs self.entropy_coeff = entropy_coeff self.action_coeff = action_coeff self.critic_l2_reg = critic_l2_reg self.actor = Actor(self.num_inputs, self.action_space, self.num_outputs).to(self.device) self.actor_target = Actor(self.num_inputs, self.action_space, self.num_outputs).to(self.device) self.actor_perturbed = Actor(self.num_inputs, self.action_space, self.num_outputs).to(self.device) self.actor_optim = Adam(self.actor.parameters(), lr=1e-4) self.critic = Critic(self.num_inputs + self.action_space.shape[0]).to( self.device) self.critic_target = Critic(self.num_inputs + self.action_space.shape[0]).to(self.device) self.critic_optim = Adam(self.critic.parameters(), lr=1e-3, weight_decay=critic_l2_reg) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) def eval(self): self.actor.eval() self.critic.eval() def train(self): self.actor.train() self.critic.train() def policy(self, actor, state): return actor(state), None def update_critic(self, state_batch, action_batch, reward_batch, mask_batch, next_state_batch): batch_size = state_batch.shape[0] with torch.no_grad(): tiled_next_state_batch = self._tile(next_state_batch, 0, self.num_outputs) next_action_batch, _, next_probs, _ = self.actor_target( next_state_batch) next_state_action_values = (self.critic_target( tiled_next_state_batch, next_action_batch.view( batch_size * self.num_outputs, -1))[0].view( batch_size, self.num_outputs) * next_probs).sum(-1).unsqueeze(-1) expected_state_action_batch = reward_batch + self.gamma * mask_batch * next_state_action_values self.critic_optim.zero_grad() state_action_batch = self.critic(state_batch, action_batch)[0] value_loss = F.mse_loss(state_action_batch, expected_state_action_batch) value_loss.backward() self.critic_optim.step() return value_loss.item() def update_actor(self, state_batch): batch_size = state_batch.shape[0] tiled_state_batch = self._tile(state_batch, 0, self.num_outputs) action_batch, _, probs, dist_entropy = self.actor(state_batch) policy_loss = -(self.critic_target( tiled_state_batch, action_batch.view(batch_size * self.num_outputs, -1))[0].view( batch_size, self.num_outputs) * probs).sum(-1) entropy_loss = dist_entropy * self.entropy_coeff action_mse = 0 action_batch = action_batch.view(batch_size, self.num_outputs, -1) for idx1 in range(self.num_outputs): for idx2 in range(idx1 + 1, self.num_outputs): action_mse += ( (action_batch[:, idx1, :] - action_batch[:, idx2, :])** 2).mean() * self.action_coeff / self.num_outputs return policy_loss - entropy_loss - action_mse def soft_update(self): soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau)
class SAC: def __init__(self, env, lr=3e-4, gamma=0.99, polyak=5e-3, alpha=0.2, reward_scale=1.0, cuda=True, writer=None): state_size = env.observation_space.shape[0] action_size = env.action_space.shape[0] self.actor = Actor(state_size, action_size) self.critic = Critic(state_size, action_size) self.target_critic = Critic(state_size, action_size).eval() self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr) self.q1_optimizer = optim.Adam(self.critic.q1.parameters(), lr=lr) self.q2_optimizer = optim.Adam(self.critic.q2.parameters(), lr=lr) self.target_critic.load_state_dict(self.critic.state_dict()) for param in self.target_critic.parameters(): param.requires_grad = False self.memory = ReplayMemory() self.gamma = gamma self.alpha = alpha self.polyak = polyak # Always between 0 and 1, usually close to 1 self.reward_scale = reward_scale self.writer = writer self.cuda = cuda if cuda: self.actor = self.actor.to('cuda') self.critic = self.critic.to('cuda') self.target_critic = self.target_critic.to('cuda') def explore(self, state): if self.cuda: state = torch.tensor(state).unsqueeze(0).to('cuda', torch.float) action, _, _ = self.actor.sample(state) # action, _ = self.actor(state) return action.cpu().detach().numpy().reshape(-1) def exploit(self, state): if self.cuda: state = torch.tensor(state).unsqueeze(0).to('cuda', torch.float) _, _, action = self.actor.sample(state) return action.cpu().detach().numpy().reshape(-1) def store_step(self, state, action, next_state, reward, terminal): state = to_tensor_unsqueeze(state) if action.dtype == np.float32: action = torch.from_numpy(action) next_state = to_tensor_unsqueeze(next_state) reward = torch.from_numpy(np.array([reward]).astype(np.float)) terminal = torch.from_numpy(np.array([terminal]).astype(np.uint8)) self.memory.push(state, action, next_state, reward, terminal) def target_update(self, target_net, net): for t, s in zip(target_net.parameters(), net.parameters()): # t.data.copy_(t.data * (1.0 - self.polyak) + s.data * self.polyak) t.data.mul_(1.0 - self.polyak) t.data.add_(self.polyak * s.data) def calc_target_q(self, next_states, rewards, terminals): with torch.no_grad(): next_action, entropy, _ = self.actor.sample( next_states) # penalty term next_q1, next_q2 = self.target_critic(next_states, next_action) next_q = torch.min(next_q1, next_q2) - self.alpha * entropy target_q = rewards * self.reward_scale + ( 1. 
- terminals) * self.gamma * next_q return target_q def calc_critic_loss(self, states, actions, next_states, rewards, terminals): q1, q2 = self.critic(states, actions) target_q = self.calc_target_q(next_states, rewards, terminals) q1_loss = torch.mean((q1 - target_q).pow(2)) q2_loss = torch.mean((q2 - target_q).pow(2)) return q1_loss, q2_loss def calc_actor_loss(self, states): action, entropy, _ = self.actor.sample(states) q1, q2 = self.critic(states, action) q = torch.min(q1, q2) # actor_loss = torch.mean(-q - self.alpha * entropy) actor_loss = (self.alpha * entropy - q).mean() return actor_loss, entropy def train(self, timestep, batch_size=256): if len(self.memory) < batch_size: return transitions = self.memory.sample(batch_size) transitions = Transition(*zip(*transitions)) if self.cuda: states = torch.cat(transitions.state).to('cuda') actions = torch.stack(transitions.action).to('cuda') next_states = torch.cat(transitions.next_state).to('cuda') rewards = torch.stack(transitions.reward).to('cuda') terminals = torch.stack(transitions.terminal).to('cuda') else: states = torch.cat(transitions.state) actions = torch.stack(transitions.action) next_states = torch.cat(transitions.next_state) rewards = torch.stack(transitions.reward) terminals = torch.stack(transitions.terminal) # Compute target Q func q1_loss, q2_loss = self.calc_critic_loss(states, actions, next_states, rewards, terminals) # Compute actor loss actor_loss, mean = self.calc_actor_loss(states) update_params(self.q1_optimizer, self.critic.q1, q1_loss) update_params(self.q2_optimizer, self.critic.q2, q2_loss) update_params(self.actor_optimizer, self.actor, actor_loss) # target update self.target_update(self.target_critic, self.critic) if timestep % 100 and self.writer: self.writer.add_scalar('Loss/Actor', actor_loss.item(), timestep) self.writer.add_scalar('Loss/Critic', q1_loss.item(), timestep) def save_weights(self, path): self.actor.save(os.path.join(path, 'actor')) self.critic.save(os.path.join(path, 'critic'))
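# Hedged usage sketch (assumption): a minimal driver for the SAC agent above using the classic
# gym step/reset API. Note that explore()/exploit() as written only convert the state to a tensor
# when cuda=True, so this sketch assumes a CUDA machine, and store_step() still uses the
# deprecated np.float alias, which very recent NumPy versions reject. The environment name and
# step budget are illustrative.
import gym

env = gym.make('Pendulum-v1')
agent = SAC(env, cuda=True)
state = env.reset()
for t in range(100000):
    action = agent.explore(state)                        # stochastic action for data collection
    next_state, reward, done, _ = env.step(action)
    agent.store_step(state, action, next_state, reward, done)
    agent.train(t)                                       # no-op until the buffer holds a full batch
    state = env.reset() if done else next_state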
class DDPGAgent(object): """ General class for DDPG agents (policy, critic, target policy, target critic, exploration noise) """ def __init__(self, num_in_pol, num_out_pol, num_in_critic, hidden_dim_actor=120, hidden_dim_critic=64,lr_actor=0.01,lr_critic=0.01,batch_size=64, max_episode_len=100,tau=0.02,gamma = 0.99,agent_name='one', discrete_action=False): """ Inputs: num_in_pol (int): number of dimensions for policy input num_out_pol (int): number of dimensions for policy output num_in_critic (int): number of dimensions for critic input """ self.policy = Actor(num_in_pol, num_out_pol, hidden_dim=hidden_dim_actor, discrete_action=discrete_action) self.critic = Critic(num_in_pol, 1,num_out_pol, hidden_dim=hidden_dim_critic) self.target_policy = Actor(num_in_pol, num_out_pol, hidden_dim=hidden_dim_actor, discrete_action=discrete_action) self.target_critic = Critic(num_in_pol, 1,num_out_pol, hidden_dim=hidden_dim_critic) hard_update(self.target_policy, self.policy) hard_update(self.target_critic, self.critic) self.policy_optimizer = Adam(self.policy.parameters(), lr=lr_actor) self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic,weight_decay=0) self.policy = self.policy.float() self.critic = self.critic.float() self.target_policy = self.target_policy.float() self.target_critic = self.target_critic.float() self.agent_name = agent_name self.gamma = gamma self.tau = tau self.batch_size = batch_size #self.replay_buffer = ReplayBuffer(1e7) self.replay_buffer = ReplayBufferOption(500000,self.batch_size,12) self.max_replay_buffer_len = batch_size * max_episode_len self.replay_sample_index = None self.niter = 0 self.eps = 5.0 self.eps_decay = 1/(250*5) self.exploration = OUNoise(num_out_pol) self.discrete_action = discrete_action self.num_history = 2 self.states = [] self.actions = [] self.rewards = [] self.next_states = [] self.dones = [] def reset_noise(self): if not self.discrete_action: self.exploration.reset() def scale_noise(self, scale): if self.discrete_action: self.exploration = scale else: self.exploration.scale = scale def act(self, obs, explore=False): """ Take a step forward in environment for a minibatch of observations Inputs: obs : Observations for this agent explore (boolean): Whether or not to add exploration noise Outputs: action (PyTorch Variable): Actions for this agent """ #obs = obs.reshape(1,48) state = Variable(torch.Tensor(obs),requires_grad=False) self.policy.eval() with torch.no_grad(): action = self.policy(state) self.policy.train() # continuous action if explore: action += Variable(Tensor(self.eps * self.exploration.sample()),requires_grad=False) action = torch.clamp(action, min=-1, max=1) return action def step(self, agent_id, state, action, reward, next_state, done,t_step): self.states.append(state) self.actions.append(action) self.rewards.append(reward) self.next_states.append(next_state) self.dones.append(done) #self.replay_buffer.add(state, action, reward, next_state, done) if t_step % self.num_history == 0: # Save experience / reward self.replay_buffer.add(self.states, self.actions, self.rewards, self.next_states, self.dones) self.states = [] self.actions = [] self.rewards = [] self.next_states = [] self.dones = [] # Learn, if enough samples are available in memory if len(self.replay_buffer) > self.batch_size: obs, acs, rews, next_obs, don = self.replay_buffer.sample() self.update(agent_id ,obs, acs, rews, next_obs, don,t_step) def update(self, agent_id, obs, acs, rews, next_obs, dones ,t_step, logger=None): obs = torch.from_numpy(obs).float() acs = 
torch.from_numpy(acs).float() rews = torch.from_numpy(rews[:,agent_id]).float() next_obs = torch.from_numpy(next_obs).float() dones = torch.from_numpy(dones[:,agent_id]).float() acs = acs.view(-1,2) # --------- update critic ------------ # self.critic_optimizer.zero_grad() all_trgt_acs = self.target_policy(next_obs) target_value = (rews + self.gamma * self.target_critic(next_obs,all_trgt_acs) * (1 - dones)) actual_value = self.critic(obs,acs) vf_loss = MSELoss(actual_value, target_value.detach()) # Minimize the loss vf_loss.backward() #torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1) self.critic_optimizer.step() # --------- update actor --------------- # self.policy_optimizer.zero_grad() if self.discrete_action: curr_pol_out = self.policy(obs) curr_pol_vf_in = gumbel_softmax(curr_pol_out, hard=True) else: curr_pol_out = self.policy(obs) curr_pol_vf_in = curr_pol_out pol_loss = -self.critic(obs,curr_pol_vf_in).mean() #pol_loss += (curr_pol_out**2).mean() * 1e-3 pol_loss.backward() torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 1) self.policy_optimizer.step() self.update_all_targets() self.eps -= self.eps_decay self.eps = max(self.eps, 0) if logger is not None: logger.add_scalars('agent%i/losses' % self.agent_name, {'vf_loss': vf_loss, 'pol_loss': pol_loss}, self.niter) def update_all_targets(self): """ Update all target networks (called after normal updates have been performed for each agent) """ soft_update(self.critic, self.target_critic, self.tau) soft_update(self.policy, self.target_policy, self.tau) def get_params(self): return {'policy': self.policy.state_dict(), 'critic': self.critic.state_dict(), 'target_policy': self.target_policy.state_dict(), 'target_critic': self.target_critic.state_dict(), 'policy_optimizer': self.policy_optimizer.state_dict(), 'critic_optimizer': self.critic_optimizer.state_dict()} def load_params(self, params): self.policy.load_state_dict(params['policy']) self.critic.load_state_dict(params['critic']) self.target_policy.load_state_dict(params['target_policy']) self.target_critic.load_state_dict(params['target_critic']) self.policy_optimizer.load_state_dict(params['policy_optimizer']) self.critic_optimizer.load_state_dict(params['critic_optimizer'])
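# Hedged usage sketch (assumption): persisting this agent with the get_params()/load_params()
# pair defined above. The file name is illustrative, and the restored agent must be built with
# the same constructor arguments as the original one.
torch.save(agent.get_params(), 'ddpg_agent_checkpoint.pth')

restored = DDPGAgent(num_in_pol, num_out_pol, num_in_critic)   # same dimensions as `agent`
restored.load_params(torch.load('ddpg_agent_checkpoint.pth'))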
class Preyer: def __init__(self, s_dim, a_dim, **kwargs): self.s_dim = s_dim self.a_dim = a_dim self.config = kwargs['config'] self.device = 'cuda' if self.config.use_cuda else 'cpu' self.actor = Actor(s_dim, a_dim) self.actor_target = Actor(s_dim, a_dim) self.critic = Critic(s_dim, a_dim, 1) self.critic_target = Critic(s_dim, a_dim, 1) self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.a_lr) self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.c_lr) self.c_loss = 0 self.a_loss = 0 if self.config.use_cuda: self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() hard_update(self.actor, self.actor_target) hard_update(self.critic, self.critic_target) self.random_process = OrnsteinUhlenbeckProcess( size=self.a_dim, theta=self.config.ou_theta, mu=self.config.ou_mu, sigma=self.config.ou_sigma) self.replay_buffer = list() self.epsilon = 1. self.depsilon = self.epsilon / self.config.epsilon_decay def memory(self, s, a, r, s_, done): self.replay_buffer.append((s, a, r, s_, done)) if len(self.replay_buffer) >= self.config.memory_length: self.replay_buffer.pop(0) def get_batches(self): experiences = random.sample(self.replay_buffer, self.config.batch_size) state_batches = np.array([_[0] for _ in experiences]) action_batches = np.array([_[1] for _ in experiences]) reward_batches = np.array([_[2] for _ in experiences]) next_state_batches = np.array([_[3] for _ in experiences]) done_batches = np.array([_[4] for _ in experiences]) return state_batches, action_batches, reward_batches, next_state_batches, done_batches def choose_action(self, s, noisy=True): if self.config.use_cuda: s = Variable(torch.cuda.FloatTensor(s)) else: s = Variable(torch.FloatTensor(s)) a = self.actor.forward(s).cpu().detach().numpy() if noisy: a += max(self.epsilon, 0.001) * self.random_process.sample() self.epsilon -= self.depsilon a = np.clip(a, -1., 1.) 
return np.array([a]) def random_action(self): action = np.random.uniform(low=-1., high=1., size=(1, self.a_dim)) return action def reset(self): self.random_process.reset_states() def train(self): state_batches, action_batches, reward_batches, next_state_batches, done_batches = self.get_batches( ) state_batches = Variable(torch.Tensor(state_batches).to(self.device)) action_batches = Variable( torch.Tensor(action_batches).reshape(-1, 1).to(self.device)) reward_batches = Variable( torch.Tensor(reward_batches).reshape(-1, 1).to(self.device)) next_state_batches = Variable( torch.Tensor(next_state_batches).to(self.device)) done_batches = Variable( torch.Tensor( (done_batches == False) * 1).reshape(-1, 1).to(self.device)) target_next_actions = self.actor_target.forward( next_state_batches).detach() target_next_q = self.critic_target.forward( next_state_batches, target_next_actions).detach() main_q = self.critic(state_batches, action_batches) # Critic Loss self.critic.zero_grad() baselines = reward_batches + done_batches * self.config.gamma * target_next_q loss_critic = torch.nn.MSELoss()(main_q, baselines) loss_critic.backward() self.critic_optimizer.step() # Actor Loss self.actor.zero_grad() clear_action_batches = self.actor.forward(state_batches) loss_actor = ( -self.critic.forward(state_batches, clear_action_batches)).mean() loss_actor.backward() self.actor_optimizer.step() # This is for logging self.c_loss = loss_critic.item() self.a_loss = loss_actor.item() soft_update(self.actor, self.actor_target, self.config.tau) soft_update(self.critic, self.critic_target, self.config.tau) def getLoss(self): return self.c_loss, self.a_loss
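# Hedged usage sketch (assumption): one interaction step with the Preyer agent above. `env` is an
# illustrative gym-like environment and `config` is assumed to provide the fields referenced in
# __init__ (use_cuda, a_lr, c_lr, ou_theta, ou_mu, ou_sigma, epsilon_decay, memory_length,
# batch_size, gamma, tau).
agent = Preyer(s_dim=env.observation_space.shape[0], a_dim=1, config=config)
s = env.reset()
a = agent.choose_action(s, noisy=True)
s_, r, done, _ = env.step(a.flatten())
agent.memory(s, a, r, s_, done)
if len(agent.replay_buffer) >= config.batch_size:
    agent.train()
c_loss, a_loss = agent.getLoss()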
class DDPGAgent: def __init__(self, state_size=24, action_size=2, seed=1, num_agents=2): self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.num_agents = num_agents # DDPG specific configuration hidden_size = 512 self.CHECKPOINT_FOLDER = './' # Defining networks self.actor = Actor(state_size, hidden_size, action_size).to(device) self.actor_target = Actor(state_size, hidden_size, action_size).to(device) self.critic = Critic(state_size, self.action_size, hidden_size, 1).to(device) self.critic_target = Critic(state_size, self.action_size, hidden_size, 1).to(device) self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=ACTOR_LR) self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=CRITIC_LR) # Noise self.noises = OUNoise((num_agents, action_size), seed) # Initialize replay buffer self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) def act(self, state, add_noise=True): ''' Returns action to be taken based on state provided as the input ''' state = torch.from_numpy(state).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) self.actor.eval() with torch.no_grad(): for agent_num, state in enumerate(state): action = self.actor(state).cpu().data.numpy() actions[agent_num, :] = action self.actor.train() if add_noise: actions += self.noises.sample() return np.clip(actions, -1, 1) def reset(self): self.noises.reset() def learn(self, experiences): ''' Trains the actor critic network using experiences ''' states, actions, rewards, next_states, dones = experiences # Update Critic actions_next = self.actor_target(next_states) # print(next_states.shape, actions_next.shape) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + GAMMA*Q_targets_next*(1-dones) Q_expected = self.critic(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) self.optimizer_critic.zero_grad() critic_loss.backward() self.optimizer_critic.step() # Update Actor actions_pred = self.actor(states) actor_loss = -self.critic(states, actions_pred).mean() self.optimizer_actor.zero_grad() actor_loss.backward() self.optimizer_actor.step() # Updating the local networks self.soft_update(self.critic, self.critic_target) self.soft_update(self.actor, self.actor_target) def soft_update(self, model, model_target): tau = TAU for target_param, local_param in zip(model_target.parameters(), model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def step(self, state, action, reward, next_state, done): ''' Adds experience to memory and learns if the memory contains sufficient samples ''' for i in range(self.num_agents): self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i]) if len(self.memory) > BATCH_SIZE: # print("Now Learning") experiences = self.memory.sample() self.learn(experiences) def checkpoint(self): ''' Saves the actor critic network on disk ''' torch.save(self.actor.state_dict(), self.CHECKPOINT_FOLDER + 'checkpoint_actor.pth') torch.save(self.critic.state_dict(), self.CHECKPOINT_FOLDER + 'checkpoint_critic.pth') def load(self): ''' Loads the actor critic network from disk ''' self.actor.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_actor.pth')) self.actor_target.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_actor.pth')) self.critic.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_critic.pth')) self.critic_target.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 
'checkpoint_critic.pth'))
class Agent(object): def __init__(self, env, alpha, beta, tau, gamma, max_replay_size = 1000000, batch_size=64, l1_dim = 400, l2_dim = 300, state_dim = 8, action_dim = 2): self.env = env self.alpha = alpha # learning rate for actor network self.beta = beta # learning rate for critic network self.tau = tau # polyak averaging parameter self.gamma = gamma # discount factor of reward self.max_replay_size = max_replay_size self.batch_size = batch_size self.l1_dim = l1_dim self.l2_dim = l2_dim self.state_dim = state_dim self.action_dim = action_dim # build the agent self.build_agent() # with "tau = 1", we initialize the target network the same as the main network self.update_target_network(tau = 1) def build_agent(self): # build the actor-critic network and also their target networks self.actor = Actor(self.state_dim, self.action_dim, self.l1_dim, self.l2_dim, self.alpha) self.target_actor = copy.deepcopy(self.actor) self.critic = Critic(self.state_dim, self.action_dim, self.l1_dim, self.l2_dim, self.beta) self.target_critic = copy.deepcopy(self.critic) # build the replaybuffer self.replaybuffer = ReplayBuffer(self.max_replay_size, self.state_dim, self.action_dim) # build the OUNoise for action selection self.noise = OUNoise(self.action_dim) def act(self, state): state = T.tensor(state, dtype=T.float) action = self.actor(state) noisy_action = action + T.tensor(self.noise(), dtype=T.float) return noisy_action.cpu().detach().numpy() # store transition into the replay buffer def remember(self, state, action, reward, next_state, done): self.replaybuffer.store(state, action, reward, next_state, done) def sample_replaybuffer(self): # sample from the ReplayBuffer states, actions, rewards, next_states, dones = self.replaybuffer.sample(self.batch_size) states = T.tensor(states, dtype=T.float) actions = T.tensor(actions, dtype=T.float) rewards = T.tensor(rewards, dtype=T.float) next_states = T.tensor(next_states, dtype=T.float) dones = T.tensor(dones) return states, actions, rewards, next_states, dones def step(self): # we cannot learn before the amount of transitions inside # the replay buffer is larger than the batch size if self.replaybuffer.mem_cntr < self.batch_size: return # get transition samples from replayer buffer states, actions, rewards, next_states, dones = self.sample_replaybuffer() # update the critic network self.update_critic(states, actions, rewards, next_states, dones) # update the actor network self.update_actor(states) # update target network parameters self.update_target_network() def update_critic(self, states, actions, rewards, next_states, dones): # update the critic network target_actions = self.target_actor(next_states) target_critic_values = self.target_critic(next_states, target_actions) critic_values = self.critic(states, actions) target_critic_values = [rewards[j] + self.gamma * target_critic_values[j] * dones[j] for j in range(self.batch_size)] # reshape the variable target_critic_values = T.tensor(target_critic_values) target_critic_values = target_critic_values.view(self.batch_size, 1) critic_loss = F.mse_loss(target_critic_values, critic_values) # In PyTorch, we need to set the gradients to zero before starting to do backpropragation # because PyTorch accumulates the gradients on subsequent backward passes # optimize the critic self.critic.optimizer.zero_grad() critic_loss.backward() self.critic.optimizer.step() def update_actor(self, states): # here we use the output from the actor network NOT the noisy action # because we only need to enforce exploration in the when 
actual interactions with the environment happen
        actions = self.actor(states)
        # NOTE: the loss is the negative mean critic value; the gradient flows through the
        # actor parameters via the critic (see the original DDPG paper)
        actor_loss = -self.critic(states, actions).mean()

        # optimize the actor
        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

    def update_target_network(self, tau=None):
        tau = self.tau if tau is None else tau
        # polyak averaging to update the target critic network
        for param, target_param in zip(self.critic.parameters(), self.target_critic.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
        # polyak averaging to update the target actor network
        for param, target_param in zip(self.actor.parameters(), self.target_actor.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
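# Hedged usage sketch (assumption): a gym-style training loop for the Agent above. The default
# state_dim=8 / action_dim=2 happen to match LunarLanderContinuous-v2, which is used here purely
# as an illustration (older gym step/reset API assumed); learning rates are illustrative.
import gym

env = gym.make('LunarLanderContinuous-v2')
agent = Agent(env, alpha=1e-4, beta=1e-3, tau=0.005, gamma=0.99)
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        agent.step()          # samples a batch and updates critic, actor and target networks
        state = next_state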
class Agent(object): def __init__(self, env, alpha, beta, tau, gamma, state_dim = 8, action_dim = 2, max_replay_size = 1000000, l1_dim = 400, l2_dim = 300, batch_size=64): self.env = env self.max_action = float(env.action_space.high[0]) self.alpha = alpha # learning rate for actor network self.beta = beta # learning rate for critic network self.tau = tau # polyak averaging parameter self.gamma = gamma # discount factor of reward self.update_actor_count = 0 self.update_actor_freq = 2 self.policy_noise = .2 self.noise_clip = .5 self.state_dim = state_dim self.action_dim = action_dim self.l1_dim = l1_dim self.l2_dim = l2_dim self.batch_size = batch_size self.max_replay_size = max_replay_size # build the agent self.build_agent() # with "tau = 1", we initialize the target network the same as the main network self.update_target_network(tau = 1) def build_agent(self): # build the actor-critic network and also their target networks self.actor = Actor(self.state_dim, self.action_dim, self.l1_dim, self.l2_dim,self.alpha) self.target_actor = copy.deepcopy(self.actor) self.critic = Critic(self.state_dim, self.action_dim, self.l1_dim, self.l2_dim,self.beta) self.target_critic = copy.deepcopy(self.critic) # build the replaybuffer self.replaybuffer = ReplayBuffer(self.max_replay_size, self.state_dim, self.action_dim) # build the OUNoise for action selection self.noise = OUNoise(self.action_dim) def act(self, state): state = T.tensor(state, dtype=T.float) action = self.actor(state) noisy_action = action + T.tensor(self.noise(), dtype=T.float) return noisy_action.cpu().detach().numpy() # store transition into the replay buffer def remember(self, state, action, reward, next_state, done): self.replaybuffer.store(state, action, reward, next_state, done) def sample_replaybuffer(self): # sample from the ReplayBuffer states, actions, rewards, next_states, dones = self.replaybuffer.sample(self.batch_size) states = T.tensor(states, dtype=T.float) actions = T.tensor(actions, dtype=T.float) rewards = T.tensor(rewards, dtype=T.float) next_states = T.tensor(next_states, dtype=T.float) dones = T.tensor(dones) return states, actions, rewards, next_states, dones def step(self): # we cannot learn before the amount of transitions inside # the replay buffer is larger than the batch size if self.replaybuffer.mem_cntr < self.batch_size: return self.update_actor_count += 1 # get transition samples from replayer buffer states, actions, rewards, next_states, dones = self.sample_replaybuffer() # update the critic network self.update_critic(states, actions, rewards, next_states, dones) if self.update_actor_count % self.update_actor_freq == 0: # update the actor network self.update_actor(states) # update target network parameters self.update_target_network() def update_critic(self, states, actions, rewards, next_states, dones): with T.no_grad(): # Select action according to policy and add clipped noise noise = (T.randn_like(actions) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip) next_action = (self.target_actor(next_states) + noise).clamp(-self.max_action, self.max_action) # Compute the target Q value and use the minimum of them target_Q1, target_Q2 = self.target_critic(next_states, next_action) target_Q = T.min(target_Q1, target_Q2) target_Q = [rewards[j] + self.gamma * target_Q[j] * dones[j] for j in range(self.batch_size)] # reshape the variable target_Q = T.tensor(target_Q) target_Q = target_Q.view(self.batch_size, 1) # Get current Q estimates current_Q1, current_Q2 = self.critic(states, actions) # Compute critic 
loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # Optimize the critic
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

    def update_actor(self, states):
        # here we use the output of the actor network, NOT the noisy action, because
        # exploration noise is only needed when actually interacting with the environment
        actions = self.actor(states)
        actor_loss = -self.critic.q1_forward(states, actions).mean()

        # Optimize the actor
        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

    def update_target_network(self, tau=None):
        tau = self.tau if tau is None else tau
        # polyak averaging to update the target critic network
        for param, target_param in zip(self.critic.parameters(), self.target_critic.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
        # polyak averaging to update the target actor network
        for param, target_param in zip(self.actor.parameters(), self.target_actor.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
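# Note (grounded in the class above, added for clarity): relative to the plain DDPG agent earlier,
# this variant implements the three TD3 ingredients: clipped double-Q learning (T.min over the twin
# critic outputs in update_critic), target policy smoothing (clipped Gaussian noise added to the
# target action), and delayed policy updates (the actor and targets are only updated every
# update_actor_freq critic updates).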
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(random_seed) self.eps = eps_start self.eps_decay = 1 / (eps_p * LEARN_NUM ) # set decay rate based on epsilon end target # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise((num_agents, action_size), random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done, agent_number, timestep): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory and at learning interval settings if len(self.memory) > BATCH_SIZE and timestep % 1 == 0: for _ in range(LEARN_NUM): experiences = self.memory.sample() self.learn(experiences, GAMMA, agent_number) def act(self, states, add_noise): """Returns actions for both agents as per current policy, given their respective states.""" states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): # get action for each agent and concatenate them for agent_num, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[agent_num, :] = action self.actor_local.train() # add noise to actions if add_noise: actions += self.eps * self.noise.sample() actions = np.clip(actions, -1, 1) return actions def reset(self): self.noise.reset() def learn(self, experiences, gamma, agent_number): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) # Construct next actions vector relative to the agent if agent_number != 0: actions_next = torch.cat((actions[:, :2], actions_next), dim=1) else: actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1) # Compute Q targets for current states (y_i) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) # Construct action prediction vector relative to each agent if agent_number != 0: actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1) else: actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1) # Compute actor loss actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # update noise decay parameter self.eps -= self.eps_decay self.eps = max(self.eps, eps_end) self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size: int, action_size: int, seed: int, n_agent: int): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.n_agent = n_agent self.seed = random.seed(seed) self.global_step = 0 self.update_step = 0 # Initialize actor and critic local and target networks self.actor = Actor(state_size, action_size, seed, ACTOR_NETWORK_LINEAR_SIZES, batch_normalization=ACTOR_BATCH_NORM).to(device) self.actor_target = Actor( state_size, action_size, seed, ACTOR_NETWORK_LINEAR_SIZES, batch_normalization=ACTOR_BATCH_NORM).to(device) self.critic = Critic(state_size, action_size, seed, CRITIC_NETWORK_LINEAR_SIZES, batch_normalization=CRITIC_BATCH_NORM).to(device) self.critic_second = Critic( state_size, action_size, seed, CRITIC_SECOND_NETWORK_LINEAR_SIZES, batch_normalization=CRITIC_BATCH_NORM).to(device) self.critic_second_target = Critic( state_size, action_size, seed, CRITIC_SECOND_NETWORK_LINEAR_SIZES, batch_normalization=CRITIC_BATCH_NORM).to(device) self.critic_target = Critic( state_size, action_size, seed, CRITIC_NETWORK_LINEAR_SIZES, batch_normalization=CRITIC_BATCH_NORM).to(device) self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=ACTOR_LEARNING_RATE) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=CRITIC_LEARNING_RATE) self.critic_second_optimizer = optim.Adam( self.critic_second.parameters(), lr=CRITIC_LEARNING_RATE) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = [0] * n_agent self.noise = OUNoise(action_size, seed, decay_period=50) # Copy parameters from local network to target network for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()): target_param.data.copy_(param.data) for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()): target_param.data.copy_(param.data) for target_param, param in zip(self.critic_second_target.parameters(), self.critic_second.parameters()): target_param.data.copy_(param.data) def step(self, state: np.array, action, reward, next_state, done, i_agent): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step[i_agent] = (self.t_step[i_agent] + 1) % UPDATE_EVERY # Learn, if enough samples are available in memory every UPDATE_EVERY if len(self.memory) > BATCH_SIZE and (not any(self.t_step)): for _ in range(LEARN_TIMES): experiences = self.memory.sample() self.learn(experiences, GAMMA) def noise_reset(self): self.noise.reset() def save_model(self, checkpoint_path: str = "./checkpoints/"): torch.save(self.actor.state_dict(), f"{checkpoint_path}/actor.pt") torch.save(self.critic.state_dict(), f"{checkpoint_path}/critic.pt") def load_model(self, checkpoint_path: str = "./checkpoints/checkpoint.pt"): self.actor.load_state_dict(torch.load(f"{checkpoint_path}/actor.pt")) self.critic.load_state_dict(torch.load(f"{checkpoint_path}/critic.pt")) def act(self, states: np.array, step: int): """Returns actions for given state as per current policy. 
Params ====== state (array_like): current state """ self.global_step += 1 if self.global_step < WARM_UP_STEPS: action_values = np.random.rand(self.n_agent, self.action_size) return action_values states = torch.from_numpy(states).float().to(device) self.actor.eval() with torch.no_grad(): action_values = self.actor(states).cpu().data.numpy() self.actor.train() action_values = [ self.noise.get_action(action, t=step) for action in action_values ] return action_values def learn(self, experiences: tuple, gamma=GAMMA): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ self.update_step += 1 states, actions, rewards, next_states, dones = experiences # Critic loss mask = torch.tensor(1 - dones).detach().to(device) Q_values = self.critic(states, actions) Q_values_second = self.critic_second(states, actions) next_actions = self.actor_target(next_states) next_Q = torch.min( self.critic_target(next_states, next_actions.detach()), self.critic_second_target(next_states, next_actions.detach())) Q_prime = rewards + gamma * next_Q * mask if self.update_step % CRITIC_UPDATE_EVERY == 0: critic_loss = F.mse_loss(Q_values, Q_prime.detach()) critic_second_loss = F.mse_loss(Q_values_second, Q_prime.detach()) # Update first critic network self.critic_optimizer.zero_grad() critic_loss.backward() if CRITIC_GRADIENT_CLIPPING_VALUE: torch.nn.utils.clip_grad_norm_(self.critic.parameters(), CRITIC_GRADIENT_CLIPPING_VALUE) self.critic_optimizer.step() # Update second critic network self.critic_second_optimizer.zero_grad() critic_second_loss.backward() if CRITIC_GRADIENT_CLIPPING_VALUE: torch.nn.utils.clip_grad_norm_(self.critic_second.parameters(), CRITIC_GRADIENT_CLIPPING_VALUE) self.critic_second_optimizer.step() # Actor loss if self.update_step % POLICY_UPDATE_EVERY == 0: policy_loss = -self.critic(states, self.actor(states)).mean() # Update actor network self.actor_optimizer.zero_grad() policy_loss.backward() if ACTOR_GRADIENT_CLIPPING_VALUE: torch.nn.utils.clip_grad_norm_(self.actor.parameters(), ACTOR_GRADIENT_CLIPPING_VALUE) self.actor_optimizer.step() self.actor_soft_update() self.critic_soft_update() self.critic_second_soft_update() def actor_soft_update(self, tau: float = TAU): """Soft update for actor target network Args: tau (float, optional). Defaults to TAU. """ for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()): target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data) def critic_soft_update(self, tau: float = TAU): """Soft update for critic target network Args: tau (float, optional). Defaults to TAU. """ for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()): target_param.detach_() target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data) def critic_second_soft_update(self, tau: float = TAU): """Soft update for critic target network Args: tau (float, optional). Defaults to TAU. """ for target_param, param in zip(self.critic_second_target.parameters(), self.critic_second.parameters()): target_param.detach_() target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)
class DDPG(object):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    veer = 'straight'
    origin = 'west'

    # state_dim : dimensionality of the state
    # action_dim: dimensionality of the action
    # max_action: vector of action limits
    def __init__(self, state_dim, action_dim, max_action, origin_str, veer_str, logger):
        # neural networks placed on the GPU
        self.actor = Actor(state_dim, action_dim, max_action).to(self.device)  # origin_network
        self.actor_target = Actor(state_dim, action_dim, max_action).to(self.device)  # target_network
        self.actor_target.load_state_dict(self.actor.state_dict())  # initiate actor_target with actor's parameters
        # In PyTorch a tensor's requires_grad attribute defaults to False, i.e. it does not take part in
        # gradient propagation; the model parameters handed to an optimizer, however, are updated by gradients.
        self.actor_optimizer = optim.Adam(self.actor.parameters(), pdata.LEARNING_RATE)  # optimize the actor parameters with learning rate pdata.LEARNING_RATE

        self.critic = Critic(state_dim, action_dim).to(self.device)
        self.critic_target = Critic(state_dim, action_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), pdata.LEARNING_RATE)

        # self.replay_buffer = Replay_buffer()  # initiate replay-buffer
        self.replay_buffer = FilterReplayBuffer()
        self.writer = SummaryWriter(pdata.DIRECTORY + 'runs')

        self.num_critic_update_iteration = 0
        self.num_actor_update_iteration = 0
        self.num_training = 0
        self.veer = veer_str
        self.origin = origin_str
        self.logger = logger

    def select_action(self, state):
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        # numpy.ndarray.flatten(): returns a copy of the ndarray collapsed into one dimension
        action = self.actor(state).cpu().data.numpy().flatten()
        return action

    # update the parameters of the actor network and the critic network
    # update() is only called once the replay buffer holds more transitions than the sample size
    def update(self):
        critic_loss_list = []
        actor_performance_list = []
        for it in range(pdata.UPDATE_ITERATION):
            # Sample replay buffer: draw BATCH_SIZE random transition 5-tuples (sample random minibatch)
            x, y, u, r, d = self.replay_buffer.sample(pdata.BATCH_SIZE)
            state = torch.FloatTensor(x).to(self.device)
            action = torch.FloatTensor(u).to(self.device)
            next_state = torch.FloatTensor(y).to(self.device)
            done = torch.FloatTensor(d).to(self.device)
            reward = torch.FloatTensor(r).to(self.device)

            # Compute the target Q value: Q(S', A') is a value evaluated from next_state and the predicted action
            # target_Q here is a batch of one-dimensional tensors
            target_Q = self.critic_target(next_state, self.actor_target(next_state))
            # detach(): Return a new tensor, detached from the current graph
            # evaluate Q: target_Q = R + γ * Q'(s', a')
            target_Q = reward + ((1 - done) * pdata.GAMMA * target_Q).detach()  # one entry per batch element

            # Get current Q estimate
            current_Q = self.critic(state, action)  # 1-dimensional

            # Compute critic loss: a mean-square error
            # Following the paper, critic_loss is the mean squared error between the per-sample target estimate and the critic output
            # torch.nn.functional.mse_loss computes the element-wise mean squared error over the tensors
            critic_loss = F.mse_loss(current_Q, target_Q)
            self.writer.add_scalar('critic_loss', critic_loss, global_step=self.num_critic_update_iteration)
            # self.logger.write_to_log('critic_loss:{loss}'.format(loss=critic_loss))
            # self.logger.add_to_critic_buffer(critic_loss.item())
            critic_loss_list.append(critic_loss.item())

            # Optimize the critic
            self.critic_optimizer.zero_grad()  # zeros the gradient buffer
            critic_loss.backward()  # back propagation on a dynamic graph
            self.critic_optimizer.step()

            # Compute actor loss
            # actor_loss: see the interpretation of equation (6) in the paper
            # mean(): average over all elements of the tensor
            # backward() updates parameters by gradient descent, so actor_loss is negated and the parameters move in the gradient-ascent direction
            actor_loss = -self.critic(state, self.actor(state)).mean()
            self.writer.add_scalar('actor_performance', actor_loss, global_step=self.num_actor_update_iteration)
            # self.logger.write_to_log('actor_loss:{loss}'.format(loss=actor_loss))
            # self.logger.add_to_actor_buffer(actor_loss.item())
            actor_performance_list.append(actor_loss.item())

            # Optimize the actor
            self.actor_optimizer.zero_grad()  # Clears the gradients of all optimized torch.Tensor
            actor_loss.backward()
            self.actor_optimizer.step()  # perform a single optimization step

            # soft update of the two target networks
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(pdata.TAU * param.data + (1 - pdata.TAU) * target_param.data)
            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(pdata.TAU * param.data + (1 - pdata.TAU) * target_param.data)

            self.num_actor_update_iteration += 1
            self.num_critic_update_iteration += 1

        actor_performance = np.mean(np.array(actor_performance_list)).item()
        self.logger.add_to_actor_buffer(actor_performance)
        critic_loss = np.mean(critic_loss_list).item()
        self.logger.add_to_critic_buffer(critic_loss)

    def save(self, mark_str):
        torch.save(self.actor.state_dict(), pdata.DIRECTORY + mark_str + '_actor.pth')
        torch.save(self.critic.state_dict(), pdata.DIRECTORY + mark_str + '_critic.pth')

    def load(self, mark_str):
        file_actor = pdata.DIRECTORY + mark_str + '_actor.pth'
        file_critic = pdata.DIRECTORY + mark_str + '_critic.pth'
        if os.path.exists(file_actor) and os.path.exists(file_critic):
            self.actor.load_state_dict(torch.load(file_actor))
            self.critic.load_state_dict(torch.load(file_critic))
class Agent(): def __init__(self, state_size, action_size, replay_memory, random_seed=0, nb_agent=20, bs=128, gamma=0.99, tau=1e-3, lr_actor=1e-4, lr_critic=1e-4, wd_actor=0, wd_critic=0, clip_actor=None, clip_critic=None, update_interval=20, update_times=10): self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.nb_agent = nb_agent self.bs = bs self.update_interval = update_interval self.update_times = update_times self.timestep = 0 self.gamma = gamma self.tau = tau self.lr_actor = lr_actor self.lr_critic = lr_critic self.wd_critic = wd_critic self.wd_actor = wd_actor self.clip_critic = clip_critic self.clip_actor = clip_actor self.actor_losses = [] self.critic_losses = [] # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor, weight_decay=self.wd_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.wd_critic) # Noise process self.noise = OUNoise((self.nb_agent, action_size), random_seed) # Replay memory self.memory = replay_memory def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" #increment timestep self.timestep += 1 # Save experience / reward for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if self.timestep % self.update_interval == 0: for i in range(self.update_times): if len(self.memory) > self.bs: experiences = self.memory.sample(self.bs) self.learn(experiences) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset_noise(self): self.noise.reset() def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if self.clip_critic:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.clip_critic)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        if self.clip_actor:
            torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), self.clip_actor)
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

        self.actor_losses.append(actor_loss.cpu().data.numpy())
        self.critic_losses.append(critic_loss.cpu().data.numpy())

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)
class IAC(): def __init__(self, action_dim, state_dim, CNN=False, width=None, height=None, channel=None, device='cpu'): self.CNN = CNN if CNN: self.CNN_preprocessA = CNN_preprocess(width, height, channel) self.CNN_preprocessC = CNN_preprocess(width, height, channel) state_dim = self.CNN_preprocessA.get_state_dim() self.device = device self.actor = Actor(action_dim, state_dim) self.critic = Critic(state_dim) self.action_dim = action_dim self.state_dim = state_dim self.noise_epsilon = 0.999 self.constant_decay = 1 self.optimizerA = torch.optim.Adam(self.actor.parameters(), lr=0.00001) self.optimizerC = torch.optim.Adam(self.critic.parameters(), lr=0.01) self.lr_scheduler = { "optA": torch.optim.lr_scheduler.StepLR(self.optimizerA, step_size=1000, gamma=1, last_epoch=-1), "optC": torch.optim.lr_scheduler.StepLR(self.optimizerC, step_size=1000, gamma=0.9, last_epoch=-1) } if CNN: # self.CNN_preprocessA = CNN_preprocess(width,height,channel) # self.CNN_preprocessC = CNN_preprocess self.optimizerA = torch.optim.Adam(itertools.chain( self.CNN_preprocessA.parameters(), self.actor.parameters()), lr=0.0001) self.optimizerC = torch.optim.Adam(itertools.chain( self.CNN_preprocessC.parameters(), self.critic.parameters()), lr=0.001) self.lr_scheduler = { "optA": torch.optim.lr_scheduler.StepLR(self.optimizerA, step_size=10000, gamma=1, last_epoch=-1), "optC": torch.optim.lr_scheduler.StepLR(self.optimizerC, step_size=10000, gamma=0.9, last_epoch=-1) } # self.act_prob # self.act_log_prob def choose_action(self, s): s = torch.Tensor(s).unsqueeze(0).to(self.device) if self.CNN: s = self.CNN_preprocessA(s.reshape((1, 3, 15, 15))) self.act_prob = self.actor(s) + torch.abs( torch.randn(self.action_dim) * 0. * self.constant_decay) self.constant_decay = self.constant_decay * self.noise_epsilon self.act_prob = self.act_prob / torch.sum(self.act_prob).detach() m = torch.distributions.Categorical(self.act_prob) # self.act_log_prob = m.log_prob(m.sample()) temp = m.sample() return temp def cal_tderr(self, s, r, s_): s = torch.Tensor(s).unsqueeze(0).to(self.device) s_ = torch.Tensor(s_).unsqueeze(0).to(self.device) if self.CNN: s = self.CNN_preprocessC(s.reshape(1, 3, 15, 15)) s_ = self.CNN_preprocessC(s_.reshape(1, 3, 15, 15)) v_ = self.critic(s_).detach() v = self.critic(s) return r + 0.9 * v_ - v def learnCritic(self, s, r, s_): td_err = self.cal_tderr(s, r, s_) loss = torch.square(td_err) self.optimizerC.zero_grad() loss.backward() self.optimizerC.step() self.lr_scheduler["optC"].step() def learnActor(self, s, r, s_, a): td_err = self.cal_tderr(s, r, s_) m = torch.log(self.act_prob[0][a]).to( self.device ) #in cleanup there should not be a [0], in IAC the [0] is necessary temp = m * td_err.detach() loss = -torch.mean(temp) self.optimizerA.zero_grad() loss.backward() self.optimizerA.step() self.lr_scheduler["optA"].step() def update(self, s, r, s_, a): self.learnCritic(s, r, s_) self.learnActor(s, r, s_, a)
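# Hedged usage sketch (assumption): the IAC class above performs a one-step actor-critic update
# per transition, so a driver loop only needs the current transition. `env` is an illustrative
# gym-like environment with a discrete action space and the non-CNN observation layout.
agent = IAC(action_dim=env.action_space.n, state_dim=env.observation_space.shape[0])
s = env.reset()
done = False
while not done:
    a = agent.choose_action(s).item()
    s_, r, done, _ = env.step(a)
    agent.update(s, r, s_, a)      # critic update first, then actor, as defined above
    s = s_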