class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor( state_size, action_size, random_seed).to(device) self.actor_target = Actor( state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam( self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic( state_size, action_size, random_seed).to(device) self.critic_target = Critic( state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam( self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer( action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done, timestep): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for i in range(20): self.memory.add(state[i], action[i], reward[i], next_state[i], done[i]) # Learn, if enough samples are available in memory if timestep % 20 == 0: if len(self.memory) > BATCH_SIZE: for i in range(10): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: for i in range(20): action[i] += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_( tau*local_param.data + (1.0-tau)*target_param.data)
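# The Agent above relies on module-level hyperparameters (BUFFER_SIZE, BATCH_SIZE,
# GAMMA, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY) and a global `device`, plus the
# Actor, Critic, OUNoise and ReplayBuffer classes imported from the repo's own
# modules. A minimal sketch of those module-level definitions is shown below; the
# concrete values are illustrative assumptions, not the author's tuned settings.
import copy
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

BUFFER_SIZE = int(1e6)   # replay buffer size (assumed)
BATCH_SIZE = 128         # minibatch size (assumed)
GAMMA = 0.99             # discount factor (assumed)
TAU = 1e-3               # soft-update interpolation factor (assumed)
LR_ACTOR = 1e-4          # actor learning rate (assumed)
LR_CRITIC = 1e-3         # critic learning rate (assumed)
WEIGHT_DECAY = 0         # critic L2 weight decay (assumed)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")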
class DDPGAgent: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, n_agents, seed, buffer_size=int(1e6), batch_size=200, lr_actor=1e-4, lr_critic=1e-3, gamma=0.99, weight_decay=0, tau=1e-3, update_frequency=20, n_learns=10): """Initialize a DDPG agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action n_agents (int): number of agents random_seed (int): random seed batch_size (int): minibatch size lr_actor (float): learning rate of the actor lr_critic (float): learning rate of the critic gamma (float): discount factor weight_decay (float): critic L2 weight decay tau (float): value for soft update of target parameters update_frequency (int): how much steps must be executed before starting learning n_learns (int): how many learning for update """ self.state_size = state_size self.action_size = action_size self.n_agents = n_agents self.gamma = gamma self.batch_size = batch_size self.tau = tau self.seed = random.seed(seed) self.update_frequency = update_frequency self.n_learns = n_learns # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=weight_decay) # Noise process self.noise = Ornstein((n_agents, action_size), seed) # Replay memory self.memory = ReplayBuffer(action_size, buffer_size, seed, device) # Initialize the time step (for every update_frequency steps) self.t_step = 0 def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) self.t_step = (self.t_step + 1) % self.update_frequency if self.t_step == 0: # Learn, if enough samples are available in memory for _ in range(self.n_learns): if len(self.memory) > self.batch_size: experiences = self.memory.sample(self.batch_size) self.learn(experiences, self.gamma) def act(self, states, add_noise=True): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(states).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
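# DDPGAgent builds its exploration noise as Ornstein((n_agents, action_size), seed)
# and only ever calls reset() and sample() on it. A minimal Ornstein-Uhlenbeck
# sketch that satisfies this interface is shown below; the mu, theta and sigma
# defaults are assumptions, not the repo's actual values.
import copy
import random

import numpy as np


class Ornstein:
    """Ornstein-Uhlenbeck process: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.size = size
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        np.random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state back to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) \
            + self.sigma * np.random.standard_normal(self.size)
        self.state = x + dx
        return self.state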
class DDPG:
    def __init__(self, dim_obs, dim_act, actor_lr=0.001, critic_lr=0.01,
                 gamma=0.9, capacity=1000, batch_size=64, tau=0.01,
                 hidden_size=64):
        self.gamma = gamma
        self.memory = ReplayMemory(capacity)
        self.batch_size = batch_size
        self.tau = tau
        # PyTorch expects the device string 'cuda', not 'gpu'
        self.device = 'cuda' if GPU_CONFIG.use_cuda else 'cpu'
        self.learn_cnt = 0
        self.FloatTensor = th.cuda.FloatTensor if GPU_CONFIG.use_cuda else th.FloatTensor

        self.critic = Critic(dim_obs, dim_act, hidden_size).to(self.device)
        self.actor = Actor(dim_obs, dim_act, hidden_size).to(self.device)
        self.target_critic = Critic(dim_obs, dim_act, hidden_size).to(self.device)
        self.target_actor = Actor(dim_obs, dim_act, hidden_size).to(self.device)
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.target_actor.load_state_dict(self.actor.state_dict())
        # Alternative way to hard-copy the weights (TODO: try it):
        # for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
        #     target_param.data.copy_(param.data)

        self.critic_optimizer = th.optim.Adam(self.critic.parameters(), lr=critic_lr)
        self.actor_optimizer = th.optim.Adam(self.actor.parameters(), lr=actor_lr)

    def learn(self):
        # Sample a batch of transitions from the replay memory
        transitions = self.memory.sample(self.batch_size)
        batch = Experience(*zip(*transitions))  # namedtuple of batched fields

        obs_batch = self.FloatTensor(np.array(batch.obs))  # (batch_size, dim_obs)
        # Alternative conversion:
        # obs_batch = th.tensor(obs_batch, device=self.device, dtype=th.float)
        logger.debug("obs_batch: {}".format(obs_batch.shape))
        action_batch = self.FloatTensor(np.array(batch.action))  # (batch, 1)
        logger.debug('action batch: {}'.format(action_batch.shape))
        reward_batch = self.FloatTensor(np.array(batch.reward)).view(
            self.batch_size, 1)  # (batch, 1)
        logger.debug('reward_batch: {}'.format(reward_batch.shape))
        next_obs_batch = self.FloatTensor(np.array(batch.next_obs))
        done_batch = self.FloatTensor(np.array(batch.done)).view(
            self.batch_size, 1)  # (batch, 1)
        logger.debug('done_batch: {}'.format(done_batch))

        # Critic loss
        self.critic_optimizer.zero_grad()
        q_eval = self.critic(obs_batch, action_batch)
        next_action = self.target_actor(next_obs_batch).detach()
        q_next = self.target_critic(next_obs_batch, next_action).detach()
        q_target = reward_batch + self.gamma * q_next * (1 - done_batch)
        c_loss = nn.MSELoss()(q_eval, q_target)
        c_loss.backward()
        self.critic_optimizer.step()

        # Actor loss
        self.actor_optimizer.zero_grad()
        current_action = self.actor(obs_batch)
        policy_loss = self.critic(obs_batch, current_action)
        a_loss = -policy_loss.mean()
        a_loss.backward()
        self.actor_optimizer.step()

        # Soft update target_actor and target_critic
        soft_update(self.target_critic, self.critic, self.tau)
        soft_update(self.target_actor, self.actor, self.tau)

        a_loss = a_loss.detach().cpu().numpy() if GPU_CONFIG.use_cuda else a_loss.detach().numpy()
        c_loss = c_loss.detach().cpu().numpy() if GPU_CONFIG.use_cuda else c_loss.detach().numpy()
        return a_loss, c_loss

    @th.no_grad()
    def select_action(self, obs):
        obs = self.FloatTensor(obs).unsqueeze(0)
        action = self.actor(obs).detach()
        action = action.cpu().numpy() if GPU_CONFIG.use_cuda else action.numpy()
        return action
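# The DDPG class above depends on an Experience namedtuple, a ReplayMemory, a
# soft_update helper, a GPU_CONFIG object and a logger that live elsewhere in the
# repo. A minimal sketch of the memory and the soft_update helper is given below;
# the field names mirror the batch.obs / batch.action / ... accesses in learn(),
# and the push() method name is an assumption for how transitions are stored.
import random
from collections import namedtuple

Experience = namedtuple('Experience',
                        ('obs', 'action', 'reward', 'next_obs', 'done'))


class ReplayMemory:
    """Fixed-size cyclic buffer of Experience tuples."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Store one transition, overwriting the oldest once the buffer is full."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Experience(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return a uniformly random batch of stored transitions."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


def soft_update(target, source, tau):
    """Polyak averaging: theta_target = tau * theta_source + (1 - tau) * theta_target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)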
class Agent: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """ Initialize the agent :param state_size: state space size :param action_size: action space size :param random_seed: seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) self.hard_copy_weights(self.actor_target, self.actor_local) self.hard_copy_weights(self.critic_target, self.critic_local) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) # Iteration counter self.step_counter = 0 @staticmethod def hard_copy_weights(target, source): """ copy weights from source to target network""" for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data) def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) self.step_counter += 1 # Learn, if enough samples are available in memory if self.step_counter % UPDATE_EVERY == 0: if len(self.memory) > BATCH_SIZE: for i in range(0, UPDATE_TIMES): experiences = self.memory.sample() self.learn(experiences, GAMMA) self.step_counter = 0 def act(self, states, add_noise=False): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(device) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() self.actor_local.train() if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """ Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value :param experiences: tensor of (s,a,r,s') tuples :param gamma: discount factor :return: """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """ Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target :param local_model: local model where weights are copied from :param target_model: target model where weights are copied to :param tau: soft update rate :return: """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
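# A sketch of how this Agent is typically driven: one vectorized environment step
# feeds every parallel agent's transition into Agent.step(), which learns every
# UPDATE_EVERY calls. The `env` object and its reset()/step() return signature
# below are hypothetical placeholders for whatever environment wrapper the repo
# actually uses, and n_episodes/max_t are illustrative defaults.
import numpy as np


def train(agent, env, n_episodes=200, max_t=1000):
    """Run DDPG training episodes and return the per-episode mean scores."""
    scores = []
    for episode in range(1, n_episodes + 1):
        states = env.reset()                      # (n_agents, state_size), assumed
        agent.reset()                             # reset the OU noise
        episode_scores = np.zeros(len(states))
        for t in range(max_t):
            actions = agent.act(states, add_noise=True)
            next_states, rewards, dones = env.step(actions)   # assumed signature
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            episode_scores += rewards
            if np.any(dones):
                break
        scores.append(episode_scores.mean())
    return scores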
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, device, memory, config): """Initialize an Agent object. Params ====== device (object): hardware device to run on CPU or GPU memory (object): memory for replay buffer config (dict) - "state_size": dimension of each state - "action_size": dimension of each action - "buffer_size": replay buffer size - "batch_size": minibatch size - "random_seed": random seed - "gamma": discount factor - "tau": for soft update of target parameters - "weight_decay": L2 weight decay - "learn_every": learn from replay buffer every time step - "learn_batch_size": number of batches to learn from replay buffer every learn_every time step - "grad_clip": gradient value to clip at for critic - "eps_start": starting value of epsilon, for epsilon-greedy action selection - "eps_end": minimum value of epsilon - "eps_decay": multiplicative factor (per episode) for decreasing epsilon - "print_every": Print average every x episode, - "episode_steps": Maximum number of steps to run for each episode - "mu": mu for noise - "theta": theta for noise - "sigma": sigma for noise - "actor": actor specific config object - "fc": array of input sizes for hidden layers - "learning_rate": learning rate - "critic": actor specific config object - "fc": array of input sizes for hidden layers - "learning_rate": learning rate """ self.num_agents = config['num_agents'] self.state_size = config['state_size'] self.action_size = config['action_size'] if config['random_seed'] is not None: self.seed = random.seed(config['random_seed']) else: self.seed = random.seed() self.eps = config['eps_start'] self.eps_decay = config['eps_decay'] self.eps_end = config['eps_end'] self.device = device # Replay memory self.memory = memory self.batch_size = config['batch_size'] self.gamma = config['gamma'] self.tau = config['tau'] self.lr_actor = config['actor']['learning_rate'] self.lr_critic = config['critic']['learning_rate'] self.weight_decay = config['weight_decay'] self.learn_every = config['learn_every'] self.learn_batch_size = config['learn_batch_size'] self.grad_clip = config['grad_clip'] # Actor Network (w/ Target Network) self.actor_local = Actor(config).to(self.device) self.actor_target = Actor(config).to(self.device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(config).to(self.device) self.critic_target = Critic(config).to(self.device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay) # Noise process self.noise = OUNoise(config) def step(self, states, actions, rewards, next_states, dones, timestep): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for i in range(self.num_agents): self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i]) # Learn, if enough samples are available in memory and every update_every time steps if len(self.memory ) > self.batch_size and timestep % self.learn_every == 0: for i in range(self.learn_batch_size): experiences = self.memory.sample() self.learn(experiences) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(self.device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.eps * self.noise.sample() 
return np.clip(action, -1, 1) def reset(self): self.noise.reset() def reset_episode(self): self.reset() self.memory.reset_episode() def learn_best_episode(self): self.learn(self.memory.sample(True)) def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # gradient clipping for critic if self.grad_clip > 0: torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.grad_clip) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target) self.soft_update(self.actor_local, self.actor_target) if self.eps_decay > 0: self.eps = max(self.eps_end, self.eps - self.eps_decay) # decrease epsilon self.reset() def soft_update(self, local_model, target_model): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)
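# An illustrative config for the Agent above, covering the keys its constructor and
# docstring reference. Every value is an assumption chosen for demonstration, not
# the author's tuned configuration; `device` and `memory` must still be built with
# whatever ReplayBuffer implementation the repo provides.
example_config = {
    'num_agents': 20,
    'state_size': 33,
    'action_size': 4,
    'buffer_size': int(1e6),
    'batch_size': 128,
    'random_seed': 2,
    'gamma': 0.99,
    'tau': 1e-3,
    'weight_decay': 0,
    'learn_every': 20,
    'learn_batch_size': 10,
    'grad_clip': 1.0,
    'eps_start': 1.0,
    'eps_end': 0.05,
    'eps_decay': 1e-6,
    'print_every': 100,
    'episode_steps': 1000,
    'mu': 0.0,
    'theta': 0.15,
    'sigma': 0.2,
    'actor': {'fc': [256, 128], 'learning_rate': 1e-4},
    'critic': {'fc': [256, 128], 'learning_rate': 1e-3},
}

# Typical wiring (repo-specific pieces left as placeholders):
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# memory = ReplayBuffer(...)   # the repo's replay buffer implementation
# agent = Agent(device, memory, example_config)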