# Shared imports for the agents below. The actor/critic network classes
# (MADDPGActorVersion2, MADDPGCriticVersion3, DDPGActorVersion1, DDPGCriticVersion1,
# ActorNetwork, CriticNetwork, Actor, Critic), the noise/replay classes (OUNoise,
# ReplayBuffer, PrioritizedReplayBuffer), ExperienceUnpack, ActionQuery, BaseAgent,
# control_results and `device` are defined elsewhere in this repo.
import csv
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim


class MADDPGAgentVersion5(BaseAgent):
    def __init__(self, game, num_agents, state_size, action_size, name,
                 random_seed=0, lr_critic=1e-3, lr_actor=1e-3,
                 fc1_units=400, fc2_units=300,
                 buffer_size=int(1e6), batch_size=128,
                 gamma=0.99, tau=1e-3, max_norm=1.0,
                 epsilon_start=1.0, epsilon_end=0.1, epsilon_decay=0.99,
                 exploration_mu=0.0, exploration_theta=0.15, exploration_sigma=0.2):
        """Initialize an Agent object.

        Args:
            game (class Game): mediator in the chain-of-responsibility design pattern (broker chain).
            random_seed (int): random seed.
            max_norm (float): value of clip_grad_norm for the critic optimizer.
        """
        super().__init__()

        self.index_agent = None
        self.game = game
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.name = name
        self.seed = random.seed(random_seed)
        self.max_norm = max_norm

        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay

        # Actor network (with target network)
        self.actor_local = MADDPGActorVersion2(state_size, action_size, random_seed,
                                               fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.actor_target = MADDPGActorVersion2(state_size, action_size, random_seed,
                                                fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic network (with target network)
        self.critic_local = MADDPGCriticVersion3(num_agents, state_size, action_size,
                                                 fcs1_units=fc1_units, fc2_units=fc2_units,
                                                 seed=random_seed).to(device)
        self.critic_target = MADDPGCriticVersion3(num_agents, state_size, action_size,
                                                  fcs1_units=fc1_units, fc2_units=fc2_units,
                                                  seed=random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)

        # Noise process for actions
        self.noise = OUNoise(self.action_size, exploration_mu, exploration_theta, exploration_sigma)

        # discount factor
        self.gamma = gamma

        # soft update parameter
        self.tau = tau

        self.batch_size = batch_size

    def step(self, states, actions, rewards, next_states, dones):
        """
        Args:
            states (numpy.array): states.shape[1] = (state_size * num_agents)
            actions (numpy.array): actions.shape[1] = (action_size * num_agents)
            next_states (numpy.array): next_states.shape[1] = (state_size * num_agents)
        """
        self.learn(states, actions, rewards, next_states, dones)

    def act(self, state, add_noise=True):
        """Return actions for the given state. The input size of the actor network is state_size."""
        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.epsilon * self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def forward_all(self, next_states):
        """Get next_actions of all agents via the chain-of-responsibility (broker chain) pattern.

        Return:
            Differentiable tensor of next_actions for all agents, concatenated along dim=1.
        """
        q = ActionQuery()

        for i, agent in enumerate(self.game):
            # get next_state_i of agent_i
            n_state = next_states[:, i * self.state_size: (i + 1) * self.state_size]

            # only this agent's actions stay attached to the computation graph
            detach = agent is not self

            # predict next_action and append it to ActionQuery.next_actions
            agent.query(n_state, q, detach)

        return q.next_actions

    def query(self, next_state, q, detach):
        """
        Args:
            q (class ActionQuery): parcel that stores actions.
        """
        next_action = self.actor_local(next_state)

        if detach:
            next_action = next_action.detach()

        if q.next_actions is None:
            q.next_actions = next_action
        else:
            q.next_actions = torch.cat((q.next_actions, next_action), dim=1)

    def learn(self, states, actions, rewards, next_states, dones):
        """Update policy and value parameters using the given batch of experience tuples.

        For agent i:
            Q_target_i = r_i + gamma * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> actions for all agents
            critic_target(state, action) -> Q-value

        Args:
            states, actions, rewards, next_states, dones: batched joint experience of all agents.
        """
        # split the joint experience into per-agent fields
        experience_unpacks = ExperienceUnpack(states, actions, rewards, next_states, dones,
                                              self.state_size, self.action_size, self.num_agents)

        # unpack the fields belonging to agent i
        if self.index_agent is None:
            self.index_agent = self.game.index_of_agent(self)

        states_i, actions_i, rewards_i, next_states_i, dones_i = experience_unpacks[self.index_agent]

        # assert (states_i.shape[1] == self.state_size), 'Wrong shape of states_i'
        # assert (actions_i.shape[1] == self.action_size), 'Wrong shape of actions_i'
        # assert (rewards_i.shape[1] == 1), 'Wrong shape of rewards_i'
        # assert (dones_i.shape[1] == 1), 'Wrong shape of dones_i'

        # train critic
        # loss = Q_target (TD 1-step bootstrapping) - Q_local (current)
        next_actions = self.forward_all(next_states)
        assert (next_actions.shape[1] == (self.action_size * self.num_agents)), 'Wrong shape of next_actions'

        Q_targets_next = self.critic_target(next_states, next_actions)
        Q_target_i = rewards_i + (self.gamma * Q_targets_next * (1 - dones_i))
        Q_expected = self.critic_local(states, actions)

        critic_loss = F.mse_loss(Q_expected, Q_target_i)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.max_norm)
        self.critic_optimizer.step()

        # train actor
        actions_pred = self.forward_all(states)
        # use the re-predicted joint actions so the policy gradient flows through this agent's actor
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # # update critic
        # self.soft_update(self.critic_local, self.critic_target, self.tau)
        # # update actor
        # self.soft_update(self.actor_local, self.actor_target, self.tau)

        # ------ update noise ------ #
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_end)
        self.noise.reset()

    def update_targets(self):
        # update critic
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        # update actor
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def model_dicts(self):
        m_dicts = {'critic_{}'.format(self.name): self.critic_target,
                   'actor_{}'.format(self.name): self.actor_target}
        return m_dicts
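# forward_all() and query() above rely on a Game mediator and an ActionQuery parcel
# that are defined elsewhere in this repo. The following is only a minimal sketch of
# the interface they appear to assume (method names taken from the calls above;
# everything else is an assumption, not the repo's actual implementation).
class ActionQuery:
    """Parcel passed along the broker chain; each agent appends its next_actions to it."""

    def __init__(self):
        # becomes a (batch, action_size * num_agents) tensor once every agent has responded
        self.next_actions = None


class Game:
    """Mediator that MADDPGAgentVersion5 iterates over in forward_all()."""

    def __init__(self):
        self.agents = []

    def add_agent(self, agent):
        self.agents.append(agent)

    def index_of_agent(self, agent):
        # used by learn() to locate this agent's slice of the joint experience
        return self.agents.index(agent)

    def __iter__(self):
        # forward_all() enumerates the game to visit every agent in a fixed order
        return iter(self.agents)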
class DDPGAgentVersion1(BaseAgent):
    def __init__(self, state_size, action_size, random_seed,
                 lr_actor=1e-2, lr_critic=1e-2,
                 fc1_units=128, fc2_units=128,
                 buffer_size=int(1e6), batch_size=50,
                 gamma=0.95, tau=1e-2, max_norm=1.0,
                 learn_period=100, learn_sampling_num=50,
                 adam_critic_weight_decay=0.0, name=None,
                 exploration_mu=0.0, exploration_sigma=0.2, exploration_theta=0.15):
        """Initialize an Agent object.

        Args:
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.max_norm = max_norm

        self.learn_period = learn_period
        self.learn_sampling_num = learn_sampling_num

        # Actor network (with target network)
        self.actor_local = DDPGActorVersion1(state_size, action_size, random_seed,
                                             fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.actor_target = DDPGActorVersion1(state_size, action_size, random_seed,
                                              fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic network (with target network)
        self.critic_local = DDPGCriticVersion1(state_size, action_size, random_seed,
                                               fcs1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.critic_target = DDPGCriticVersion1(state_size, action_size, random_seed,
                                                fcs1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic,
                                           weight_decay=adam_critic_weight_decay)

        # Noise process for actions
        # defaults follow (Timothy Lillicrap, 2016): mu = 0, theta = 0.15, sigma = 0.2
        self.exploration_mu = exploration_mu
        self.exploration_theta = exploration_theta
        self.exploration_sigma = exploration_sigma
        self.noise = OUNoise(action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed, device)

        # discount factor
        self.gamma = gamma

        # soft update parameter
        self.tau = tau

        self.batch_size = batch_size
        self.name = name
        self.time_step = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random samples from the buffer to learn."""
        self.time_step += 1

        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if (len(self.memory) > self.batch_size) and (self.time_step % self.learn_period == 0):
            for _ in range(self.learn_sampling_num):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Return actions for the given state as per the current policy."""
        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using the given batch of experience tuples.

        Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Args:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tensors
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # train critic
        # loss = Q_target (TD 1-step bootstrapping) - Q_local (current)
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)

        critic_loss = F.mse_loss(Q_expected, Q_targets)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.max_norm)
        self.critic_optimizer.step()

        # train actor (policy gradient)
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # update critic_target
        self.soft_update(self.critic_local, self.critic_target, self.tau)

        # update actor_target
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # ------ update noise ------ #
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def model_dicts(self):
        return {'agent_{}_actor'.format(self.name): self.actor_target,
                'agent_{}_critic'.format(self.name): self.critic_target}
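# DDPGAgentVersion1 above stores transitions in a ReplayBuffer that is not shown in
# this section. The stand-in below is only a minimal sketch of the interface the agent
# uses (add, sample, __len__); the repo's own implementation may differ in detail.
from collections import deque, namedtuple

Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])


class ReplayBuffer:
    """Fixed-size buffer of experience tuples (assumed interface, not the repo's class)."""

    def __init__(self, action_size, buffer_size, batch_size, seed, device):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)

        def to_tensor(rows):
            return torch.from_numpy(np.vstack(rows)).float().to(self.device)

        states = to_tensor([e.state for e in batch])
        actions = to_tensor([e.action for e in batch])
        rewards = to_tensor([e.reward for e in batch])
        next_states = to_tensor([e.next_state for e in batch])
        dones = torch.from_numpy(
            np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(self.device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)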
class Brain:
    def __init__(self, agent_count, observation_size, action_size,
                 actor_optim_params, critic_optim_params,
                 soft_update_tau, discount_gamma, use_batch_norm, seed,
                 actor_network_states, critic_network_states, device):
        self._soft_update_tau = soft_update_tau
        self._gamma = discount_gamma

        # actor networks
        self._actor_local = ActorNetwork(
            observation_size, action_size, use_batch_norm, seed
        ).to(device)
        self._actor_target = ActorNetwork(
            observation_size, action_size, use_batch_norm, seed
        ).to(device)

        # critic networks
        self._critic_local = CriticNetwork(
            observation_size * agent_count, action_size * agent_count, use_batch_norm, seed
        ).to(device)
        self._critic_target = CriticNetwork(
            observation_size * agent_count, action_size * agent_count, use_batch_norm, seed
        ).to(device)

        # optimizers
        self._actor_optimizer = optim.Adam(
            self._actor_local.parameters(), **actor_optim_params
        )
        self._critic_optimizer = optim.Adam(
            self._critic_local.parameters(), **critic_optim_params
        )

        if actor_network_states is not None:
            self._actor_local.load_state_dict(actor_network_states[0])
            self._actor_target.load_state_dict(actor_network_states[1])

        if critic_network_states is not None:
            self._critic_local.load_state_dict(critic_network_states[0])
            self._critic_target.load_state_dict(critic_network_states[1])

        self.noise = OUNoise(action_size, seed)

    def get_actor_model_states(self):
        return self._actor_local.state_dict(), self._actor_target.state_dict()

    def get_critic_model_states(self):
        return self._critic_local.state_dict(), self._critic_target.state_dict()

    def act(self, observation, target=False, noise=0.0, train=False):
        """
        :param observation: tensor of shape == (b, observation_size)
        :param target: True to evaluate with the target network
        :param noise: OU noise factor
        :param train: True for training mode, else eval mode
        :return: action tensor of shape == (b, action_size)
        """
        actor = self._actor_target if target else self._actor_local

        if train:
            actor.train()
        else:
            actor.eval()

        action_values = actor(observation)

        if noise > 0:
            noise = torch.tensor(
                noise * self.noise.sample(),
                dtype=observation.dtype,
                device=observation.device
            )
        else:
            noise = 0

        return action_values + noise

    def update_actor(self, all_obs, all_pred_actions):
        """
        Update the actor.

        :param all_obs: array of shape == (b, observation_size * n_agents)
        :param all_pred_actions: array of shape == (b, action_size * n_agents)
        """
        actor_loss = -self._critic_local(all_obs, all_pred_actions).mean()

        self._actor_optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self._actor_optimizer.step()

    def update_critic(self, rewards, dones, all_obs, all_actions, all_next_obs, all_next_actions):
        """
        Update the critic, which receives the observations and actions of all agents as input.

        :param rewards: array of shape == (b, 1)
        :param dones: array of shape == (b, 1)
        :param all_obs: array of shape == (b, n_agents, observation_size)
        :param all_actions: array of shape == (b, n_agents, action_size)
        :param all_next_obs: array of shape == (b, n_agents, observation_size)
        :param all_next_actions: array of shape == (b, n_agents, action_size)
        """
        with torch.no_grad():
            q_target_next = self._critic_target(all_next_obs, all_next_actions)

        q_target = rewards + self._gamma * q_target_next * (1 - dones)
        q_expected = self._critic_local(all_obs, all_actions)

        # mse loss, manual calculation due to mse_loss bug, as of 0.4.1
        # https://github.com/pytorch/pytorch/issues/10148
        # critic_loss = F.mse_loss(q_expected, q_target.detach())
        critic_loss = ((q_expected - q_target.detach()) ** 2).mean()

        self._critic_optimizer.zero_grad()
        critic_loss.backward()
        self._critic_optimizer.step()

    def update_targets(self):
        self._soft_update(self._actor_local, self._actor_target, self._soft_update_tau)
        self._soft_update(self._critic_local, self._critic_target, self._soft_update_tau)

    def reset(self):
        self.noise.reset()

    @staticmethod
    def _soft_update(local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ * θ_local + (1 - τ) * θ_target

        :param local_model: model weights will be copied from
        :param target_model: model weights will be copied to
        :param tau: interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
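# Every agent in this section draws exploration noise from an OUNoise class defined
# elsewhere in the repo. Note the two constructor forms used above: the DDPG/MADDPG
# agents call OUNoise(size, mu, theta, sigma), while Brain calls OUNoise(action_size, seed).
# The sketch below only illustrates the (size, mu, theta, sigma) form of a standard
# Ornstein-Uhlenbeck process; it is an assumption, not the repo's implementation.
import copy


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated noise for continuous actions."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Apply the OU update dx = theta * (mu - x) + sigma * N(0, 1) and return the new state."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state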
class DDPGAgentVersion5(BaseAgent):
    def __init__(self, state_size, action_size, random_seed,
                 lr_actor=1e-2, lr_critic=1e-2,
                 fc1_units=128, fc2_units=128,
                 buffer_size=int(1e6), batch_size=50,
                 gamma=0.95, tau=1e-2, max_norm=1.0,
                 learn_period=100, learn_sampling_num=50,
                 adam_critic_weight_decay=0.0, name=None,
                 exploration_mu=0.0, exploration_sigma=0.2, exploration_theta=0.15,
                 epsilon_start=1.0, epsilon_end=0.1, epsilon_decay=0.99):
        """Initialize an Agent object.

        Args:
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.max_norm = max_norm

        self.learn_period = learn_period
        self.learn_sampling_num = learn_sampling_num

        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay

        # Actor network (with target network)
        self.actor_local = DDPGActorVersion1(state_size, action_size, random_seed,
                                             fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.actor_target = DDPGActorVersion1(state_size, action_size, random_seed,
                                              fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic network (with target network)
        self.critic_local = DDPGCriticVersion1(state_size, action_size, random_seed,
                                               fcs1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.critic_target = DDPGCriticVersion1(state_size, action_size, random_seed,
                                                fcs1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic,
                                           weight_decay=adam_critic_weight_decay)

        # Noise process for actions
        # defaults follow (Timothy Lillicrap, 2016): mu = 0, theta = 0.15, sigma = 0.2
        self.exploration_mu = exploration_mu
        self.exploration_theta = exploration_theta
        self.exploration_sigma = exploration_sigma
        self.noise = OUNoise(action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        # self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed, device)
        self.memory = PrioritizedReplayBuffer(action_size, buffer_size, batch_size, random_seed, device)

        # Prioritized replay buffer parameters
        # self.a, self.b = 0.7, 0.5  # rank-based variant
        self.a, self.b = 0.6, 0.4    # proportional variant
        self.e = 1e-3                # 0.01 * (reward of each time step) = 0.01 * 0.1

        # discount factor
        self.gamma = gamma

        # soft update parameter
        self.tau = tau

        self.batch_size = batch_size
        self.name = name
        self.time_step = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random samples from the buffer to learn."""
        self.time_step += 1

        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if (len(self.memory) > self.batch_size) and (self.time_step % self.learn_period == 0):
            for _ in range(self.learn_sampling_num):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Return actions for the given state as per the current policy."""
        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.epsilon * self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using the given batch of experience tuples.

        Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Args:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done, indices, probs)
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, indices, probs = experiences

        # train critic
        # loss = Q_target (TD 1-step bootstrapping) - Q_local (current)
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)

        # per-sample squared errors, so the importance-sampling weights can be applied before the mean
        critic_loss = F.mse_loss(Q_expected, Q_targets, reduction='none')

        # compute TD error (delta) for updating the prioritized replay buffer
        abs_td_error = torch.abs(Q_targets - Q_expected)

        # Calculate importance-sampling weights: w_i = (N * P(i))^(-b), normalised by max(w)
        if probs:
            weights = (len(self.memory) * np.array(probs).reshape(-1, 1)) ** (-self.b)
            weights /= np.max(weights)
        else:
            weights = np.ones(critic_loss.shape, dtype=np.float64)

        # Calculate weighted loss
        weighted_critic_loss = torch.mean(torch.from_numpy(weights).float().to(device) * critic_loss)

        self.critic_optimizer.zero_grad()
        weighted_critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.max_norm)
        self.critic_optimizer.step()

        if indices:
            # convert TD errors to priorities and update them
            self.memory.update(
                indices,
                list(abs_td_error.detach().to('cpu').numpy().squeeze() ** self.a + self.e))

        # train actor (policy gradient)
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # update critic_target
        self.soft_update(self.critic_local, self.critic_target, self.tau)

        # update actor_target
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # ------ update noise ------ #
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_end)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def model_dicts(self):
        return {'agent_{}_actor'.format(self.name): self.actor_target,
                'agent_{}_critic'.format(self.name): self.critic_target}
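# For reference, the importance-sampling weights used in DDPGAgentVersion5.learn() follow
# the usual prioritized-replay correction w_i = (N * P(i))^(-beta), normalised by the largest
# weight in the batch. A standalone numerical illustration with made-up values:
def _is_weight_demo():
    N = 4                                     # buffer size, i.e. len(self.memory)
    beta = 0.4                                # matches self.b above
    probs = np.array([0.4, 0.3, 0.2, 0.1])    # made-up sampling probabilities P(i)

    weights = (N * probs) ** (-beta)
    weights /= weights.max()
    # rarely sampled transitions receive the largest correction
    print(weights.round(3))                   # [0.574 0.644 0.758 1.   ]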
class DDPG():
    """Reinforcement learning agent that learns using DDPG (Keras models)."""

    def __init__(self, task, buffer_size, batch_size, gamma, tau,
                 actor_dropout, critic_dropout,
                 exploration_theta, exploration_sigma, actor_lr, critic_lr):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.actor_dropout = actor_dropout
        self.critic_dropout = critic_dropout
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Actor (policy) model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low,
                                 self.action_high, self.actor_dropout, self.actor_lr)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low,
                                  self.action_high, self.actor_dropout, self.actor_lr)

        # Critic (value) model
        self.critic_local = Critic(self.state_size, self.action_size, self.critic_dropout, self.critic_lr)
        self.critic_target = Critic(self.state_size, self.action_size, self.critic_dropout, self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 5
        self.exploration_theta = exploration_theta
        self.exploration_sigma = exploration_sigma
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.memory = PrioritizedReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau      # for soft update of target parameters

        self.best_score = -np.inf

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.total_reward = 0.0
        self.count = 0
        return state

    def step(self, action, reward, next_state, done):
        # Generate the predictions needed to calculate the TD error
        next_state_predict = np.reshape(next_state, [-1, self.state_size])
        last_state_predict = np.reshape(self.last_state, [-1, self.state_size])
        action_predict = np.reshape(action, [-1, self.action_size])

        Q_target_next = self.critic_target.model.predict([next_state_predict, action_predict])[0]
        Q_local = self.critic_local.model.predict([last_state_predict, action_predict])[0]

        # Calculate the TD error in order to generate the priority value of the experience
        # (no bootstrapping past terminal states)
        td_error = reward + self.gamma * Q_target_next * (1 - done) - Q_local

        # Optionally normalize the TD error with tanh, as advised by DeepMind's
        # "Prioritized Experience Replay" paper: https://arxiv.org/pdf/1511.05952.pdf
        # td_error = math.tanh(td_error[0])

        # Save experience / reward together with its priority
        self.memory.add(self.last_state, action, reward, next_state, done, abs(td_error[0]))

        self.total_reward += reward
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences, idx_sample, is_weights = self.memory.sample_priority()
            self.learn(experiences, idx_sample, is_weights)

        # Roll over last state
        self.last_state = next_state

    def act(self, state, test=False):
        """Return actions for the given state(s) as per the current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]

        if not test:
            return list(action + self.noise.sample())  # add some noise for exploration
        else:
            return list(action)

    def learn(self, experiences, idx_sample, is_weights):
        """Update policy and value parameters using the given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])
        is_weights = is_weights.reshape(-1, 1)

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train the local critic model; the
        # importance-sampling weights scale the per-sample loss rather than the targets
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets,
                                               sample_weight=is_weights.flatten())

        # Train the local actor model
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        # Generate the new TD error values and update the priorities in the replay buffer
        Q_expected = self.critic_local.model.predict_on_batch([states, actions])
        td_error = Q_targets - Q_expected
        # Optionally normalize the TD error with tanh, as advised by DeepMind's
        # "Prioritized Experience Replay" paper: https://arxiv.org/pdf/1511.05952.pdf
        # td_error = np.tanh(td_error)
        self.memory.update_priority(idx=idx_sample, error=td_error)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def test_control(self, file_output='data.txt'):
        """Run one greedy episode and record the quadcopter state (useful for testing)."""
        state = self.reset_episode()
        done = False

        # Labels for the recorded quadcopter state
        labels = ['time', 'x', 'y', 'z', 'phi', 'theta', 'psi',
                  'x_velocity', 'y_velocity', 'z_velocity',
                  'phi_velocity', 'theta_velocity', 'psi_velocity',
                  'rotor_speed1', 'rotor_speed2', 'rotor_speed3', 'rotor_speed4']
        results = {x: [] for x in labels}

        # Run the simulation and save the results.
        with open(file_output, 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(labels)

            while True:
                action = self.act(state, test=True)
                next_state, reward, done = self.task.step(action)
                state = next_state

                to_write = [self.task.sim.time] + list(self.task.sim.pose) + \
                           list(self.task.sim.v) + list(self.task.sim.angular_v) + list(action)
                for ii in range(len(labels)):
                    results[labels[ii]].append(to_write[ii])
                writer.writerow(to_write)

                if done:
                    break

        # Show the results of the control run
        control_results(results)

    def update_score(self):
        self.score = self.total_reward / float(self.count) if self.count else 0.0

        if self.score > self.best_score:
            self.best_score = self.score
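# Hypothetical training loop for the Keras DDPG agent above. The Task instance and every
# hyper-parameter value here are illustrative assumptions, not values taken from this repo.
def train_ddpg(task, n_episodes=500):
    # task is assumed to expose state_size, action_size, action_low/high, reset() and step()
    agent = DDPG(task=task, buffer_size=int(1e5), batch_size=64, gamma=0.99, tau=0.01,
                 actor_dropout=0.2, critic_dropout=0.2,
                 exploration_theta=0.15, exploration_sigma=0.2,
                 actor_lr=1e-4, critic_lr=1e-3)

    for episode in range(1, n_episodes + 1):
        state = agent.reset_episode()
        done = False
        while not done:
            action = agent.act(state)
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)
            state = next_state

        agent.update_score()
        print('Episode {:4d}  avg reward per step: {:.3f}  best: {:.3f}'.format(
            episode, agent.score, agent.best_score))

    return agent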