class AgentDDPG:
    def __init__(self, state_size, action_size, seed):
        """
        :state_size: size of the state vector
        :action_size: size of the action vector
        """
        self.state_size = state_size
        self.action_size = action_size
        self.t_step = 0
        self.score = 0.0
        self.best = 0.0
        self.seed = seed
        self.total_reward = 0.0
        self.count = 0
        self.learning_rate_actor = 0.0001
        self.learning_rate_critic = 0.001
        self.batch_size = 128
        self.update_every = 1

        # Instances of the policy function (actor) and the value function (critic).
        # Actor local and target network definitions
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.seed).to(device)

        # Critic local and target networks
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.seed).to(device)

        # Actor optimizer
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate_actor)
        # Critic optimizer
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.learning_rate_critic)

        # Make sure local and target networks start with the same weights
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Initialize the Ornstein-Uhlenbeck noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Initialize the replay memory
        self.buffer_size = 1000000
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # Discount factor
        self.tau = 0.001   # Soft-update factor for the target network parameters

    # The agent interacts with the environment through the step method
    def step(self, state, action, reward, next_state, done):
        # Add this time step's reward to the total reward
        self.total_reward += reward
        # Count the number of rewards received in the episode
        self.count += 1

        # Store the experience tuple in the replay buffer
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # Check whether there are enough samples to produce a batch
            # and learn from it
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                # Train the networks using the sampled experiences
                self.learn(experiences)

        # Roll over last state action (not needed)
        # self.last_state = next_state

    # The actor decides what to do based on the policy
    def act(self, state):
        # Given a state, return the action recommended by the policy.
        # Reshape the state to fit the torch tensor input.
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Pass the state to the actor local model to get the action
        # the policy recommends for this state.
        # Set the actor_local model to evaluation mode so this forward
        # pass is not counted in the gradient calculation.
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state)
        # Set the model back to training mode
        self.actor_local.train()

        # Because we are exploring, we add some noise to the action vector
        return list(actions.detach().cpu().numpy().reshape(4,) + self.noise.sample())

    # This is the learning logic called when the agent takes a step to learn
    def learn(self, experiences):
        """
        Learning means that the network parameters need to be updated
        using the experience batch. The networks learn from sampled
        experiences, not from direct interaction with the environment.
        """
        # Reshape the experience tuples into separate arrays of states, actions,
        # rewards, next_states and dones.
        # Every member of the tuple is converted into a column vector.
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None
                          ]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Convert the numpy arrays for states, actions and next_states to torch
        # tensors; rewards and dones do not need to be tensors.
        states = torch.from_numpy(states).float().unsqueeze(0).to(device)
        actions = torch.from_numpy(actions).float().unsqueeze(0).to(device)
        next_states = torch.from_numpy(next_states).float().unsqueeze(0).to(device)

        # First we pass a batch of next states to the actor so it tells us which
        # actions to execute. We use the actor target network instead of the actor
        # local network so the bootstrapped targets stay stable while the local
        # networks are being optimized.
        # Set the target network to evaluation mode because this is not part of the
        # training: its weights are altered by a soft update, not by an optimizer.
        self.actor_target.eval()
        with torch.no_grad():
            next_state_actions = self.actor_target(next_states).detach()
        self.actor_target.train()

        # The critic evaluates the actions taken by the actor in the next state and
        # generates the Q(s, a) value of the next state given those actions. These
        # (action, next_state) pairs come from the replay buffer, not from
        # interacting with the environment.
        # Remember the critic (Q-value function) takes states and actions as input.
        # We calculate the Q-targets of the next state and use them to compute the
        # current state's Q-value via the Bellman equation.
        # Set the target network to evaluation mode because this is not part of the
        # training: its weights are altered by a soft update, not by an optimizer.
        self.critic_target.eval()
        with torch.no_grad():
            q_targets_next_state_action_values = self.critic_target(
                next_states, next_state_actions).detach()
        self.critic_target.train()

        # With the next-state Q-values (a vector of action values Q(s, a) for the
        # randomly sampled next_states from the replay buffer), we calculate the
        # current state's target Q(s, a) using the one-step TD equation and the
        # Q-targets we got from the critic_target network. Terminal states get a
        # target Q(s, a) of 0; non-terminal states use the bootstrapped Q-target
        # value. These are the target values used to train the critic_local model
        # in a supervised-learning fashion.
        q_targets = torch.from_numpy(
            rewards + self.gamma * q_targets_next_state_action_values.cpu().numpy() *
            (1 - dones)).float()

        # --- Optimize the local critic model --- #
        # Here we start the supervised training of the critic_local network:
        # we pass a batch of (state, action) samples and it produces the expected
        # Q-value of each action we passed.
        q_expected = self.critic_local(states, actions)
        # Clear the gradient buffers in preparation
        self.critic_optimizer.zero_grad()
        # Loss function for the critic_local model: smooth L1 (Huber) loss between
        # the expected Q-values and the target Q-values.
        critic_loss = F.smooth_l1_loss(q_expected, q_targets)
        critic_loss.backward(retain_graph=True)
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        # Optimize the critic_local model using the optimizer defined for the
        # critic in the __init__ method of this class
        self.critic_optimizer.step()

        # --- Optimize the local actor model --- #
        # Get the actor actions for the states sampled from the experience buffer
        actor_actions = self.actor_local(states)
        # Use as the loss the negative sum of the Q-values produced by the
        # optimized critic_local model, given the actions the actor_local model
        # proposes for the sampled states.
        loss_actor = -1 * torch.sum(self.critic_local.forward(states, actor_actions))
        # Set the model gradients to zero in preparation
        self.actor_optimizer.zero_grad()
        # Backpropagate
        loss_actor.backward()
        # Optimize the actor_local model using the optimizer defined for the actor
        # in the __init__ method of this class
        self.actor_optimizer.step()

        # Soft-update the target models
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def get_episode_score(self):
        """
        Calculate the episode score
        :return: None
        """
        # Update the running score and the best score
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best:
            self.best = self.score

    def save_model_weights(self):
        torch.save(self.actor_local.state_dict(), './checkpoints.pkl')
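
# The OUNoise class used by AgentDDPG is not defined in this section. Below is a
# minimal Ornstein-Uhlenbeck noise sketch that matches the constructor call
# OUNoise(size, mu, theta, sigma) and the .sample() usage above; the internal
# state handling is an assumption, not the original implementation.
import copy
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Reset the internal state back to the mean
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1), then x <- x + dx
        x = self.state
        dx = self.theta * (self.mu - x) + \
            self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state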
class Agent(object):
    def __init__(self, n_states, n_actions, lr_actor, lr_critic, tau, gamma,
                 mem_size, actor_l1_size, actor_l2_size, critic_l1_size,
                 critic_l2_size, batch_size):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(mem_size, n_states, n_actions)
        self.batch_size = batch_size

        self.actor = Actor(lr_actor, n_states, n_actions,
                           actor_l1_size, actor_l2_size)
        self.critic = Critic(lr_critic, n_states, n_actions,
                             critic_l1_size, critic_l2_size)
        self.target_actor = Actor(lr_actor, n_states, n_actions,
                                  actor_l1_size, actor_l2_size)
        self.target_critic = Critic(lr_critic, n_states, n_actions,
                                    critic_l1_size, critic_l2_size)

        self.noise = OUActionNoise(mu=np.zeros(n_actions), sigma=0.005)

        # Hard-copy the online weights into the target networks (tau=1)
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        self.actor.eval()
        observation = torch.tensor(observation,
                                   dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)
        # Add noise to the action for exploration
        mu_prime = mu + torch.tensor(self.noise(),
                                     dtype=torch.float).to(self.actor.device)
        self.actor.train()
        return mu_prime.cpu().detach().numpy()

    def choose_action_no_train(self, observation):
        self.actor.eval()
        observation = torch.tensor(observation,
                                   dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)
        return mu.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        self.memory.push(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.idx_last < self.batch_size:
            # Not enough data in the replay buffer yet
            return

        # Sample a random batch of stored transitions
        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size)
        reward = torch.tensor(reward, dtype=torch.float).to(self.critic.device)
        done = torch.tensor(done).to(self.critic.device)
        new_state = torch.tensor(new_state,
                                 dtype=torch.float).to(self.critic.device)
        action = torch.tensor(action, dtype=torch.float).to(self.critic.device)
        state = torch.tensor(state, dtype=torch.float).to(self.critic.device)

        self.target_actor.eval()
        self.target_critic.eval()
        self.critic.eval()

        target_actions = self.target_actor.forward(new_state)
        critic_value_ = self.target_critic.forward(new_state, target_actions)
        critic_value = self.critic.forward(state, action)

        # Build the one-step TD targets from the target networks
        target = []
        for j in range(self.batch_size):
            target.append(reward[j] + self.gamma * critic_value_[j] * done[j])
        target = torch.tensor(target).to(self.critic.device)
        target = target.view(self.batch_size, 1)

        # Critic update
        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        # Actor update
        self.critic.eval()
        self.actor.optimizer.zero_grad()
        mu = self.actor.forward(state)
        self.actor.train()
        actor_loss = -self.critic.forward(state, mu)
        actor_loss = torch.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_dict = dict(target_critic_params)
        target_actor_dict = dict(target_actor_params)

        # Soft-update the target critic: theta_target = tau*theta + (1-tau)*theta_target
        for name in critic_state_dict:
            critic_state_dict[name] = tau * critic_state_dict[name].clone() + \
                (1 - tau) * target_critic_dict[name].clone()
        self.target_critic.load_state_dict(critic_state_dict)
        # Soft-update the target actor in the same way
        for name in actor_state_dict:
            actor_state_dict[name] = tau * actor_state_dict[name].clone() + \
                (1 - tau) * target_actor_dict[name].clone()
        self.target_actor.load_state_dict(actor_state_dict)

    def save_models(self):
        timestamp = time.strftime("%Y%m%d-%H%M%S")
        self.actor.save("actor_" + timestamp)
        self.target_actor.save("target_actor_" + timestamp)
        self.critic.save("critic_" + timestamp)
        self.target_critic.save("target_critic_" + timestamp)

    def load_models(self, fn_actor, fn_target_actor, fn_critic,
                    fn_target_critic):
        self.actor.load_checkpoint(fn_actor)
        self.target_actor.load_checkpoint(fn_target_actor)
        self.critic.load_checkpoint(fn_critic)
        self.target_critic.load_checkpoint(fn_target_critic)
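
# A minimal training-loop sketch for the Agent class above, showing how
# choose_action / remember / learn are meant to be driven. Everything here is
# an illustrative assumption: the environment ("Pendulum-v1"), the older Gym
# API (reset returning only the observation, step returning a 4-tuple), the
# hyperparameter values, and the availability of Actor/Critic/ReplayBuffer.
import gym

env = gym.make("Pendulum-v1")
agent = Agent(n_states=env.observation_space.shape[0],
              n_actions=env.action_space.shape[0],
              lr_actor=1e-4, lr_critic=1e-3, tau=0.001, gamma=0.99,
              mem_size=1_000_000, actor_l1_size=400, actor_l2_size=300,
              critic_l1_size=400, critic_l2_size=300, batch_size=64)

for episode in range(100):
    state = env.reset()
    done, score = False, 0.0
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        agent.learn()
        score += reward
        state = next_state
    print(f"episode {episode} score {score:.2f}")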
class Agent:
    def __init__(self, device, state_size, action_size,
                 buffer_size=10, batch_size=10,
                 actor_learning_rate=1e-4, critic_learning_rate=1e-3,
                 discount_rate=0.99, tau=0.1, steps_per_update=4,
                 action_range=None, dropout_p=0.0, weight_decay=0.0001,
                 noise_max=0.2, noise_decay=1.0, n_agents=1):
        self.device: torch.device = device
        self.state_size = state_size
        self.action_size = action_size

        self.critic_control = Critic(state_size, action_size).to(device)
        self.critic_control.dropout.p = dropout_p
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_target.eval()
        self.critic_optimizer = torch.optim.Adam(
            self.critic_control.parameters(),
            weight_decay=weight_decay,
            lr=critic_learning_rate)

        self.actor_control = Actor(state_size, action_size,
                                   action_range).to(device)
        self.actor_control.dropout.p = dropout_p
        self.actor_target = Actor(state_size, action_size,
                                  action_range).to(device)
        self.actor_target.eval()
        self.actor_optimizer = torch.optim.Adam(
            self.actor_control.parameters(),
            weight_decay=weight_decay,
            lr=actor_learning_rate)

        self.batch_size = batch_size
        self.min_buffer_size = batch_size
        self.replay_buffer = ReplayBuffer(device, state_size, action_size,
                                          buffer_size)

        self.discount_rate = discount_rate
        self.tau = tau
        self.step_count = 0
        self.steps_per_update = steps_per_update

        self.noise_max = noise_max
        self.noise = OUNoise([n_agents, action_size], 15071988,
                             sigma=self.noise_max)
        self.noise_decay = noise_decay
        self.last_score = float('-inf')

    def policy(self, state, add_noise=True):
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_control.eval()
        with torch.no_grad():
            action = self.actor_control(state).cpu().numpy()
        self.actor_control.train()
        if add_noise:
            noise = self.noise.sample()
            action += noise
        return action

    def step(self, state, action, reward, next_state, done):
        # Store each parallel agent's transition with its initial priority
        p = self.calculate_p(state, action, reward, next_state, done)
        for i in range(state.shape[0]):
            self.replay_buffer.add(state[i, :], action[i, :], reward[i],
                                   next_state[i, :], done[i], p[i])

        if self.step_count % self.steps_per_update == 0:
            self.learn()
        self.step_count += 1

    def learn(self):
        if len(self.replay_buffer) < self.min_buffer_size:
            return

        indices, (states, actions, rewards, next_states, dones, p) = \
            self.replay_buffer.sample(self.batch_size)

        self.actor_control.eval()
        error = self.bellman_eqn_error(states, actions, rewards, next_states,
                                       dones)
        self.actor_control.train()

        # Importance-sampling weights for the prioritized replay samples
        importance_scaling = (self.replay_buffer.buffer_size * p) ** -1
        importance_scaling /= importance_scaling.max()

        # Critic update: importance-weighted mean squared Bellman error
        self.critic_optimizer.zero_grad()
        loss = (importance_scaling * (error ** 2)).sum() / self.batch_size
        loss.backward()
        self.critic_optimizer.step()

        # Actor update: maximize the critic's score of the actor's actions
        self.actor_optimizer.zero_grad()
        expected_actions = self.actor_control(states)
        critic_score = self.critic_control(states, expected_actions)
        loss = -1 * (importance_scaling * critic_score).sum() / self.batch_size
        loss.backward()
        self.actor_optimizer.step()

        self.update_target(self.critic_control, self.critic_target)
        self.update_target(self.actor_control, self.actor_target)

        # Refresh the priorities of the sampled transitions
        self.replay_buffer.update(indices, error.detach().abs().cpu() + 1e-3)

    def bellman_eqn_error(self, states, actions, rewards, next_states, dones):
        """Double-DQN-style error: use the control network to pick the next
        action and apply the target network to it to get the target value,
        which is used for the Bellman equation error.
""" next_actions = self.actor_control(next_states) target_action_values = self.critic_target(next_states, next_actions) target_rewards = ( rewards + self.discount_rate * (1 - dones) * target_action_values ) current_rewards = self.critic_control(states, actions) error = current_rewards - target_rewards return error def calculate_p(self, state, action, reward, next_state, done): next_state = torch.from_numpy(next_state).float().to( self.device) state = torch.from_numpy(state).float().to(self.device) action = torch.from_numpy(action).float().to(self.device) reward = torch.from_numpy(reward).float().to(self.device) done = torch.from_numpy(done).float().to( self.device) done = done.unsqueeze(1) reward = reward.unsqueeze(1) self.actor_control.eval() self.critic_control.eval() with torch.no_grad(): retval = abs( self.bellman_eqn_error(state, action, reward, next_state, done)) + 1e-3 self.critic_control.train() self.actor_control.train() return retval def update_target(self, control, target): for target_param, control_param in zip( target.parameters(), control.parameters()): target_param.data.copy_( self.tau * control_param.data + (1.0 - self.tau) * target_param.data) def end_of_episode(self, final_score): self.step_count = 0 self.noise.sigma *= self.noise_decay self.last_score = final_score self.noise.reset() def save(self, path): torch.save(self.critic_control.state_dict(), path + '-critic.p') torch.save(self.actor_control.state_dict(), path + '-actor.p') def restore(self, path): self.critic_control.load_state_dict( torch.load(path + '-critic.p', map_location='cpu')) self.actor_control.load_state_dict( torch.load(path + '-actor.p', map_location='cpu'))
class Agent:
    def __init__(self, state_size, action_size):
        self._state_size = state_size
        self._action_size = action_size

        # Actor network
        self._actor_local = Actor(state_size, action_size).to(device)
        self._actor_target = Actor(state_size, action_size).to(device)
        self._actor_optimizer = optim.Adam(self._actor_local.parameters())

        # Critic network
        self._critic_local = Critic(state_size, action_size).to(device)
        self._critic_target = Critic(state_size, action_size).to(device)
        self._critic_optimizer = optim.Adam(self._critic_local.parameters())

        # Memory
        self._memory = Memory(BUFFER_SIZE)

        # Start local and target networks with equal weights
        self.hard_update(self._actor_local, self._actor_target)
        self.hard_update(self._critic_local, self._critic_target)

    def step(self, state, action, reward, next_state, done):
        self._memory.push((state, action, reward, next_state, done))
        if len(self._memory) > BATCH_SIZE:
            for _ in range(UPDATES_PER_STEP):
                samples = self._memory.sample(BATCH_SIZE)
                self.learn(samples)

    def act(self, state):
        state = torch.from_numpy(state).float().to(device)
        # With probability PROBABILITY_RAND_STEP take a uniformly random action,
        # otherwise follow the current policy
        if binom.rvs(1, PROBABILITY_RAND_STEP):
            action = np.ndarray((1,), buffer=np.array(uniform(-1, 1).rvs()))
        else:
            self._actor_local.eval()
            with torch.no_grad():
                action = self._actor_local(state).cpu().data.numpy()
            self._actor_local.train()
        return np.clip(action, -1, 1)

    def hard_update(self, local, target):
        for target_param, local_param in zip(target.parameters(),
                                             local.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local, target, tau):
        for target_param, local_param in zip(target.parameters(),
                                             local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)

    def learn(self, samples):
        states, actions, rewards, next_states, dones = samples

        # Compute the TD targets from the target networks
        actions_next = self._actor_target(next_states)
        Q_targets_next = self._critic_target(next_states, actions_next)
        Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

        # Critic update
        Q_expected = self._critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self._critic_optimizer.zero_grad()
        critic_loss.backward()
        self._critic_optimizer.step()

        # Actor update
        actions_pred = self._actor_local(states)
        actor_loss = -self._critic_local(states, actions_pred).mean()
        self._actor_optimizer.zero_grad()
        actor_loss.backward()
        self._actor_optimizer.step()

        # Soft-update the target networks
        self.soft_update(self._critic_local, self._critic_target, TAU)
        self.soft_update(self._actor_local, self._actor_target, TAU)

    def save(self):
        torch.save(self._actor_local.state_dict(), ACTOR_PATH)
        torch.save(self._critic_local.state_dict(), CRITIC_PATH)

    def load(self):
        self._actor_local.load_state_dict(torch.load(ACTOR_PATH))
        self._actor_local.eval()
        self._critic_local.load_state_dict(torch.load(CRITIC_PATH))
        self._critic_local.eval()
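
# The class above references several module-level names that are not defined in
# this section (device, hyperparameter constants, checkpoint paths, and the
# scipy/torch imports it calls). The values below are illustrative assumptions
# only (typical DDPG hyperparameters), not the original configuration; Actor,
# Critic and Memory are still assumed to be defined elsewhere.
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from scipy.stats import binom, uniform

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

BUFFER_SIZE = int(1e6)          # replay buffer capacity
BATCH_SIZE = 128                # minibatch size
UPDATES_PER_STEP = 1            # learning passes per environment step
GAMMA = 0.99                    # discount factor
TAU = 1e-3                      # soft-update rate for the target networks
PROBABILITY_RAND_STEP = 0.1     # chance of taking a uniform random action
ACTOR_PATH = "actor_checkpoint.pth"
CRITIC_PATH = "critic_checkpoint.pth"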