import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# `Actor`, `Critic`, `Noise`, and `ReplayBuffer`, along with the hyperparameters
# (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY,
# NUM_TIME_STEPS) and `device`, are assumed to be defined in companion modules.


class Agent:
    """
    Class implementation of a so-called "intelligent" agent.
    This agent interacts with and learns from the environment,
    using the DDPG algorithm to solve the task.
    """

    # Class-level Actor properties.
    # actor_local = None
    # actor_target = None
    # actor_optimizer = None

    # Class-level Critic properties.
    # critic_local = None
    # critic_target = None
    # critic_optimizer = None

    # Class-level memory variable.
    # memory = None

    def __init__(self, state_size, action_size, seed, add_noise=True):
        """
        Initialize an Agent instance.

        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            add_noise (bool): Toggle for using the stochastic process
        """
        # Set the parameters.
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Set up the Actor network (with its target network).
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        # Optimize the Actor using Adam.
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Set up the Critic network (with its target network).
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        # Optimize the Critic using Adam.
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Set up the noise process (one row per parallel agent; 20 here).
        # Keeping `self.noise` as None when disabled avoids an AttributeError
        # if act() is later called with add_noise=True.
        self.noise = Noise((20, action_size), seed) if add_noise else None

        # Use the replay memory buffer (once per class).
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   seed, device)

        # Initialize the time step (until max NUM_TIME_STEPS is reached).
        # self.t_step = 0

    def step(self, time_step, states, actions, rewards, next_states, dones):
        """
        Update the network on each step. In other words, save the experience
        in replay memory, then use random sampling from the buffer to learn.
        """
        # Save each agent's experience in replay memory.
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn every time step till NUM_TIME_STEPS is reached.
        # if time_step % NUM_TIME_STEPS != 0:
        #     return

        # Use random sampling from the buffer to learn.
        self.sample_and_learn()

    def sample_and_learn(self):
        """
        For a specified number of agents, use random sampling from the
        buffer to learn.
        """
        # If enough samples are available in memory, get a random subset
        # and learn.
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)
            # for _ in range(NUM_LEARN_UPDATES):
            #     experiences = Agent.memory.sample()
            #     self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """
        Return the actions for a given state as per the current policy.

        Params
        ======
            state (array_like): Current state
            add_noise (bool): Toggle for using the stochastic process
        """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        # If the stochastic process is enabled (and was set up in __init__).
        if add_noise and self.noise is not None:
            action += self.noise.sample()

        # Return the action, clipped to the valid range.
        return np.clip(action, -1, 1)

    def reset(self):
        """ Reset the state. """
        # Reset the internal state (noise) to mean (mu).
        if self.noise is not None:
            self.noise.reset()

    def learn(self, experiences, gamma):
        """
        Update value parameters using the given batch of experience tuples.
        i.e., Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where actor_target(state) -> action,
        and critic_target(state, action) -> Q-value.

        Params
        ======
            experiences (Tuple[torch.Tensor]): Tuple of (s, a, r, s', done) tuples
            gamma (float): Discount factor
        """
        # Unpack the batch.
        states, actions, rewards, next_states, dones = experiences

        # --- Update the Critic. ---
        # Get the predicted next-state actions and Q-values from the target
        # models, i.e. the action/value pair for each of the next_states.
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute the Q-targets for the current states, (y_i).
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute the Critic loss.
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss.
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # --- Update the Actor. ---
        # Compute the Actor loss.
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss.
        self.actor_optimizer.zero_grad()
        # torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        actor_loss.backward()
        self.actor_optimizer.step()

        # --- Update the target networks. ---
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters.
        i.e., θ_target = τ * θ_local + (1 - τ) * θ_target.

        Params
        ======
            local_model (PyTorch model): Weights will be copied from
            target_model (PyTorch model): Weights will be copied to
            tau (float): Interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1. - tau) * target_param.data)
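# The Agent above relies on `Noise` and `ReplayBuffer` helpers defined in a
# companion module. The sketch below is a minimal stand-in matching the
# constructor signatures used above: an Ornstein-Uhlenbeck noise process and a
# deque-backed replay buffer. The mu/theta/sigma defaults are illustrative
# assumptions, not values taken from the original project. (The task-based
# agents further below assume Keras-flavoured helpers with different
# signatures.)

import copy
from collections import deque, namedtuple


class Noise:
    """ Ornstein-Uhlenbeck process (sketch). """

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """ Reset the internal state (noise) to mean (mu). """
        self.state = copy.copy(self.mu)

    def sample(self):
        """ Update the internal state and return it as a noise sample. """
        x = self.state
        dx = (self.theta * (self.mu - x)
              + self.sigma * np.random.standard_normal(self.mu.shape))
        self.state = x + dx
        return self.state


class ReplayBuffer:
    """ Fixed-size buffer for storing experience tuples (sketch). """

    def __init__(self, action_size, buffer_size, batch_size, seed, device):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        self.experience = namedtuple(
            "Experience",
            ["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """ Append a new experience tuple to memory. """
        self.memory.append(
            self.experience(state, action, reward, next_state, done))

    def sample(self):
        """ Randomly sample a batch of experiences as torch tensors. """
        experiences = random.sample(self.memory, k=self.batch_size)

        def stack(values):
            return torch.from_numpy(np.vstack(values)).float().to(self.device)

        states = stack([e.state for e in experiences])
        actions = stack([e.action for e in experiences])
        rewards = stack([e.reward for e in experiences])
        next_states = stack([e.next_state for e in experiences])
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)
        ).float().to(self.device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """ Return the current number of stored experiences. """
        return len(self.memory)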
# Note: the task-based agents below use Keras-style Actor/Critic/Noise/
# ReplayBuffer variants whose signatures differ from the PyTorch helpers above.


class Christophers_Agent:
    """ DDPG agent for a single task (environment). """

    def __init__(self, task):
        # Task (environment) information.
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Weights for a simple linear policy: state_space x action_space,
        # scaled so the agent starts producing actions in a decent range.
        self.w = np.random.normal(
            size=(self.state_size, self.action_size),
            scale=(self.action_range / (2 * self.state_size)))

        # Actor and Critic networks, with their target networks.
        self.actor = Actor(self.state_size, self.action_size,
                           self.action_low, self.action_high)
        self.critic = Critic(self.state_size, self.action_size)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        self.critic_target = Critic(self.state_size, self.action_size)
        # Initialize the target networks with the local weights.
        # (Assumed fix: DDPG targets should start equal to the local models,
        # as the PolicySearch_Agent below already does.)
        self.actor_target.model.set_weights(self.actor.model.get_weights())
        self.critic_target.model.set_weights(self.critic.model.get_weights())

        self.gamma = 0.95
        self.tau = 0.001

        self.best_w = None
        self.best_score = -np.inf

        # Exploration noise process.
        self.exploration_mu = 0.5
        self.exploration_theta = 0.2
        self.exploration_sigma = 0.4
        self.noise = Noise(self.action_size, self.exploration_mu,
                           self.exploration_theta, self.exploration_sigma)

        # Replay memory.
        self.buffer_size = 100000
        self.batch_size = 32
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.num_steps = 0

        # Episode variables.
        self.reset_episode()

    def reset_episode(self):
        if self.get_score() > self.best_score:
            self.best_score = self.get_score()
        self.total_reward = 0.0
        self.num_steps = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.total_reward += reward
        self.num_steps += 1
        # Save the experience, then learn once enough samples are banked.
        self.memory.add(self.last_state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
        self.last_state = next_state

    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor.model.predict(state)[0]
        # Add some noise for exploration.
        return list(action + self.noise.sample())

    def get_score(self):
        return (-np.inf if self.num_steps == 0
                else self.total_reward / self.num_steps)

    def learn(self, experiences):
        # Unpack the batch of experience tuples into arrays.
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array(
            [e.action for e in experiences if e is not None]
        ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array(
            [e.reward for e in experiences if e is not None]
        ).astype(np.float32).reshape(-1, 1)
        dones = np.array(
            [e.done for e in experiences if e is not None]
        ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get the predicted next-state actions and Q-values from the targets.
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        # Train the local Critic, then follow its action gradients to train
        # the local Actor.
        self.critic.model.train_on_batch(x=[states, actions], y=Q_targets)
        action_gradients = np.reshape(
            self.critic.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor.train_fn([states, action_gradients, 1])

        # Soft-update the target networks.
        self.soft_update(self.critic.model, self.critic_target.model)
        self.soft_update(self.actor.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """ Soft update: θ_target = τ * θ_local + (1 - τ) * θ_target. """
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights)
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
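# Both task-based agents call `critic.get_action_gradients(...)` and
# `actor.train_fn(...)`, which their Keras `Actor`/`Critic` classes are
# assumed to expose. The sketch below shows one way to build that wiring with
# the TF1-era Keras backend API (it requires graph mode, e.g.
# tf.compat.v1.disable_eager_execution() under TF2). The layer sizes and
# learning rates are illustrative assumptions, not the original architecture.

from tensorflow.keras import layers, models, optimizers
from tensorflow.keras import backend as K


class Actor:
    """ Deterministic policy network (sketch). """

    def __init__(self, state_size, action_size, action_low, action_high):
        states = layers.Input(shape=(state_size,), name="states")
        net = layers.Dense(32, activation="relu")(states)
        raw_actions = layers.Dense(action_size, activation="sigmoid")(net)
        # Rescale the [0, 1] sigmoid output to the task's action range.
        actions = layers.Lambda(
            lambda x: x * (action_high - action_low) + action_low,
            name="actions")(raw_actions)
        self.model = models.Model(inputs=states, outputs=actions)

        # Train the actor by ascending the critic's action gradients:
        # minimize mean(-dQ/da * a) over the actor's weights.
        action_gradients = layers.Input(shape=(action_size,))
        loss = K.mean(-action_gradients * actions)
        updates_op = optimizers.Adam(learning_rate=1e-4).get_updates(
            params=self.model.trainable_weights, loss=loss)
        self.train_fn = K.function(
            inputs=[self.model.input, action_gradients, K.learning_phase()],
            outputs=[],
            updates=updates_op)


class Critic:
    """ Action-value network (sketch). """

    def __init__(self, state_size, action_size):
        states = layers.Input(shape=(state_size,), name="states")
        actions = layers.Input(shape=(action_size,), name="actions")
        net = layers.Concatenate()([
            layers.Dense(32, activation="relu")(states),
            layers.Dense(32, activation="relu")(actions)])
        Q_values = layers.Dense(1, name="q_values")(net)
        self.model = models.Model(inputs=[states, actions], outputs=Q_values)
        self.model.compile(optimizer=optimizers.Adam(learning_rate=1e-3),
                           loss="mse")

        # Expose dQ/da so the actor can follow the action-value gradient.
        action_gradients = K.gradients(self.model.output, actions)
        self.get_action_gradients = K.function(
            inputs=[*self.model.input, K.learning_phase()],
            outputs=action_gradients)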
class PolicySearch_Agent:
    """ Despite the name, this agent also learns with DDPG. """

    def __init__(self, task):
        # Task (environment) information.
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor and Critic networks, with their target networks.
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        # Initialize the target networks with the local weights.
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Random noise process for exploration.
        self.mu = 0
        self.theta = 0.2
        self.sigma = 0.005
        self.noise = Noise(self.action_size, self.mu, self.theta, self.sigma)

        self.gamma = 0.9
        self.tau = 0.1

        self.best_score = -np.inf
        self.score = 0

        # Replay memory.
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state

    def step(self, action, reward, next_state, done):
        # Save the experience, then learn once enough samples are banked.
        self.memory.add(self.last_state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
        self.last_state = next_state
        self.score += reward
        if done and self.score > self.best_score:
            self.best_score = self.score

    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # Add some noise for exploration.
        return list(action + self.noise.sample())

    def learn(self, experiences):
        # Unpack the batch of experience tuples into arrays.
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array(
            [e.action for e in experiences if e is not None]
        ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array(
            [e.reward for e in experiences if e is not None]
        ).astype(np.float32).reshape(-1, 1)
        dones = np.array(
            [e.done for e in experiences if e is not None]
        ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get the predicted next-state actions and Q-values from the targets.
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_values_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])
        Q_values = rewards + self.gamma * Q_values_next * (1 - dones)

        # Train the local Critic, then follow its action gradients to train
        # the local Actor.
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_values)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update the target networks.
        self.update(self.critic_local.model, self.critic_target.model)
        self.update(self.actor_local.model, self.actor_target.model)

    def update(self, local_model, target_model):
        """ Soft update: θ_target = τ * θ_local + (1 - τ) * θ_target. """
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights)
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
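# A hypothetical usage sketch for the two task-based agents above. It assumes
# a quadcopter-style `task` exposing reset() -> state and
# step(action) -> (next_state, reward, done); the task itself and the episode
# count are assumptions, not part of the original code.

def run(agent, task, num_episodes=500):
    """ Drive an agent/task pair through a number of episodes. """
    for episode in range(1, num_episodes + 1):
        state = agent.reset_episode()  # Also resets the noise process.
        done = False
        while not done:
            action = agent.act(state)  # Noisy action from the actor.
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)  # Store and learn.
            state = next_state
        print("Episode {:4d} | best score: {:8.3f}".format(
            episode, agent.best_score))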