import csv
from collections import namedtuple

import numpy as np

# Actor, Critic, OUNoise, ReplayBuffer, PrioritizedReplayBuffer and LinearSchedule
# are assumed to be provided by the project's supporting modules (their
# definitions are not shown in this section).


class DDPG_Agent():
    '''Reinforcement Learning agent that learns using Deep Deterministic Policy Gradients (DDPG).'''

    def __init__(self, task, learning_rate_actor=0.0001, learning_rate_critic=0.001,
                 gamma=0.99, tau=0.01, buffer_size=100000, batch_size=64,
                 exploration_mu=0, exploration_theta=0.15, exploration_sigma=0.2):
        '''
        # Arguments
            task: instance of the Task class
            learning_rate_actor: learning rate of the actor network
            learning_rate_critic: learning rate of the critic network
            gamma: discount rate
            tau: soft-update coefficient
            buffer_size: int, size of the replay buffer
            batch_size: int, size of the batch sampled from the replay buffer
        '''
        ### Task (environment) information ###
        self.task = task
        # state space
        self.state_size = task.state_size
        # action space
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        ### Actor ###
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 learning_rate_actor)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  learning_rate_actor)
        # Initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        ### Critic ###
        self.critic_local = Critic(self.state_size, self.action_size,
                                   learning_rate_critic)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    learning_rate_critic)
        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        ### Noise process for exploration ###
        self.exploration_mu = exploration_mu
        self.exploration_theta = exploration_theta
        self.exploration_sigma = exploration_sigma
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        ### Replay memory (experience replay) ###
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        ### Algorithm parameters ###
        self.gamma = gamma
        self.tau = tau

    def reset_episode(self):
        '''
        Reset the noise process and the environment.
        Run this function before starting an episode.
        '''
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        '''
        One-step experience of the agent.
        Store an (S_t, A_t, R_t+1, S_t+1) experience tuple in the replay buffer and
        update the policy and value functions using a batch of experience tuples.

        # Arguments
            action: A_t
            reward: R_t+1
            next_state: S_t+1
            done: boolean flag indicating whether the episode has ended
        '''
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        '''Returns actions for given state(s) as per current policy.'''
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from the target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train the local critic model
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train the local actor model using the critic's action gradients
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = [self.tau * local + (1 - self.tau) * target
                       for local, target in zip(local_weights, target_weights)]
        target_model.set_weights(new_weights)
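# The agents in this section assume OUNoise and ReplayBuffer helpers with the
# constructor signatures used in __init__ above. The following is a minimal
# sketch of what such helpers could look like, not the project's actual code;
# the real implementations may differ in detail.
import random
from collections import deque, namedtuple

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples them uniformly."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Add a new experience tuple to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        """Return a uniformly random batch of experiences."""
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)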
class DDPG():
    """
    Reinforcement Learning agent that learns using DDPG.
    Deep DPG as described by Lillicrap et al. (2015).
    """

    def __init__(self, task, prioritized_replay=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay buffer parameters
        self.buffer_size = 100000
        self.batch_size = 64
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_alpha = 0.6
        self.prioritized_replay_beta0 = 0.4
        self.prioritized_replay_beta_iters = None
        self.prioritized_replay_eps = 1e-6
        self.max_timesteps = 100000

        # Replay buffer
        if self.prioritized_replay:
            self.memory = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.prioritized_replay_alpha)
            if self.prioritized_replay_beta_iters is None:
                self.prioritized_replay_beta_iters = self.max_timesteps
            self.beta_schedule = LinearSchedule(
                self.prioritized_replay_beta_iters,
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters (the paper uses 0.001)

        # Diagnostics collected during learning
        self.td_errors_list = []
        self.actor_loss_list = []
        self.critic_loss_list = []

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            if self.prioritized_replay:
                samples = self.memory.sample(
                    self.batch_size,
                    beta=self.beta_schedule.value(len(self.memory)))
                (obses_t, actions, rewards, obses_tp1, dones,
                 weights, batch_idxes) = samples
                # Repackage the sampled arrays as per-transition experience tuples
                PrioritizedExperience = namedtuple(
                    "PrioritizedExperience",
                    field_names=["state", "action", "reward", "next_state",
                                 "done", "weight", "batch_idx"])
                experiences = []
                for i in range(len(obses_t)):
                    experiences.append(PrioritizedExperience(
                        obses_t[i:i + 1], actions[i:i + 1], rewards[i:i + 1],
                        obses_tp1[i:i + 1], dones[i:i + 1], weights[i:i + 1],
                        batch_idxes[i:i + 1]))
                self.learn(experiences)
            else:
                experiences = self.memory.sample()
                self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from the target models:
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train the local critic model
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        critic_loss = self.critic_local.model.train_on_batch(
            x=[states, actions], y=Q_targets)

        # Train the local actor model using the critic's action gradients
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        actor_loss = self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        if self.prioritized_replay:
            # Update replay buffer priorities
            # (the Q targets are used here as a proxy for the TD errors)
            batch_idxes = np.vstack(
                [e.batch_idx[0] for e in experiences if e is not None])
            new_priorities = np.abs(Q_targets) + self.prioritized_replay_eps
            self.memory.update_priorities(batch_idxes, new_priorities)

        self.td_errors_list.append(Q_targets.T)
        self.actor_loss_list.append(actor_loss[0])
        self.critic_loss_list.append(critic_loss)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = [self.tau * local + (1 - self.tau) * target
                       for local, target in zip(local_weights, target_weights)]
        target_model.set_weights(new_weights)

    def save_weights(self):
        self.actor_local.model.save_weights("DDPG_actor_weights.h5")
        self.critic_local.model.save_weights("DDPG_critic_weights.h5")

    def save_td_errors(self, i_episode):
        with open("DDPG_agent_td_errors_episode_{}.csv".format(i_episode),
                  'w') as csvfile:
            writer = csv.writer(csvfile)
            for td_errors in self.td_errors_list:
                writer.writerow([td_errors])
        self.td_errors_list.clear()

    def save_losses(self, i_episode):
        with open("DDPG_agent_actor_critic_loss_episode_{}.csv".format(i_episode),
                  'w') as csvfile:
            writer = csv.writer(csvfile)
            for actor_loss, critic_loss in zip(self.actor_loss_list,
                                               self.critic_loss_list):
                writer.writerow([actor_loss, critic_loss])
        self.actor_loss_list.clear()
        self.critic_loss_list.clear()
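# The prioritized-replay variant above also relies on a LinearSchedule to anneal
# the importance-sampling exponent beta from beta0 towards 1.0. Below is a
# minimal sketch consistent with how the schedule is constructed and called
# above (positional schedule length, initial_p/final_p keywords, and .value(t));
# the actual helper used by the project may differ.
class LinearSchedule:
    """Linearly interpolates from initial_p to final_p over schedule_timesteps steps."""

    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.0):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        """Value of the schedule at time step t (held at final_p after schedule_timesteps)."""
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)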
class Agent():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Actor (Policy) Model
        self.actor = Actor(self.state_size, self.action_size,
                           self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic.model.get_weights())
        self.actor_target.model.set_weights(self.actor.model.get_weights())

        # Noise process (parameters map to the OU process mu, theta, sigma)
        self.noise_mean = 0.5
        self.noise_decay = 0.2
        self.noise_variance = 4
        self.noise = OUNoise(self.action_size, self.noise_mean,
                             self.noise_decay, self.noise_variance)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 16
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.1     # for soft update of target parameters

        # Score tracking
        self.best_score = -np.inf
        self.num_steps = 0

    def reset_episode(self):
        if self.get_score() > self.best_score:
            self.best_score = self.get_score()
        self.noise.reset()
        self.last_state = self.task.reset()
        self.total_reward = 0.0
        self.num_steps = 0
        return self.last_state

    def get_score(self):
        """Average reward per step for the current episode."""
        return -np.inf if self.num_steps == 0 else self.total_reward / self.num_steps

    def step(self, action, reward, next_state, done):
        self.total_reward += reward
        self.num_steps += 1

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from the target models:
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train the local critic model
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train the local actor model using the critic's action gradients
        action_gradients = np.reshape(
            self.critic.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic.model, self.critic_target.model)
        self.soft_update(self.actor.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = [self.tau * local + (1 - self.tau) * target
                       for local, target in zip(local_weights, target_weights)]
        target_model.set_weights(new_weights)
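# All three agents expose the same episode API (reset_episode / act / step). The
# sketch below shows how a training loop could drive them; it assumes the Task
# class provides a step(action) method returning (next_state, reward, done), and
# num_episodes is an arbitrary illustrative value, not a setting from the project.
def train(agent, task, num_episodes=1000):
    for i_episode in range(1, num_episodes + 1):
        state = agent.reset_episode()
        done = False
        while not done:
            action = agent.act(state)                      # policy action plus OU noise
            next_state, reward, done = task.step(action)   # environment transition (assumed signature)
            agent.step(action, reward, next_state, done)   # store experience and learn
            state = next_state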