import numpy as np

# Actor, Critic, OUNoise and ReplayBuffer are assumed to be defined in (or
# imported from) the project's accompanying modules.


class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.name = "DDPG"
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 'actor_local')
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  'actor_target')

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   'critic_local')
        self.critic_target = Critic(self.state_size, self.action_size,
                                    'critic_target')

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001   # for soft update of target parameters

        # Reward counter
        self.total_reward = 0
        self.n_steps = 0

    def load(self):
        self.actor_local.load()
        self.actor_target.load()
        self.critic_local.load()
        self.critic_target.load()
        print("Agent's weights loaded from disk.")

    def save(self):
        self.actor_local.save()
        self.actor_target.save()
        self.critic_local.save()
        self.critic_target.save()
        print("Agent's weights saved to disk.")

    def reset_episode(self):
        self.total_reward = 0
        self.n_steps = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Add reward to total
        self.total_reward += reward
        self.n_steps += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, add_noise=True):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # Hack, rescale rotor revs to +-5 range from average
        # rev_mean = np.mean(action)
        # action = (action - 450) / 450
        # action *= 50
        # action += rev_mean
        if add_noise:
            action += self.noise.sample()  # additive noise for exploration
        return list(action)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters: target = tau*local + (1 - tau)*target."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
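# --- Usage sketch (assumption, not part of the original code) ---
# A minimal driver loop for the DDPG agent above. The `task` interface is
# assumed from how the agent uses it: it exposes state_size, action_size,
# action_low and action_high, plus reset() and step(action) returning
# (next_state, reward, done). The function name and episode count are
# illustrative only.
def train_ddpg_on_task(task, num_episodes=1000):
    agent = DDPG(task)
    for i_episode in range(1, num_episodes + 1):
        state = agent.reset_episode()  # also resets the OU noise process
        done = False
        while not done:
            action = agent.act(state)                     # policy action + exploration noise
            next_state, reward, done = task.step(action)  # assumed task interface
            agent.step(action, reward, next_state, done)  # store experience and learn
            state = next_state
        print("Episode {:4d}, total reward: {:7.3f}".format(
            i_episode, agent.total_reward))
    agent.save()  # persist actor/critic weights
    return agent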
import numpy as np
import torch

# Actor and Critic are assumed to be defined in (or imported from) the
# accompanying PPO actor/critic modules.


class Agent:

    def __init__(self, env, gamma, gae_lambda, batch_size, lr_rate,
                 ratio_clipping, epochs):
        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_bound = env.action_space.high[0]

        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.batch_size = batch_size
        self.epochs = epochs

        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound,
                           lr_rate[0], ratio_clipping)
        self.critic = Critic(self.state_dim, lr_rate[1])

        self.save_epi_reward = []

    def gae_target(self, rewards, v_values, next_v_value, done):
        """Compute GAE advantages and the corresponding n-step TD targets."""
        n_step_targets = torch.zeros_like(rewards)
        gae = torch.zeros_like(rewards)
        gae_cumulative = 0.
        forward_val = 0.

        if not done:
            forward_val = next_v_value

        for k in reversed(range(0, len(rewards))):
            delta = rewards[k] + self.gamma * forward_val - v_values[k]
            gae_cumulative = self.gamma * self.gae_lambda * gae_cumulative + delta
            gae[k] = gae_cumulative
            forward_val = v_values[k]
            n_step_targets[k] = gae[k] + v_values[k]
        return gae, n_step_targets

    def unpack_batch(self, batch):
        """Concatenate a list of (1, dim) tensors into a single (N, dim) tensor."""
        unpack = []
        for idx in range(len(batch)):
            unpack.append(batch[idx])
        unpack = torch.cat(unpack, axis=0)
        return unpack

    def train(self, max_episode_num, save_path, save_names):
        batch_state, batch_action, batch_reward = [], [], []
        batch_log_old_policy_pdf = []

        for episode in range(max_episode_num):
            time, episode_reward, done = 0, 0, False
            state = self.env.reset()
            state = torch.from_numpy(state).type(torch.FloatTensor)

            while not done:
                # env.render()
                mu_old, std_old, action = self.actor.get_policy_action(state)
                action = np.array([action.item()])
                mu_old = np.array([mu_old.item()])
                std_old = np.array([std_old.item()])
                action = np.clip(action, -self.action_bound, self.action_bound)

                # Log-probability of the action under the old (behavior) policy
                var_old = std_old**2
                log_old_policy_pdf = -0.5 * (
                    action - mu_old)**2 / var_old - 0.5 * np.log(
                        var_old * 2 * np.pi)
                log_old_policy_pdf = np.sum(log_old_policy_pdf)

                next_state, reward, done, _ = self.env.step(action)
                next_state = torch.from_numpy(next_state).type(
                    torch.FloatTensor)
                action = torch.from_numpy(action).type(torch.FloatTensor)
                reward = torch.FloatTensor([reward])
                log_old_policy_pdf = torch.FloatTensor([log_old_policy_pdf])

                state = state.view(1, self.state_dim)
                next_state = next_state.view(1, self.state_dim)
                action = action.view(1, self.action_dim)
                reward = reward.view(1, 1)
                log_old_policy_pdf = log_old_policy_pdf.view(1, 1)

                batch_state.append(state)
                batch_action.append(action)
                batch_reward.append((reward + 8) / 8)  # shift/scale reward for training stability
                batch_log_old_policy_pdf.append(log_old_policy_pdf)

                # Keep collecting transitions until a full batch is available
                if len(batch_state) < self.batch_size:
                    state = next_state[0]
                    episode_reward += reward[0]
                    time += 1
                    continue

                states = self.unpack_batch(batch_state)
                actions = self.unpack_batch(batch_action)
                rewards = self.unpack_batch(batch_reward)
                log_old_policy_pdfs = self.unpack_batch(
                    batch_log_old_policy_pdf)

                batch_state, batch_action, batch_reward = [], [], []
                batch_log_old_policy_pdf = []

                # GAE advantages and TD targets from the current critic
                v_values = self.critic.get_value(states)
                next_v_value = self.critic.get_value(next_state)
                gaes, y_i = self.gae_target(rewards, v_values, next_v_value,
                                            done)

                # Several epochs of clipped-ratio PPO updates on the batch
                for _ in range(self.epochs):
                    self.actor.update(states, actions, gaes,
                                      log_old_policy_pdfs)
                    self.critic.update(states, y_i)

                state = next_state[0]
                episode_reward += reward[0]
                time += 1

            self.save_epi_reward.append(episode_reward.item())
            if len(self.save_epi_reward) < 20:
                print('Episode:', episode + 1, 'Time:', time,
                      'Reward (avg of recent 20):',
                      np.mean(self.save_epi_reward))
            else:
                print('Episode:', episode + 1, 'Time:', time,
                      'Reward (avg of recent 20):',
                      np.mean(self.save_epi_reward[-20:]))

            if episode % 10 == 0:
                self.actor.save(save_path, save_names[0])
                self.critic.save(save_path, save_names[1])
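# --- Usage sketch (assumption, not part of the original code) ---
# Wiring the PPO Agent above to a Gym-style continuous-control environment
# (anything exposing observation_space, action_space.shape and
# action_space.high). The function name, hyperparameter values and file
# names below are illustrative only.
def train_ppo_agent(env, max_episode_num=1000,
                    save_path='./save_weights',
                    save_names=('ppo_actor.pt', 'ppo_critic.pt')):
    agent = Agent(env,
                  gamma=0.95,
                  gae_lambda=0.9,
                  batch_size=32,
                  lr_rate=(1e-4, 1e-3),  # (actor lr, critic lr)
                  ratio_clipping=0.05,
                  epochs=5)
    agent.train(max_episode_num, save_path, save_names)
    return agent.save_epi_reward  # per-episode rewards for plotting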
import numpy as np
import torch

# Actor, Critic and ReplayBuffer are assumed to be defined in (or imported
# from) the accompanying DDPG modules.


class Agent:

    def __init__(self, env, gamma, batch_size, buffer_size, lr_rate, tau):
        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_bound = env.action_space.high[0]

        self.gamma = gamma
        self.batch_size = batch_size
        self.buffer_size = buffer_size

        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound,
                           lr_rate[0], tau)
        self.critic = Critic(self.state_dim, self.action_dim, lr_rate[1], tau)

        self.buffer = ReplayBuffer(self.buffer_size)

        self.save_epi_reward = []

    def ou_noise(self, x, rho=0.15, mu=0., dt=1e-1, sigma=0.2, dim=1):
        """One step of an Ornstein-Uhlenbeck process for exploration noise."""
        rho = torch.FloatTensor([rho])
        mu = torch.FloatTensor([mu])
        dt = torch.FloatTensor([dt])
        return x + rho * (mu - x) * dt + torch.sqrt(dt) * torch.normal(
            0., sigma, size=(dim, ))

    def td_target(self, rewards, q_values, dones):
        """Compute TD targets: y_k = r_k for terminal steps, else r_k + gamma * Q'."""
        y_k = torch.zeros(q_values.shape)
        for i in range(q_values.shape[0]):
            if dones[i]:
                y_k[i] = rewards[i]
            else:
                y_k[i] = rewards[i] + self.gamma * q_values[i]
        return y_k

    def train(self, max_episode_num, save_path, save_names):
        # Start with target networks synchronized to the local networks
        self.actor.update_target_network()
        self.critic.update_target_network()

        for episode in range(max_episode_num):
            time, episode_reward, done = 0, 0, False
            state = self.env.reset()
            state = torch.from_numpy(state).type(torch.FloatTensor)
            pre_noise = torch.zeros(self.action_dim)

            while not done:
                # env.render()
                action = self.actor.predict(state)[0]
                noise = self.ou_noise(pre_noise, dim=self.action_dim)

                # Add exploration noise to the deterministic policy action
                # (the original listing computed the noise but never applied it)
                action = np.array([action.item()]) + noise.numpy()
                action = np.clip(action, -self.action_bound, self.action_bound)

                next_state, reward, done, _ = self.env.step(action)
                next_state = torch.from_numpy(next_state).type(
                    torch.FloatTensor)
                action = torch.from_numpy(action).type(torch.FloatTensor)
                reward = torch.FloatTensor([reward])
                train_reward = (reward + 8) / 8  # shift/scale reward for training stability

                state = state.view(1, self.state_dim)
                next_state = next_state.view(1, self.state_dim)
                action = action.view(1, self.action_dim)
                reward = reward.view(1, 1)
                train_reward = train_reward.view(1, 1)  # keep the scaled reward (was overwritten by the raw reward)

                self.buffer.add_buffer(state, action, train_reward,
                                       next_state, done)

                if self.buffer.buffer_size > 1000:
                    # Sample a minibatch and compute TD targets from the target networks
                    states, actions, rewards, next_states, dones = self.buffer.sample_batch(
                        self.batch_size)

                    actions_ = self.actor.target_predict(next_states)
                    actions_ = actions_.view(next_states.shape[0],
                                             self.action_dim)
                    target_qs = self.critic.target_predict(
                        next_states, actions_)
                    y_i = self.td_target(rewards, target_qs, dones)

                    # Update the critic, then the actor via the critic's Q estimate
                    self.critic.train(states, actions, y_i)

                    s_actions = self.actor.predict(states)
                    policy_loss = self.critic.predict(states, s_actions)
                    self.actor.train(policy_loss)

                    # Soft-update the target networks
                    self.actor.update_target_network()
                    self.critic.update_target_network()

                pre_noise = noise
                state = next_state[0]
                episode_reward += reward[0]
                time += 1

            self.save_epi_reward.append(episode_reward.item())
            if len(self.save_epi_reward) < 20:
                print('Episode:', episode + 1, 'Time:', time,
                      'Reward (avg of recent 20):',
                      np.mean(self.save_epi_reward))
            else:
                print('Episode:', episode + 1, 'Time:', time,
                      'Reward (avg of recent 20):',
                      np.mean(self.save_epi_reward[-20:]))

            if episode % 10 == 0:
                self.actor.save(save_path, save_names[0])
                self.critic.save(save_path, save_names[1])
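# --- Usage sketch (assumption, not part of the original code) ---
# Driving the DDPG Agent above with a Gym-style continuous-control
# environment. The function name, hyperparameter values and file names
# below are illustrative only.
def train_ddpg_agent(env, max_episode_num=200,
                     save_path='./save_weights',
                     save_names=('ddpg_actor.pt', 'ddpg_critic.pt')):
    agent = Agent(env,
                  gamma=0.95,
                  batch_size=32,
                  buffer_size=20000,
                  lr_rate=(1e-4, 1e-3),  # (actor lr, critic lr)
                  tau=0.001)
    agent.train(max_episode_num, save_path, save_names)
    return agent.save_epi_reward  # per-episode rewards for plotting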