import gym
import torch
import torch.nn as nn
import torch.multiprocessing as mp
from torch.optim import Adam

# Project-local helpers assumed to be importable from the repository:
# EnvironmentWrapper, ActorCritic, get_action_space, get_actions, Storage.


class Worker(mp.Process):
    def __init__(self, process_num, global_model, params):
        super().__init__()
        self.process_num = process_num
        self.global_model = global_model
        self.params = params
        env = gym.make('CarRacing-v0')
        self.environment = EnvironmentWrapper(env, self.params.stack_size)
        # Local copy of the network; gradients are pushed to the global model.
        self.model = ActorCritic(self.params.stack_size, get_action_space())
        self.optimizer = Adam(self.global_model.parameters(), lr=self.params.lr)
        self.storage = Storage(self.params.steps_per_update)
        self.current_observation = torch.zeros(
            1, *self.environment.get_state_shape())

    def run(self):
        num_of_updates = self.params.num_of_steps / self.params.steps_per_update
        self.current_observation = torch.Tensor([self.environment.reset()])

        for update in range(int(num_of_updates)):
            self.storage.reset_storage()
            # Synchronize the local model with the global model.
            self.model.load_state_dict(self.global_model.state_dict())

            # Collect a short rollout.
            for step in range(self.params.steps_per_update):
                probs, log_probs, value = self.model(self.current_observation)
                action = get_actions(probs)[0]
                action_log_prob, entropy = self.compute_action_log_and_entropy(
                    probs, log_probs)

                state, reward, done = self.environment.step(action)
                if done:
                    state = self.environment.reset()
                done = torch.Tensor([done])
                self.current_observation = torch.Tensor([state])
                self.storage.add(step, value, reward, action_log_prob,
                                 entropy, done)

            # Bootstrap from the value of the last observation.
            _, _, last_value = self.model(self.current_observation)
            expected_reward = self.storage.compute_expected_reward(
                last_value, self.params.discount_factor)
            advantages = torch.tensor(expected_reward) - self.storage.values
            value_loss = advantages.pow(2).mean()

            if self.params.use_gae:
                gae = self.storage.compute_gae(last_value,
                                               self.params.discount_factor,
                                               self.params.gae_coef)
                policy_loss = -(torch.tensor(gae) *
                                self.storage.action_log_probs).mean()
            else:
                policy_loss = -(advantages *
                                self.storage.action_log_probs).mean()

            self.optimizer.zero_grad()
            loss = policy_loss \
                - self.params.entropy_coef * self.storage.entropies.mean() \
                + self.params.value_loss_coef * value_loss
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(),
                                     self.params.max_norm)
            # Copy local gradients onto the shared global parameters.
            self._share_gradients()
            self.optimizer.step()

            if update % 20 == 0:
                print('Process: {}. Update: {}. Loss: {}'.format(
                    self.process_num, update, loss.item()))

    def compute_action_log_and_entropy(self, probs, log_probs):
        _, indices = probs.max(1)
        indices = indices.view(-1, 1)
        action_log_prob = log_probs.gather(1, indices)
        entropy = -(log_probs * probs).sum(-1)
        return action_log_prob, entropy

    def _share_gradients(self):
        for local_param, global_param in zip(self.model.parameters(),
                                             self.global_model.parameters()):
            global_param._grad = local_param.grad
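For context, the worker above assumes a global ActorCritic whose parameters live in shared memory and a params object carrying the hyperparameters it reads. A minimal launch sketch under those assumptions follows; the params fields and their values are illustrative, not taken from the repository.

# Hypothetical entry point: spawn one A3C worker per CPU core against a shared model.
from types import SimpleNamespace

import torch.multiprocessing as mp

if __name__ == '__main__':
    params = SimpleNamespace(
        stack_size=4, lr=1e-4, steps_per_update=5, num_of_steps=2_000_000,
        discount_factor=0.99, use_gae=True, gae_coef=0.95,
        entropy_coef=0.01, value_loss_coef=0.5, max_norm=0.5)  # values are assumptions

    global_model = ActorCritic(params.stack_size, get_action_space())
    global_model.share_memory()  # workers' optimizers update these shared tensors

    workers = [Worker(process_num, global_model, params)
               for process_num in range(mp.cpu_count())]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()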
import multiprocessing

import torch
import torch.nn as nn
from torch.optim import Adam

# Project-local helpers assumed to be importable from the repository:
# ParallelEnvironments, ActorCritic, get_action_space, get_actions, Storage.


class A2CTrainer:
    def __init__(self, params, model_path):
        self.params = params
        self.model_path = model_path
        self.num_of_processes = multiprocessing.cpu_count()
        self.parallel_environments = ParallelEnvironments(
            self.params.stack_size, number_of_processes=self.num_of_processes)
        self.actor_critic = ActorCritic(self.params.stack_size,
                                        get_action_space())
        self.optimizer = Adam(self.actor_critic.parameters(),
                              lr=self.params.lr)
        self.storage = Storage(self.params.steps_per_update,
                               self.num_of_processes)
        self.current_observations = torch.zeros(
            self.num_of_processes,
            *self.parallel_environments.get_state_shape())

    def run(self):
        # Number of updates per environment.
        num_of_updates = self.params.num_of_steps / self.params.steps_per_update
        self.current_observations = self.parallel_environments.reset()

        for update in range(int(num_of_updates)):
            self.storage.reset_storage()

            # Collect a rollout from all environments in lockstep.
            for step in range(self.params.steps_per_update):
                probs, log_probs, value = self.actor_critic(
                    self.current_observations)
                actions = get_actions(probs)
                action_log_probs, entropies = self.compute_action_logs_and_entropies(
                    probs, log_probs)

                states, rewards, dones = self.parallel_environments.step(
                    actions)
                rewards = rewards.view(-1, 1)
                dones = dones.view(-1, 1)

                self.current_observations = states
                self.storage.add(step, value, rewards, action_log_probs,
                                 entropies, dones)

            # Bootstrap from the values of the last observations.
            _, _, last_values = self.actor_critic(self.current_observations)
            expected_rewards = self.storage.compute_expected_rewards(
                last_values, self.params.discount_factor)
            advantages = torch.tensor(expected_rewards) - self.storage.values
            value_loss = advantages.pow(2).mean()
            policy_loss = -(advantages *
                            self.storage.action_log_probs).mean()

            self.optimizer.zero_grad()
            loss = policy_loss \
                - self.params.entropy_coef * self.storage.entropies.mean() \
                + self.params.value_loss_coef * value_loss
            loss.backward(retain_graph=True)
            nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                     self.params.max_norm)
            self.optimizer.step()

            if update % 300 == 0:
                torch.save(self.actor_critic.state_dict(), self.model_path)
            if update % 100 == 0:
                print('Update: {}. Loss: {}'.format(update, loss.item()))

    def compute_action_logs_and_entropies(self, probs, log_probs):
        _, indices = probs.max(1)
        indices = indices.view(-1, 1)
        action_log_probs = log_probs.gather(1, indices)
        entropies = -(log_probs * probs).sum(-1)
        return action_log_probs, entropies
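A2CTrainer is the synchronous counterpart: a single process steps all environments in lockstep and applies one optimizer update per rollout. A hypothetical way to start it, with the same caveat that the params fields and the model path are assumptions inferred from what the trainer reads:

# Hypothetical entry point for the synchronous A2C trainer; values are illustrative.
from types import SimpleNamespace

if __name__ == '__main__':
    params = SimpleNamespace(
        stack_size=4, lr=1e-4, steps_per_update=5, num_of_steps=2_000_000,
        discount_factor=0.99, entropy_coef=0.01, value_loss_coef=0.5,
        max_norm=0.5)

    trainer = A2CTrainer(params, model_path='models/a2c.pt')  # path is an assumption
    trainer.run()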
# Same imports as the original worker above, plus:
import numpy as np


class Worker(mp.Process):
    def __init__(self, process_num, global_model, params, autosave=False):  # CHANGE
        super().__init__()
        self.process_num = process_num
        self.global_model = global_model
        self.params = params
        env = gym.make('CarRacing-v0')
        self.environment = EnvironmentWrapper(env, self.params.stack_size)
        self.model = ActorCritic(self.params.stack_size, get_action_space())
        self.optimizer = Adam(self.global_model.parameters(), lr=self.params.lr)
        self.storage = Storage(self.params.steps_per_update)
        self.current_observation = torch.zeros(
            1, *self.environment.get_state_shape())
        # NEW: learning-rate schedule and logging state.
        self.lr = self.params.lr
        self.autosave = autosave
        self.log_loss = []
        self.log_tmp = np.array([])
        self.log_reward = np.array([])

    def run(self):
        num_of_updates = self.params.num_of_steps / self.params.steps_per_update
        self.current_observation = torch.Tensor([self.environment.reset()])
        reward_episode = 0  # NEW

        for update in range(int(num_of_updates)):
            self.storage.reset_storage()
            # Synchronize the local model with the global model.
            self.model.load_state_dict(self.global_model.state_dict())

            for step in range(self.params.steps_per_update):
                probs, log_probs, value = self.model(self.current_observation)
                action = get_actions(probs)[0]
                action_log_prob, entropy = self.compute_action_log_and_entropy(
                    probs, log_probs)

                state, reward, done = self.environment.step(action)
                reward_episode += reward  # NEW
                if done:
                    # NEW: log the finished episode's score.
                    self.log_reward = np.append(self.log_reward, reward_episode)
                    print('Process: {}. Episode {} score: {}.'.format(
                        self.process_num, len(self.log_reward) - 1,
                        self.log_reward[-1]))
                    reward_episode = 0
                    state = self.environment.reset()
                done = torch.Tensor([done])
                self.current_observation = torch.Tensor([state])
                self.storage.add(step, value, reward, action_log_prob,
                                 entropy, done)

            _, _, last_value = self.model(self.current_observation)
            expected_reward = self.storage.compute_expected_reward(
                last_value, self.params.discount_factor)
            advantages = torch.tensor(expected_reward) - self.storage.values
            value_loss = advantages.pow(2).mean()

            if self.params.use_gae:
                gae = self.storage.compute_gae(last_value,
                                               self.params.discount_factor,
                                               self.params.gae_coef)
                policy_loss = -(torch.tensor(gae) *
                                self.storage.action_log_probs).mean()
            else:
                policy_loss = -(advantages *
                                self.storage.action_log_probs).mean()

            self.optimizer.zero_grad()
            loss = policy_loss \
                - self.params.entropy_coef * self.storage.entropies.mean() \
                + self.params.value_loss_coef * value_loss
            if self.autosave:  # NEW: keep per-update losses for averaging.
                self.log_tmp = np.append(self.log_tmp, loss.detach().numpy())
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(),
                                     self.params.max_norm)
            self._share_gradients()
            self.optimizer.step()

            # NEW: decay the learning rate 50 times over the course of training.
            if update % int(num_of_updates / 50) == 0:
                self.lr *= 0.85
                self.optimizer = Adam(self.global_model.parameters(), lr=self.lr)
                print('Process: {}. Learning rate: {}.'.format(
                    self.process_num, self.lr))

            if update % 60 == 0:
                print('Process: {}. Update: {}. Loss: {}'.format(
                    self.process_num, update, loss.item()))
                # NEW: periodically checkpoint the global model and log the mean loss.
                if self.autosave:
                    torch.save(self.global_model.state_dict(),
                               'models/a3c{}.pt'.format(update))
                    self.log_loss.append(np.mean(self.log_tmp))
                    self.log_tmp = np.array([])
                    print('Mean loss over the last 60 updates: {}'.format(
                        self.log_loss[-1]))

    def compute_action_log_and_entropy(self, probs, log_probs):
        _, indices = probs.max(1)
        indices = indices.view(-1, 1)
        action_log_prob = log_probs.gather(1, indices)
        entropy = -(log_probs * probs).sum(-1)
        return action_log_prob, entropy

    def _share_gradients(self):
        for local_param, global_param in zip(self.model.parameters(),
                                             self.global_model.parameters()):
            global_param._grad = local_param.grad
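All three classes delegate return and advantage computation to a Storage rollout buffer that is not reproduced here. The sketch below is one plausible implementation consistent with the calls made above (the constructor arities, add, compute_expected_reward / compute_expected_rewards, and compute_gae); the tensor shapes and the GAE recursion are assumptions about the repository's code, not a copy of it.

# Assumed shape of the Storage rollout buffer used by the worker and trainer above.
import torch

class Storage:
    def __init__(self, steps_per_update, num_of_processes=1):
        self.steps = steps_per_update
        self.num_of_processes = num_of_processes
        self.reset_storage()

    def reset_storage(self):
        n, p = self.steps, self.num_of_processes
        self.values = torch.zeros(n, p, 1)
        self.rewards = torch.zeros(n, p, 1)
        self.action_log_probs = torch.zeros(n, p, 1)
        self.entropies = torch.zeros(n, p)
        self.dones = torch.zeros(n, p, 1)

    def add(self, step, value, reward, action_log_prob, entropy, done):
        self.values[step] = value
        self.rewards[step] = torch.as_tensor(reward, dtype=torch.float32).view(-1, 1)
        self.action_log_probs[step] = action_log_prob
        self.entropies[step] = entropy
        self.dones[step] = done

    def compute_expected_reward(self, last_value, discount_factor):
        # Bootstrapped n-step returns: R_t = r_t + gamma * (1 - done_t) * R_{t+1}.
        expected = torch.zeros(self.steps + 1, self.num_of_processes, 1)
        expected[-1] = last_value
        for step in reversed(range(self.steps)):
            expected[step] = self.rewards[step] + \
                discount_factor * expected[step + 1] * (1 - self.dones[step])
        return expected[:-1]

    # The A2C trainer calls the pluralized name; assume it is the same computation.
    compute_expected_rewards = compute_expected_reward

    def compute_gae(self, last_value, discount_factor, gae_coef):
        # Generalized Advantage Estimation: running sum of discounted TD errors.
        gae = torch.zeros(self.steps, self.num_of_processes, 1)
        next_value, running_gae = last_value, 0.0
        for step in reversed(range(self.steps)):
            mask = 1 - self.dones[step]
            delta = self.rewards[step] \
                + discount_factor * next_value * mask - self.values[step]
            running_gae = delta + discount_factor * gae_coef * running_gae * mask
            gae[step] = running_gae
            next_value = self.values[step]
        return gae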