import time

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Actor, Critic, RolloutCollector, RolloutStorage, and get_env_info are
# project-specific helpers assumed to be defined or imported elsewhere.


class A2C:
    def __init__(self,
                 env_name="BipedalWalker-v2",
                 num_steps=5,
                 num_workers=10,
                 num_updates=10000,
                 log_frequency=10,
                 use_gae=True,
                 gamma=0.99,
                 tau=0.95,
                 entropy_coef=0.01):
        observation_space, action_space = get_env_info(env_name)
        self.num_steps = num_steps
        self.num_updates = num_updates
        self.log_frequency = log_frequency
        self.use_gae = use_gae
        self.gamma = gamma
        self.tau = tau
        self.entropy_coef = entropy_coef
        self.max_grad_norm = 0.5

        self.simulator = RolloutCollector(env_name, num_workers)
        self.eval_env = gym.make(env_name)
        self.obs_dim, self.action_dim = observation_space.shape[0], action_space.shape[0]
        self.storage = RolloutStorage(num_steps, num_workers,
                                      observation_space.shape, action_space)

        self.policy = Actor(self.obs_dim, self.action_dim)
        self.V = Critic(self.obs_dim)
        self.actor_optimizer = optim.Adam(self.policy.parameters(), lr=5e-4)
        self.critic_optimizer = optim.Adam(self.V.parameters(), lr=5e-4)

        # track statistics
        self.episode_count = 0

    def get_actions(self, obs_n):
        with torch.no_grad():
            obs_batch = torch.FloatTensor(np.stack(obs_n))
            dist = self.policy(obs_batch)
            action_sample = dist.sample()
            values = self.V(obs_batch)
            action_n = [action_sample[i].numpy() for i in range(len(action_sample))]
        return action_n, action_sample, values

    def update_storage(self, obs, actions, rewards, values, dones):
        self.episode_count += torch.sum(dones).item()
        masks = 1 - dones
        self.storage.insert(obs, actions, values, rewards, masks)

    def set_initial_observations(self, observations):
        self.storage.obs[0].copy_(observations)

    def compute_advantages(self):
        advantages = self.storage.returns[:-1] - self.storage.values[:-1]
        # standardize the advantages
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
        return advantages

    def update(self):
        with torch.no_grad():
            next_value = self.V(self.storage.obs[-1])

        self.storage.compute_returns(next_value, self.use_gae, self.gamma, self.tau)
        self.storage.returns.mul_(0.1)
        advantages = self.compute_advantages()
        obs_batch, actions_batch, values_batch, return_batch, adv_targ = \
            self.storage.build_batch(advantages)

        # Update the policy
        self.actor_optimizer.zero_grad()
        action_dist = self.policy(obs_batch)
        action_log_probs = action_dist.log_prob(actions_batch)
        objective = torch.mean(adv_targ * action_log_probs)
        policy_loss = -objective

        # compute the value loss
        self.critic_optimizer.zero_grad()
        value_loss = F.mse_loss(self.V(obs_batch), return_batch)

        # compute other losses
        entropy_loss = -torch.mean(action_dist.entropy())

        # sum the losses, backprop, and step
        net_loss = policy_loss + value_loss + self.entropy_coef * entropy_loss
        net_loss.backward()
        nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
        nn.utils.clip_grad_norm_(self.V.parameters(), self.max_grad_norm)
        self.critic_optimizer.step()
        self.actor_optimizer.step()

        return (value_loss.detach().item(),
                -policy_loss.detach().item(),
                -entropy_loss.detach().item())

    def evaluate(self, n=20, render=False):
        env = self.eval_env
        action_bounds = [env.action_space.low, env.action_space.high]
        all_rewards = []
        for i in range(n):
            episode_rewards = []
            state = env.reset()
            terminal = False
            while not terminal:
                dist = self.policy(torch.FloatTensor(state).view(1, -1))
                action = dist.sample().numpy().reshape(-1)
                action = np.clip(action, action_bounds[0], action_bounds[1])
                next_state, reward, terminal, info = env.step(action)
                episode_rewards.append(reward)
                state = next_state
                if render:
                    fps = 8.0
                    env.render()
                    time.sleep(1 / fps)
            all_rewards.append(np.sum(episode_rewards))
        all_rewards = np.array(all_rewards)
        env.reset()
        return all_rewards

    def __iter__(self):
        obs_n = self.simulator.reset()
        for u in range(self.num_updates):
            self.set_initial_observations(torch.FloatTensor(np.stack(obs_n)))

            for t in range(self.num_steps):
                # Compute actions using policy given latest observation
                action_n, actions, values = self.get_actions(obs_n)

                # Give action to each worker and take an environment step
                obs_n, reward_n, done_n = self.simulator.step(action_n)
                observations = torch.FloatTensor(np.stack(obs_n))
                rewards = torch.FloatTensor(np.vstack(reward_n))
                dones = torch.FloatTensor(np.vstack(done_n))

                # Update the storage
                self.update_storage(observations, actions, rewards, values, dones)

            value_loss, objective, mean_policy_entropy = self.update()
            self.storage.after_update()

            if (u + 1) % self.log_frequency == 0:
                eval_episode_returns = self.evaluate()
                yield (self.episode_count, eval_episode_returns,
                       value_loss, objective, mean_policy_entropy)
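# The class above exposes training as a generator: __iter__ runs num_updates
# updates and yields (episode_count, eval_returns, value_loss, objective,
# entropy) every log_frequency updates. A minimal driver sketch (an assumption
# about how the class is meant to be consumed; the print format is illustrative,
# not part of the original code):
agent = A2C(env_name="BipedalWalker-v2", num_steps=5, num_workers=10,
            num_updates=10000, log_frequency=10)
for episode_count, eval_returns, value_loss, objective, entropy in agent:
    print("episodes {:6d} | eval return {:7.1f} +/- {:6.1f} | "
          "value loss {:8.3f} | objective {:8.4f} | entropy {:6.3f}".format(
              int(episode_count), eval_returns.mean(), eval_returns.std(),
              value_loss, objective, entropy))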
def main():
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor "
          "(see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    # args (and num_updates) are assumed to come from the surrounding
    # argument-parsing code, which is not shown in this excerpt.
    envs = [
        make_env(args.env_name,
                 seed=args.seed,
                 digit=args.digit,
                 rank=i,
                 log_dir=args.log_dir,
                 use_patience=args.use_patience)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    print(obs_shape)

    actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                                  eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    episode_lengths = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            episode_lengths += torch.ones(episode_lengths.size())
            episode_lengths *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic.get_value(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True)).data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()
            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                        return_batch, masks_batch, old_action_log_probs_batch, \
                        adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch),
                        Variable(states_batch),
                        Variable(masks_batch),
                        Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    # PPO's pessimistic surrogate (L^CLIP)
                    action_loss = -torch.min(surr1, surr2).mean()

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                          hasattr(envs, 'ob_rms') and envs.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, "
                  "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, "
                  "entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}, "
                  "Episode lengths {:.2f}".format(
                      j, total_num_steps,
                      int(total_num_steps / (end - start)),
                      final_rewards.mean(), final_rewards.median(),
                      final_rewards.min(), final_rewards.max(),
                      dist_entropy.data[0], value_loss.data[0],
                      action_loss.data[0], episode_lengths.mean()))

        if j > 0 and j % args.vis_interval == 0:
            pass
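# main() relies on rollouts.compute_returns(next_value, use_gae, gamma, tau), but the
# storage code is not shown in this excerpt. Below is a sketch of how such a routine
# is commonly implemented (an assumption expressed as a standalone function, not the
# RolloutStorage method itself): with GAE it accumulates
# delta_t = r_t + gamma * V(s_{t+1}) * mask_{t+1} - V(s_t) backwards through time;
# otherwise it falls back to plain bootstrapped discounted returns.
import torch

def compute_returns_sketch(rewards, value_preds, masks, next_value, use_gae, gamma, tau):
    """rewards: (T, N, 1); value_preds and masks: (T + 1, N, 1). Returns (T + 1, N, 1)."""
    T = rewards.size(0)
    returns = torch.zeros(T + 1, *rewards.size()[1:])
    if use_gae:
        value_preds = value_preds.clone()
        value_preds[-1] = next_value
        gae = 0
        for step in reversed(range(T)):
            delta = (rewards[step]
                     + gamma * value_preds[step + 1] * masks[step + 1]
                     - value_preds[step])
            gae = delta + gamma * tau * masks[step + 1] * gae
            returns[step] = gae + value_preds[step]
    else:
        returns[-1] = next_value
        for step in reversed(range(T)):
            returns[step] = returns[step + 1] * gamma * masks[step + 1] + rewards[step]
    return returns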
def run(self):
    # (16, 4, 84, 84)
    current_obs = np.zeros([NUM_PROCESSES, *self.obs_shape])
    episode_rewards = np.zeros([NUM_PROCESSES, 1])
    final_rewards = np.zeros([NUM_PROCESSES, 1])

    # torch.Size([16, 1, 84, 84])
    obs = self.env.reset()
    # store the latest obs at the front of the frame stack
    current_obs[:, :1] = obs

    storage = RolloutStorage(NUM_ADVANCED_STEP, NUM_PROCESSES, self.obs_shape, current_obs)

    for j in tqdm(range(NUM_UPDATES)):
        for step in range(NUM_ADVANCED_STEP):
            # with torch.no_grad():
            _, cpu_actions = self.actor_critic.predict(storage.observations[step] / 255)
            action = np.argmax(np.array([np.random.multinomial(1, x) for x in cpu_actions]),
                               axis=1)

            # obs size: (16, 1, 84, 84)
            obs, reward, done, info = self.env.step(action)

            reward = reward.reshape(-1, 1)
            episode_rewards += reward
            final_rewards[done] = episode_rewards[done]
            episode_rewards[done] = 0

            # zero out the current observation for environments that are done
            current_obs[done] = 0

            # stack the frames: shift slots 1-3 into slots 2-4,
            # then store the latest obs in slot 1
            current_obs[:, 1:] = current_obs[:, :-1]
            current_obs[:, :1] = obs

            # insert this step's transition into the storage object
            storage.insert(current_obs, action, reward, done)

        # compute the state value predicted from the state at the final advanced step
        # with torch.no_grad():
        input_obs = storage.observations[-1] / 255
        next_value, _ = self.actor_critic.predict(input_obs)

        # compute the discounted return for every step
        storage.compute_discounted_rewards(next_value)

        # update the network and the storage
        self.global_brain.update(storage)
        storage.after_update()

        # logging: print intermediate progress
        if j % 100 == 0:
            print("finished frames {}, mean/median reward {:.1f}/{:.1f}, "
                  "min/max reward {:.1f}/{:.1f}".format(
                      j * NUM_PROCESSES * NUM_ADVANCED_STEP,
                      final_rewards.mean(), np.median(final_rewards),
                      final_rewards.min(), final_rewards.max()))

        # save the network weights
        if j % 12500 == 0:
            self.actor_critic.save('weight_' + str(j) + '.pth')

    # end of the training loop
    self.actor_critic.save('weight_end.pth')
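# The loop above delegates the return computation to
# storage.compute_discounted_rewards(next_value). A NumPy sketch of that n-step
# bootstrapped return (an assumption about the storage layout; the array names
# rewards/dones are illustrative, not taken from the tutorial code):
import numpy as np

def compute_discounted_rewards_sketch(rewards, dones, next_value, gamma=0.99):
    """rewards, dones: (NUM_ADVANCED_STEP, N, 1); next_value: (N, 1)."""
    T = rewards.shape[0]
    returns = np.zeros((T + 1,) + rewards.shape[1:])
    returns[-1] = next_value
    for step in reversed(range(T)):
        mask = 1.0 - dones[step]  # stop bootstrapping across episode boundaries
        returns[step] = rewards[step] + gamma * mask * returns[step + 1]
    return returns[:-1]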