def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) obs_numel = reduce(operator.mul, obs_shape, 1) if len(obs_shape) == 3 and obs_numel > 1024: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_numel, envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): if args.recurrent_policy: data_generator = rollouts.recurrent_generator( advantages, args.num_mini_batch) else: data_generator = rollouts.feed_forward_generator( advantages, args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) 
except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass
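# --------------------------------------------------------------------------
# Illustrative sketch (not part of the training script above): the PPO clipped
# surrogate objective used in the 'ppo' branch, written as a standalone
# function on plain tensors. The tensor names mirror the variables above
# (action_log_probs, old_action_log_probs_batch, adv_targ, clip_param); the
# helper itself is an assumed restatement of L^CLIP, not an API of this repo.
import torch


def ppo_clipped_loss(action_log_probs, old_action_log_probs, adv_targ,
                     clip_param=0.2):
    """Return the negated clipped surrogate objective L^CLIP."""
    # Probability ratio pi_new(a|s) / pi_old(a|s), computed in log space.
    ratio = torch.exp(action_log_probs - old_action_log_probs)
    surr1 = ratio * adv_targ
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv_targ
    # Pessimistic bound: take the elementwise minimum before averaging.
    return -torch.min(surr1, surr2).mean()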
class PPOAgent(ResearchAgent):
    """The PPOAgent. Acts through the algorithm, not here."""

    def __init__(self, actor_critic, character=characters.Bomber, **kwargs):
        self._actor_critic = actor_critic
        super(PPOAgent, self).__init__(character, **kwargs)

    def cuda(self):
        self._actor_critic.cuda()
        if hasattr(self, "_rollout"):
            self._rollout.cuda()

    @property
    def model(self):
        return self._actor_critic

    @property
    def optimizer(self):
        return self._optimizer

    def set_eval(self):
        self._actor_critic.eval()

    def set_train(self):
        self._actor_critic.train()

    def _rollout_data(self, step, num_agent, num_agent_end=None):
        if num_agent_end is not None:
            assert num_agent_end > num_agent
            observations = Variable(
                self._rollout.observations[step, num_agent:num_agent_end])
            states = Variable(
                self._rollout.states[step, num_agent:num_agent_end])
            masks = Variable(
                self._rollout.masks[step, num_agent:num_agent_end])
        else:
            observations = Variable(
                self._rollout.observations[step, num_agent], volatile=True)
            states = Variable(
                self._rollout.states[step, num_agent], volatile=True)
            masks = Variable(
                self._rollout.masks[step, num_agent], volatile=True)
        return observations, states, masks

    def actor_critic_act(self, step, num_agent=0, deterministic=False):
        """Uses the actor_critic to take action.

        Args:
            step: The int timestep that we are acting.
            num_agent: Agent id that's running. Non-zero when agent has copies.

        Returns:
            See the actor_critic's act function in model.py.
        """
        # NOTE: Training uses this --> it uses act(..., deterministic=False).
        return self._actor_critic.act(*self.get_rollout_data(step, num_agent),
                                      deterministic=deterministic)

    def get_rollout_data(self, step, num_agent, num_agent_end=None):
        return self._rollout_data(step, num_agent, num_agent_end)

    def actor_critic_call(self, step, num_agent=0):
        observations, states, masks = self._rollout_data(step, num_agent)
        return self._actor_critic(observations, states, masks)[0].data

    def _evaluate_actions(self, observations, states, masks, actions):
        return self._actor_critic.evaluate_actions(observations, states, masks,
                                                   actions)

    def _optimize(self, value_loss, action_loss, dist_entropy, entropy_coef,
                  value_loss_coef, max_grad_norm, kl_loss=None, kl_factor=0,
                  only_value_loss=False, add_nonlin=False, use_is=False):
        self._optimizer.zero_grad()
        # Only update the value head (to be used when fine-tuning a model
        # trained with BC without a value predictor) -- only at the beginning
        # of fine-tuning.
        if only_value_loss:
            loss = value_loss * value_loss_coef
            # Stop the gradients from flowing through the parameters that are
            # used to compute the actions (i.e. the policy head) and only
            # backprop the value loss through the value head (i.e. the
            # parameters used exclusively to predict the value).
            for p in self._actor_critic.parameters():
                p.requires_grad = False
            for p in self._actor_critic.critic_linear.parameters():
                p.requires_grad = True
            if add_nonlin:
                for p in self._actor_critic.fc_critic.parameters():
                    p.requires_grad = True
            loss.backward()
        else:
            loss = value_loss * value_loss_coef + action_loss \
                - dist_entropy * entropy_coef
            if kl_factor > 0 and not use_is:
                loss += kl_factor * kl_loss
            loss.backward()
        nn.utils.clip_grad_norm(self._actor_critic.parameters(), max_grad_norm)
        self._optimizer.step()
        if hasattr(self, '_scheduler'):
            self._scheduler.step(loss)
        if only_value_loss:
            for p in self._actor_critic.parameters():
                p.requires_grad = True

    def halve_lr(self):
        for i, param_group in enumerate(self._optimizer.param_groups):
            old_lr = float(param_group['lr'])
            new_lr = max(old_lr * 0.5, 1e-7)
            param_group['lr'] = new_lr

    def compute_advantages(self, next_value_agents, use_gae, gamma, tau):
        for num_agent, next_value in enumerate(next_value_agents):
            self._rollout.compute_returns(next_value, use_gae, gamma, tau,
                                          num_agent)
        advantages = self._rollout.compute_advantages()
        diff = (advantages - advantages.mean())
        advantages = diff / (advantages.std() + 1e-5)
        return advantages

    def initialize(self, args, obs_shape, action_space,
                   num_training_per_episode, num_episodes, total_steps,
                   num_epoch, optimizer_state_dict, num_steps, uniform_v,
                   uniform_v_prior):
        params = self._actor_critic.parameters()
        self._optimizer = optim.Adam(params, lr=args.lr, eps=args.eps)
        if optimizer_state_dict:
            self._optimizer.load_state_dict(optimizer_state_dict)
        if args.use_lr_scheduler:
            self._scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                self._optimizer, mode='min', verbose=True)
        self._rollout = RolloutStorage(num_steps, args.num_processes,
                                       obs_shape, action_space,
                                       self._actor_critic.state_size,
                                       num_training_per_episode)
        self.num_episodes = num_episodes
        self.total_steps = total_steps
        self.num_epoch = num_epoch
        self.uniform_v = uniform_v
        self.uniform_v_prior = uniform_v_prior

    def update_rollouts(self, obs, timestep):
        self._rollout.observations[timestep, :, :, :, :, :].copy_(obs)

    def insert_rollouts(self, step, current_obs, states, action,
                        action_log_prob, value, reward, mask,
                        action_log_prob_distr=None, dagger_prob_distr=None,
                        expert_action_log_prob=None,
                        training_action_log_prob=None):
        self._rollout.insert(step, current_obs, states, action,
                             action_log_prob, value, reward, mask,
                             action_log_prob_distr, dagger_prob_distr,
                             expert_action_log_prob=expert_action_log_prob,
                             training_action_log_prob=training_action_log_prob)

    def ppo(self, advantages, num_mini_batch, batch_size, num_steps,
            clip_param, entropy_coef, value_loss_coef, max_grad_norm,
            action_space, anneal=False, lr=1e-4, eps=1e-5, kl_factor=0,
            only_value_loss=False, add_nonlin=False, use_is=False,
            use_retrace=False, lambda_retrace=1.0):
        action_losses = []
        value_losses = []
        dist_entropies = []
        kl_losses = []
        kl_loss = None
        total_losses = []
        if hasattr(self._actor_critic, 'gru'):
            data_generator = self._rollout.recurrent_generator(
                advantages, num_mini_batch, batch_size, num_steps, kl_factor,
                use_is)
        else:
            data_generator = self._rollout.feed_forward_generator(
                advantages, num_mini_batch, batch_size, num_steps,
                action_space, kl_factor, use_is)

        for sample in data_generator:
            observations_batch, states_batch, actions_batch, return_batch, \
                masks_batch, old_action_log_probs_batch, adv_targ, \
                action_log_probs_distr_batch, dagger_probs_distr_batch, \
                expert_action_log_probs_batch, training_action_log_probs_batch \
                = sample

            # Reshape to do in a single forward pass for all steps
            result = self._evaluate_actions(Variable(observations_batch),
                                            Variable(states_batch),
                                            Variable(masks_batch),
                                            Variable(actions_batch))
            values, action_log_probs, dist_entropy, states = result

            adv_targ = Variable(adv_targ)
            ratio = torch.exp(action_log_probs -
                              Variable(old_action_log_probs_batch))
            surr1 = ratio * adv_targ
            surr2 = torch.clamp(ratio, 1.0 - clip_param,
                                1.0 + clip_param) * adv_targ
            action_loss = -torch.min(surr1, surr2).mean()
            value_loss = (Variable(return_batch) - values).pow(2).mean()
            total_loss = value_loss * value_loss_coef + action_loss \
                - dist_entropy * entropy_coef
            if kl_factor > 0 and not use_is:
                criterion = nn.KLDivLoss()
                kl_loss = criterion(Variable(action_log_probs_distr_batch),
                                    Variable(dagger_probs_distr_batch))
                total_loss += kl_factor * kl_loss

            self._optimize(value_loss, action_loss, dist_entropy, entropy_coef,
                           value_loss_coef, max_grad_norm, kl_loss, kl_factor,
                           only_value_loss, add_nonlin, use_is)
            lr = self._optimizer.param_groups[0]['lr']

            action_losses.append(action_loss.data[0])
            value_losses.append(value_loss.data[0])
            dist_entropies.append(dist_entropy.data[0])
            if kl_factor > 0 and not use_is:
                kl_losses.append(kl_loss.data[0])
            total_losses.append(total_loss.data[0])

        return action_losses, value_losses, dist_entropies, \
            kl_losses, total_losses, lr

    def copy_ex_model(self):
        """Creates a copy without the model.

        This is for operating with homogeneous training."""
        return PPOAgent(None, self._character,
                        num_processes=self._num_processes)

    def copy_with_model(self):
        """Creates a copy with the model.

        This is for operating with frozen backplay."""
        return PPOAgent(self._actor_critic, self._character,
                        num_processes=self._num_processes)

    def after_epoch(self):
        self._rollout.after_epoch()

    def set_new_model(self, model, cuda=False):
        self._actor_critic = model
        if cuda:
            self._actor_critic.cuda()
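# --------------------------------------------------------------------------
# Illustrative sketch (not part of the class above): the intent behind
# `only_value_loss` in PPOAgent._optimize is to fine-tune only the value head
# of a behaviour-cloned policy. A minimal, self-contained version of that
# freezing pattern is below; `critic_linear` and `fc_critic` are assumed
# submodule names taken from the policy used above, not a guaranteed interface.
def freeze_all_but_value_head(model, value_head_names=("critic_linear", "fc_critic")):
    """Disable gradients for every parameter except the named value-head modules."""
    for p in model.parameters():
        p.requires_grad = False
    for name in value_head_names:
        head = getattr(model, name, None)
        if head is not None:
            for p in head.parameters():
                p.requires_grad = True


def unfreeze_all(model):
    """Re-enable gradients for every parameter after value-head fine-tuning."""
    for p in model.parameters():
        p.requires_grad = True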
def main(): print("######") print("HELLO! Returns start with infinity values") print("######") os.environ['OMP_NUM_THREADS'] = '1' if args.random_task: env_params = { 'wt': np.round(np.random.uniform(0.5, 1.0), 2), 'x': np.round(np.random.uniform(-0.1, 0.1), 2), 'y': np.round(np.random.uniform(-0.1, 0.1), 2), 'z': np.round(np.random.uniform(0.15, 0.2), 2), } else: env_params = { 'wt': args.euclidean_weight, 'x': args.goal_x, 'y': args.goal_y, 'z': args.goal_z, } envs = [make_env(args.env_name, args.seed, i, args.log_dir, **env_params) for i in range(args.num_processes)] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) envs = VecNormalize(envs, ob=False) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() actor_critic.input_norm.update(rollouts.observations[0]) last_return = -np.inf best_return = -np.inf best_models = None start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act(Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) actor_critic.input_norm.update(rollouts.observations[step + 1]) next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): if args.recurrent_policy: data_generator = rollouts.recurrent_generator(advantages, args.num_mini_batch) else: data_generator = rollouts.feed_forward_generator(advantages, args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if args.vis and j % args.vis_interval == 0: last_return = plot(logger, 
args.log_dir) if last_return > best_return: best_return = last_return try: os.makedirs(os.path.dirname(args.save_path)) except OSError: pass info = { 'return': best_return, 'reward_norm': np.sqrt(envs.ret_rms.var + envs.epsilon) } # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() torch.save((save_model, env_params, info), args.save_path) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print("Updates {}, num timesteps {}, FPS {}, average return {:.5f}, best_return {:.5f}, value loss {:.5f}, policy loss {:.5f}". format(j, total_num_steps, int(total_num_steps / (end - start)), last_return, best_return, value_loss.data[0], action_loss.data[0]))
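# --------------------------------------------------------------------------
# Illustrative sketch (not part of the script above): the checkpoint saved
# above stores reward_norm = sqrt(ret_rms.var + epsilon) because the
# VecNormalize wrapper rescales rewards by that factor during training.
# Assuming that convention, a normalized return reported by the saved model
# can be converted back to the environment's true reward scale at evaluation
# time roughly as follows. `load_and_rescale` is a hypothetical helper, not a
# function of this repo.
import torch


def load_and_rescale(save_path, normalized_return):
    """Load (model, env_params, info) as saved above and undo reward scaling."""
    save_model, env_params, info = torch.load(save_path, map_location='cpu')
    true_scale_return = normalized_return * info['reward_norm']
    return save_model, env_params, true_scale_return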
class PPOAgent(BaseAgent):
    """The PPOAgent. Acts through the algorithm, not here."""

    def __init__(self, agent, actor_critic):
        self._actor_critic = actor_critic
        self._agent = agent

    def cuda(self):
        self._actor_critic.cuda()
        self._rollout.cuda()

    def get_model(self):
        return self._actor_critic

    def get_optimizer(self):
        return self._optimizer

    def act(self, obs, action_space):
        """This agent has its own way of inducing actions."""
        return None

    def act_pytorch(self, step, num_agent=0):
        """Uses the actor_critic to act.

        Args:
            step: The int timestep that we are acting.
            num_agent: The agent id that is acting. This is non-zero when this
                agent has copies.

        Returns:
            See the actor_critic's act function in model.py.
        """
        return self._actor_critic.act(
            Variable(self._rollout.observations[step, num_agent], volatile=True),
            Variable(self._rollout.states[step, num_agent], volatile=True),
            Variable(self._rollout.masks[step, num_agent], volatile=True))

    def run_actor_critic(self, step, num_agent=0):
        return self._actor_critic(
            Variable(self._rollout.observations[step][num_agent], volatile=True),
            Variable(self._rollout.states[step][num_agent], volatile=True),
            Variable(self._rollout.masks[step][num_agent], volatile=True))[0].data

    def evaluate_actions(self, observations, states, masks, actions):
        return self._actor_critic.evaluate_actions(observations, states, masks,
                                                   actions)

    def optimize(self, value_loss, action_loss, dist_entropy, entropy_coef,
                 max_grad_norm):
        self._optimizer.zero_grad()
        (value_loss + action_loss - dist_entropy * entropy_coef).backward()
        nn.utils.clip_grad_norm(self._actor_critic.parameters(), max_grad_norm)
        self._optimizer.step()

    def compute_advantages(self, next_value_agents, use_gae, gamma, tau):
        for num_agent, next_value in enumerate(next_value_agents):
            self._rollout.compute_returns(next_value, use_gae, gamma, tau,
                                          num_agent)
        advantages = self._rollout.compute_advantages()
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
        return advantages

    def initialize(self, args, obs_shape, action_space,
                   num_training_per_episode):
        self._optimizer = optim.Adam(self._actor_critic.parameters(), args.lr,
                                     eps=args.eps)
        self._rollout = RolloutStorage(args.num_steps, args.num_processes,
                                       obs_shape, action_space,
                                       self._actor_critic.state_size,
                                       num_training_per_episode)

    def update_rollouts(self, obs, timestep):
        self._rollout.observations[timestep, :, :, :, :, :].copy_(obs)

    def insert_rollouts(self, step, current_obs, states, action,
                        action_log_prob, value, reward, mask):
        self._rollout.insert(step, current_obs, states, action,
                             action_log_prob, value, reward, mask)

    def feed_forward_generator(self, advantage, args):
        return self._rollout.feed_forward_generator(advantage, args)

    def copy(self, agent):
        # NOTE: Ugh. This is bad.
        return PPOAgent(agent, None)

    def after_update(self):
        self._rollout.after_update()
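# --------------------------------------------------------------------------
# Illustrative sketch (not part of the class above): what
# RolloutStorage.compute_returns is expected to do when use_gae is set,
# written out for a single process. This is the standard GAE(lambda)
# recursion; the exact layout of the repo's RolloutStorage may differ.
import torch


def gae_returns(rewards, values, masks, next_value, gamma=0.99, tau=0.95):
    """rewards, values, masks: [T, 1]; next_value: [1]; masks[t] is 0 where the
    episode ended at step t. Returns the [T, 1] tensor of GAE-based returns."""
    T = rewards.size(0)
    returns = torch.zeros(T, 1)
    values = torch.cat([values, next_value.view(1, 1)], dim=0)
    gae = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * tau * masks[t] * gae
        returns[t] = gae + values[t]
    return returns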
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") print(args) try: os.makedirs(args.log_dir) except OSError: files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv')) for f in files: os.remove(f) torch.cuda.manual_seed(args.seed) torch.manual_seed(args.seed) np.random.seed(args.seed) random.seed(args.seed) for gamma in args.gamma: with open(args.log_dir + '/MSE_' + str(gamma) + '_monitor.csv', "wt") as monitor_file: monitor = csv.writer(monitor_file) monitor.writerow([ 'update', 'error', str(int(args.num_frames) // args.num_steps) ]) os.environ['OMP_NUM_THREADS'] = '1' print("Using env {}".format(args.env_name)) envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) num_heads = len( args.gamma) if not args.reward_predictor else len(args.gamma) - 1 if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, num_heads=num_heads, hidden_size=args.hidden_size) else: actor_critic = MLPPolicy(obs_shape[0], envs.action_space, num_heads=num_heads, reward_predictor=args.reward_predictor, use_s=args.use_s, use_s_a=args.use_s_a, use_s_a_sprime=args.use_s_a_sprime) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() lrs = [args.lr] * len(actor_critic.param_groups) if not args.reward_predictor: assert len(actor_critic.param_groups) == len(lrs) model_params = [{ 'params': model_p, 'lr': args.lr } for model_p, lr in zip(actor_critic.param_groups, lrs)] else: model_params = [{ 'params': model_p, 'lr': p_lr } for model_p, p_lr in zip(actor_critic.param_groups[:-1], lrs)] model_params.append({ 'params': actor_critic.param_groups[-1], 'lr': args.lr_rp }) if args.algo == 'a2c': optimizer = optim.RMSprop(model_params, args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(model_params, args.lr, eps=args.eps) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size, gamma=args.gamma, use_rp=args.reward_predictor) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs, obs_tensor): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: obs_tensor[:, :-shape_dim0] = obs_tensor[:, shape_dim0:] obs_tensor[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs, current_obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. 
episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) advantages_list = [] if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() cpu_actions = add_gaussian_noise(cpu_actions, args.action_noise) # Obser reward and next obs obs, raw_reward, done, info = envs.step(cpu_actions) reward = np.copy(raw_reward) reward = add_gaussian_noise(reward, args.reward_noise) reward = epsilon_greedy(reward, args.reward_epsilon, args.reward_high, args.reward_low) raw_reward = torch.from_numpy( np.expand_dims(np.stack(raw_reward), 1)).float() episode_rewards += raw_reward reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs, current_obs) if args.reward_predictor: r_hat = actor_critic.predict_reward( Variable(rollouts.observations[step], volatile=True), action, Variable(current_obs, volatile=True)) p_hat = min(args.rp_burn_in, j) / args.rp_burn_in estimate_reward = (1 - p_hat) * reward + p_hat * r_hat.data.cpu() reward = torch.cat([reward, estimate_reward], dim=-1) value = torch.cat([r_hat, value], dim=-1).data else: value = value.data rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value, reward, masks, raw_reward) next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data if args.reward_predictor: if args.use_s or args.use_s_a: r_hat = actor_critic.predict_reward( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.actions[-1], volatile=True), None).data next_value = torch.cat([r_hat, next_value], dim=-1) else: next_value = torch.cat([ torch.zeros(list(next_value.size())[:-1] + [1]), next_value ], dim=-1) rollouts.compute_returns(next_value, args.use_gae, args.tau) if args.algo in ['a2c']: batch_states = Variable(rollouts.states[0].view( -1, actor_critic.state_size)) batch_masks = Variable(rollouts.masks[:-1].view(-1, 1)) batch_obs = Variable(rollouts.observations[:-1].view( -1, *obs_shape)) batch_actions = Variable(rollouts.actions.view(-1, action_shape)) values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( batch_obs, batch_states, batch_masks, batch_actions) if args.reward_predictor: batch_obs_prime = Variable(rollouts.observations[1:].view( -1, *obs_shape)) values = torch.cat([ actor_critic.predict_reward(batch_obs, batch_actions, batch_obs_prime), values ], dim=-1) returns_as_variable = Variable(rollouts.returns[:-1]) batched_v_loss = 0 values = values.view(returns_as_variable.size()) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = returns_as_variable - values value_loss = advantages.pow(2).sum(-1).mean() action_loss = -(Variable(advantages[:, :, 
-1].unsqueeze(-1).data) * action_log_probs).mean() if args.reward_predictor: rp_error = (values[:, :, 0].data - rollouts.raw_rewards).pow(2).mean() advantages_list.append([ rp_error, advantages[:, :, -1].pow(2).mean().data.cpu().numpy()[0] ]) else: advantages_list.append( advantages[:, :, -1].pow(2).mean().data.cpu().numpy()[0]) optimizer.zero_grad() (batched_v_loss + value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = advantages[:, :, -1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): data_generator = rollouts.feed_forward_generator( advantages, args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ, observations_batch_prime, true_rewards_batch, \ noisy_observations_batch, true_observations_batch = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) if args.reward_predictor: values = torch.cat([ actor_critic.predict_reward( Variable(observations_batch), Variable(actions_batch), Variable(observations_batch_prime)), values ], dim=-1) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) td = (Variable(return_batch) - values).pow(2) value_loss = td.sum(-1).mean() if args.reward_predictor: rp_error = (values[:, 0].data - true_rewards_batch).pow(2).mean() advantages_list.append( [rp_error, td[:, -1].mean(0).data.cpu().numpy()]) else: advantages_list.append( td[:, -1].mean(0).data.cpu().numpy()) optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, " "entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".format( j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if len(advantages_list) > 2: advantages_array = np.array(advantages_list).reshape( -1, len(args.gamma)).T for g, gamma in enumerate(args.gamma): with open( args.log_dir + '/MSE_' + str(gamma) + '_monitor.csv', "a") as monitor_file: 
monitor = csv.writer(monitor_file) monitor.writerow( [total_num_steps, np.mean(advantages_array[g])]) advantages_list = []
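# --------------------------------------------------------------------------
# Illustrative sketch (not part of the script above): the reward-predictor
# branch blends the observed reward with the predicted reward r_hat using a
# burn-in schedule, p_hat = min(rp_burn_in, j) / rp_burn_in. The standalone
# helper below only restates that mixing rule; the name is hypothetical.
def blended_reward(env_reward, predicted_reward, update_index, rp_burn_in):
    """Linearly shift trust from the environment reward to the learned predictor."""
    p_hat = min(rp_burn_in, update_index) / rp_burn_in
    return (1.0 - p_hat) * env_reward + p_hat * predicted_reward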
class ReinforceAgent(ResearchAgent): """The TensorForceAgent. Acts through the algorith, not here.""" def __init__(self, actor_critic, character=characters.Bomber, **kwargs): self._actor_critic = actor_critic super(ReinforceAgent, self).__init__(character, **kwargs) def cuda(self): self._actor_critic.cuda() if hasattr(self, "_rollout"): self._rollout.cuda() @property def model(self): return self._actor_critic @property def optimizer(self): return self._optimizer def set_eval(self): self._actor_critic.eval() def set_train(self): self._actor_critic.train() def _rollout_data(self, step, num_agent, num_agent_end=None): if num_agent_end is not None: assert (num_agent_end > num_agent) observations = Variable( self._rollout.observations[step, num_agent:num_agent_end], volatile=True) states = Variable(self._rollout.states[step, num_agent:num_agent_end], volatile=True) masks = Variable(self._rollout.masks[step, num_agent:num_agent_end], volatile=True) else: observations = Variable(self._rollout.observations[step, num_agent], volatile=True) states = Variable(self._rollout.states[step, num_agent], volatile=True) masks = Variable(self._rollout.masks[step, num_agent], volatile=True) return observations, states, masks def actor_critic_act(self, step, num_agent=0, deterministic=False): """Uses the actor_critic to take action. Args: step: The int timestep that we are acting. num_agent: Agent id that's running. Non-zero when agent has copies. Returns: See the actor_critic's act function in model.py. """ return self._actor_critic.act(*self.get_rollout_data(step, num_agent), deterministic=deterministic) def get_rollout_data(self, step, num_agent, num_agent_end=None): return self._rollout_data(step, num_agent, num_agent_end) def actor_critic_call(self, step, num_agent=0): observations, states, masks = self._rollout_data(step, num_agent) return self._actor_critic(observations, states, masks)[0].data def _evaluate_actions(self, observations, states, masks, actions): return self._actor_critic.evaluate_actions(observations, states, masks, actions) def _optimize(self, pg_loss, max_grad_norm, kl_loss=None, kl_factor=0, use_is=False): self._optimizer.zero_grad() loss = pg_loss if kl_factor > 0: # and not use_is: loss += kl_factor * kl_loss loss.backward() nn.utils.clip_grad_norm(self._actor_critic.parameters(), max_grad_norm) self._optimizer.step() if hasattr(self, '_scheduler'): self._scheduler.step(loss) def halve_lr(self): for i, param_group in enumerate(self._optimizer.param_groups): old_lr = float(param_group['lr']) new_lr = max(old_lr * 0.5, 1e-7) param_group['lr'] = new_lr def compute_advantages(self, next_value_agents, use_gae, gamma, tau): for num_agent, next_value in enumerate(next_value_agents): self._rollout.compute_returns(next_value, use_gae, gamma, tau, num_agent) advantages = self._rollout.compute_advantages(reinforce=True) diff = (advantages - advantages.mean()) advantages = diff / (advantages.std() + 1e-5) return advantages def initialize(self, args, obs_shape, action_space, num_training_per_episode, num_episodes, total_steps, num_epoch, optimizer_state_dict, num_steps, uniform_v, uniform_v_prior): params = self._actor_critic.parameters() self._optimizer = optim.Adam(params, lr=args.lr, eps=args.eps) if optimizer_state_dict: self._optimizer.load_state_dict(optimizer_state_dict) if args.use_lr_scheduler: self._scheduler = optim.lr_scheduler.ReduceLROnPlateau( self._optimizer, mode='min', verbose=True) self._rollout = RolloutStorage(num_steps, args.num_processes, obs_shape, action_space, 
self._actor_critic.state_size, num_training_per_episode) self.num_episodes = num_episodes self.total_steps = total_steps self.num_epoch = num_epoch self.uniform_v = uniform_v self.uniform_v_prior = uniform_v_prior def update_rollouts(self, obs, timestep): self._rollout.observations[timestep, :, :, :, :, :].copy_(obs) def insert_rollouts(self, step, current_obs, states, action, action_log_prob, value, reward, mask, action_log_prob_distr=None, dagger_prob_distr=None, expert_action_log_prob=None, training_action_log_prob=None): self._rollout.insert(step, current_obs, states, action, action_log_prob, value, reward, mask, action_log_prob_distr, dagger_prob_distr, expert_action_log_prob, training_action_log_prob) def reinforce(self, advantages, num_mini_batch, batch_size, num_steps, max_grad_norm, action_space, anneal=False, lr=1e-4, eps=1e-5, kl_factor=0, use_is=False, use_retrace=False, lambda_retrace=1.0): pg_losses = [] kl_losses = [] kl_loss = None total_losses = [] for sample in self._rollout.feed_forward_generator( advantages, num_mini_batch, batch_size, num_steps, action_space, kl_factor, use_is): observations_batch, states_batch, actions_batch, return_batch, \ masks_batch, old_action_log_probs_batch, adv_targ, \ action_log_probs_distr_batch, dagger_probs_distr_batch, \ expert_action_log_probs_batch, training_action_log_probs_batch = sample # Reshape to do in a single forward pass for all steps result = self._evaluate_actions(Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) values, action_log_probs, dist_entropy, states = result adv_targ = Variable(adv_targ) logprob = action_log_probs if use_is: behavior_action_probs_batch = kl_factor * torch.exp(Variable(expert_action_log_probs_batch)) + \ (1 - kl_factor) * torch.exp(Variable(training_action_log_probs_batch)) training_action_probs_batch = torch.exp( Variable(training_action_log_probs_batch)) training_single_action_probs_batch = training_action_probs_batch.gather( 1, Variable(actions_batch)) behavior_single_action_probs_batch = behavior_action_probs_batch.gather( 1, Variable(actions_batch)) is_ratio = training_single_action_probs_batch / behavior_single_action_probs_batch if use_retrace: truncated_ratio = np.array( [max(1, p.data[0]) for p in is_ratio]) truncated_ratio = Variable( torch.from_numpy( truncated_ratio).float().cuda().unsqueeze(1)) importance_weight = lambda_retrace * truncated_ratio else: importance_weight = is_ratio pg_loss = -(logprob * adv_targ * importance_weight).mean() print("\n#################################################\n") print("action log probs ", logprob) print("adv targ ", adv_targ) print("training probs ", training_action_probs_batch) print("behavior probs ", behavior_action_probs_batch) print("ratio ", is_ratio) print("IS ", importance_weight) print("loss: ", pg_loss) print("#################################################\n") else: pg_loss = -(logprob * adv_targ).mean() total_loss = pg_loss if kl_factor > 0: # and not use_is: criterion = nn.KLDivLoss() kl_loss = criterion(Variable(action_log_probs_distr_batch), Variable(dagger_probs_distr_batch)) total_loss += kl_factor * kl_loss self._optimize(total_loss, max_grad_norm, kl_loss, kl_factor, use_is) lr = self._optimizer.param_groups[0]['lr'] pg_losses.append(pg_loss.data[0]) if kl_factor > 0: # and not use_is: kl_losses.append(kl_loss.data[0]) total_losses.append(total_loss.data[0]) return pg_losses, kl_losses, total_losses, lr def copy_ex_model(self): """Creates a copy without the model. 
This is for operating with homogeneous training.""" return ReinforceAgent(None, self._character) def after_epoch(self): self._rollout.after_epoch()
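# --------------------------------------------------------------------------
# Illustrative sketch (not part of the class above): the importance-sampling
# branch of ReinforceAgent.reinforce reweights the policy-gradient term by the
# ratio of training-policy to behaviour-policy action probabilities, optionally
# truncating the ratio (the code above truncates from below with max(1, ratio);
# Retrace-style estimators usually clip from above with min(1, ratio)). A
# minimal tensor version of the weighted loss, under those assumptions:
import torch


def is_weighted_pg_loss(action_log_probs, adv_targ, training_probs,
                        behavior_probs, truncate_at=None):
    """REINFORCE loss weighted by a (possibly truncated) importance ratio."""
    ratio = training_probs / behavior_probs
    if truncate_at is not None:
        ratio = torch.clamp(ratio, max=truncate_at)
    return -(action_log_probs * adv_targ * ratio).mean()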
class PPOAgent(object): def __init__(self,args): self.args = args self.device = torch.device('cuda') if args.cuda else torch.device('cpu') dummy_env = gym.make(self.args.env_name) self.actor = ACNet(dummy_env.action_space.n,args.feedforward) del dummy_env if args.load_dir is not None: actorState = torch.load(args.load_dir,map_location=lambda storage, loc: storage) if args.continue_training: self.actor.load_state_dict(actorState) print("Loaded pretrained model successfully") if args.transfer: self.actor.load_autoturn_model(actorState) if args.cuda: self.actor.cuda() self.actor_optimizer = optim.Adam(self.actor.parameters(),lr=self.args.lr) self.env_list = [make_env(self.args.env_name,self.args.seed,i) for i in range(self.args.num_processes)] if self.args.num_processes > 1: self.envs = gym_vecenv.SubprocVecEnv(self.env_list) else: self.envs = gym_vecenv.DummyVecEnv(self.env_list) if len(self.envs.observation_space.shape) == 1: self.envs = gym_vecenv.VecNormalize(self.envs) self.obs_shape = self.envs.observation_space.shape self.obs_shape = (self.obs_shape[0] * args.num_stack, *self.obs_shape[1:]) self.state_shape = 1 if args.feedforward else 256 self.rollouts = RolloutStorage(self.args.num_fwd_steps, self.args.num_processes, self.obs_shape, self.envs.action_space, self.state_shape) self.num_updates = int(args.num_frames)//args.num_fwd_steps//args.num_processes self.current_obs = torch.zeros(self.args.num_processes,*self.obs_shape) self.writer = SummaryWriter(log_dir=self.args.save_dir) self.fortress_threshold = 650 self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.actor_optimizer, mode='max',factor=0.2,patience=15,verbose=True,threshold=1e-3, threshold_mode='rel') #self.scheduler2 = torch.optim.lr_scheduler.MultiStepLR(self.actor_optimizer,milestones=[40,80],gamma=0.3) def update_current_obs(self,obs): shape_dim0 = self.envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if self.args.num_stack > 1: self.current_obs[:, :-shape_dim0] = self.current_obs[:, shape_dim0:] self.current_obs[:, -shape_dim0:] = obs def train(self): obs = self.envs.reset() self.update_current_obs(obs) self.rollouts.observations[0].copy_(self.current_obs) episode_rewards = torch.zeros([self.args.num_processes,1]) final_rewards = torch.zeros([self.args.num_processes,1]) num_destruction = 0 if self.args.cuda: self.current_obs = self.current_obs.cuda() self.rollouts.cuda() start = time.time() num_episodes = 0 for iteration in range(self.num_updates): for step in range(self.args.num_fwd_steps): with torch.no_grad(): value,action,action_log_prob,states = self.actor.act(self.rollouts.observations[step], self.rollouts.states[step], self.rollouts.masks[step]) cpu_actions = action.squeeze(1).cpu().numpy() obs,reward,done,info = self.envs.step(cpu_actions) num_destruction += sum(info) reward = torch.from_numpy(np.expand_dims(np.stack(reward),1)).float() episode_rewards += reward masks = torch.FloatTensor([[0.0] if i else [1.0] for i in done]) final_rewards*=masks final_rewards += (1-masks)*episode_rewards episode_rewards *= masks if self.args.cuda: masks = masks.cuda() if self.current_obs.dim() == 4: self.current_obs *= masks.unsqueeze(2).unsqueeze(2) else: self.current_obs *= masks self.update_current_obs(obs) self.rollouts.insert(step,self.current_obs,states,action,action_log_prob,value,reward,masks) with torch.no_grad(): next_value = self.actor.get_value(self.rollouts.observations[-1], self.rollouts.states[-1], self.rollouts.masks[-1]).detach() 
self.rollouts.compute_returns(next_value,True,self.args.gamma,self.args.tau) if not self.args.a2c: advantages = self.rollouts.returns[:-1] - self.rollouts.value_preds[:-1] advantages = (advantages - advantages.mean())/(advantages.std()+1e-5) for i in range(self.args.ppo_epoch): if self.args.feedforward: data_generator = self.rollouts.feed_forward_generator(advantages,self.args.num_mini_batch) else: data_generator = self.rollouts.recurrent_generator(advantages,self.args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample values,action_log_probs,dist_entropy,states = self.actor.evaluate_actions(observations_batch, states_batch, masks_batch, actions_batch) ratio = torch.exp(action_log_probs - old_action_log_probs_batch) surr1 = ratio*adv_targ surr2 = torch.clamp(ratio,1.0-self.args.clip_param,1.0+self.args.clip_param)*adv_targ action_loss = -torch.min(surr1,surr2).mean() value_loss = (values-return_batch).pow(2).mean() self.actor_optimizer.zero_grad() actorLoss = action_loss + self.args.value_loss_coeff*value_loss - self.args.entropy_coeff*dist_entropy actorLoss.backward() torch.nn.utils.clip_grad_norm_(self.actor.parameters(),self.args.max_grad_norm) self.actor_optimizer.step() else: values,action_log_probs,dist_entropy,states = self.actor.evaluate_actions(self.rollouts.observations[:-1].view(-1,*self.obs_shape), self.rollouts.states[0].view(-1,1 if self.args.feedforward else 256), self.rollouts.masks[:-1].view(-1,1), self.rollouts.actions.view(-1,1)) values = values.view(self.args.num_fwd_steps,self.args.num_processes,1) action_log_probs = action_log_probs.view(self.args.num_fwd_steps,self.args.num_processes,1) advantages = self.rollouts.returns[:-1] - values value_loss = advantages.pow(2).mean() action_loss = -(advantages.detach()*action_log_probs).mean() self.actor_optimizer.zero_grad() actorLoss = action_loss + self.args.value_loss_coeff*value_loss - self.args.entropy_coeff*dist_entropy actorLoss.backward() self.actor_optimizer.step() self.rollouts.after_update() if num_destruction>self.fortress_threshold: torch.save(self.actor.state_dict(),self.args.save_dir+'/'+self.args.env_name+'_'+str(iteration)+'_ppo_actor.pth.tar') self.fortress_threshold = num_destruction if iteration%self.args.log_interval == 0: end = time.time() total_num_steps = (iteration+1)*self.args.num_processes*self.args.num_fwd_steps num_destruction /= self.args.num_processes print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}, num fortress destroyed {:.2f}". 
format(iteration, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.item(), value_loss.item(), action_loss.item(),num_destruction)) self.writer.add_scalar('data/rewardmean',final_rewards.mean(),total_num_steps) self.writer.add_scalar('data/distentropy',dist_entropy.item(),total_num_steps) self.writer.add_scalar('data/valueloss',value_loss.item(),total_num_steps) self.writer.add_scalar('data/actionloss',action_loss.item(),total_num_steps) self.writer.add_scalar('data/numdestruction',num_destruction,total_num_steps) #self.scheduler.step(final_rewards.mean())i #self.scheduler2.step() num_destruction = 0 if iteration%self.args.save_interval==0: torch.save(self.actor.state_dict(),self.args.save_dir+'/'+self.args.env_name+'_'+str(iteration)+'_ppo_actor.pth.tar') self.writer.export_scalars_to_json(self.args.save_dir+'/'+self.args.env_name+"_all_scalars.json") self.envs.close() self.writer.close()
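# --------------------------------------------------------------------------
# Illustrative sketch (not part of the class above): every training loop in
# this file maintains a stacked-observation buffer with the same
# update_current_obs pattern (shift the old frames towards the front, write
# the newest frame into the last slot). A standalone version, assuming
# current_obs has shape [num_processes, num_stack * frame_channels, H, W]:
import torch


def stack_new_frame(current_obs, new_obs, frame_channels):
    """Shift the frame stack by one and append the newest observation in place."""
    new_obs = torch.from_numpy(new_obs).float()
    if current_obs.size(1) > frame_channels:  # num_stack > 1
        current_obs[:, :-frame_channels] = current_obs[:, frame_channels:]
    current_obs[:, -frame_channels:] = new_obs
    return current_obs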
def main(): """ 主程序 :return: """ num_cls = args.wave_num * args.k + 1 # 所有的路由和波长选择组合,加上啥都不选 action_shape = 1 # action的维度,默认是1. num_updates = int( args.steps) // args.workers // args.num_steps # 梯度一共需要更新的次数 if args.append_route.startswith("True"): channel_num = args.wave_num + args.k else: channel_num = args.wave_num # 解析weight if args.weight.startswith('None'): weight = None else: weight = args.weight # 创建actor_critic if args.mode.startswith('alg'): # ksp(args, weight) return elif args.mode.startswith('learning'): # CNN学习模式下,osb的shape应该是CHW obs_shape = (channel_num, args.img_height, args.img_width) if args.cnn.startswith('mobilenetv2'): actor_critic = MobileNetV2(in_channels=channel_num, num_classes=num_cls, t=6) elif args.cnn.startswith('simplenet'): actor_critic = SimpleNet(in_channels=channel_num, num_classes=num_cls) elif args.cnn.startswith('simplestnet'): actor_critic = SimplestNet(in_channels=channel_num, num_classes=num_cls) elif args.cnn.startswith('alexnet'): actor_critic = AlexNet(in_channels=channel_num, num_classes=num_cls) elif args.cnn.startswith('squeezenet'): actor_critic = SqueezeNet(in_channels=channel_num, num_classes=num_cls, version=1.0) elif args.cnn.startswith('expandsimplenet'): actor_critic = ExpandSimpleNet(in_channels=channel_num, num_classes=num_cls, expand_factor=args.expand_factor) elif args.cnn.startswith('deepersimplenet'): actor_critic = DeeperSimpleNet(in_channels=channel_num, num_classes=num_cls, expand_factor=args.expand_factor) else: raise NotImplementedError # 创建optimizer if args.algo.startswith("a2c"): optimizer = optim.RMSprop(actor_critic.parameters(), lr=args.base_lr, eps=args.epsilon, alpha=args.alpha) elif args.algo.startswith("ppo"): optimizer = optim.Adam(actor_critic.parameters(), lr=args.base_lr, eps=args.epsilon) else: raise NotImplementedError else: raise NotImplementedError if args.cuda.startswith("True"): # 如果要使用cuda进行计算 actor_critic.cuda() # actor_critic = DistModule(actor_critic) # 判断是否是评估模式 if args.evaluate: print("evaluate mode") models = {} times = 1 prefix = "trained_models" directory = os.path.join(prefix, 'a2c', args.cnn, args.step_over) env = RwaGame(net_config=args.net, wave_num=args.wave_num, rou=args.rou, miu=args.miu, max_iter=args.max_iter, k=args.k, mode=args.mode, img_width=args.img_width, img_height=args.img_height, weight=weight, step_over=args.step_over) for model_file in reversed( sorted(os.listdir(directory), key=lambda item: int(item.split('.')[0]))): model_file = os.path.join(directory, model_file) print("evaluate model {}".format(model_file)) params = torch.load(model_file) actor_critic.load_state_dict(params['state_dict']) actor_critic.eval() models[params['update_i']] = {} print("model loading is finished") for t in range(times): total_reward, total_services, allocated_services = 0, 0, 0 obs, reward, done, info = env.reset() while not done: inp = Variable(torch.Tensor(obs).unsqueeze(0), volatile=True) # 禁止梯度更新 value, action, action_log_prob = actor_critic.act( inputs=inp, deterministic=True) # 确定性决策 action = action.data.numpy()[0] obs, reward, done, info = env.step(action=action[0]) total_reward += reward if reward == ARRIVAL_NEWPORT or reward == ARRIVAL_NOPORT: allocated_services += 1 if args.step_over.startswith('one_time'): if info: total_services += 1 elif args.step_over.startswith('one_service'): total_services += 1 else: raise NotImplementedError models[params['update_i']]['time'] = t models[params['update_i']]['reward'] = total_reward models[params['update_i']]['total_services'] = total_services 
    models[params['update_i']]['allocated_services'] = allocated_services
    models[params['update_i']]['bp'] = (
        total_services - allocated_services) / total_services
    # Print the simulation results
    # print("|updated model|test index|reward|bp|total services|allocated services|")
    # print("|:-----|:-----|:-----|:-----|:-----|:-----|")
    # for m in sorted(models):
    for i in range(times):
        print("|{up}|{id}|{r}|{bp:.4f}|{ts}|{als}|".format(
            up=params['update_i'],
            id=models[params['update_i']]['time'],
            r=models[params['update_i']]['reward'],
            bp=models[params['update_i']]['bp'],
            ts=models[params['update_i']]['total_services'],
            als=models[params['update_i']]['allocated_services']))
    return

    # Create the game environments
    envs = [
        make_env(net_config=args.net,
                 wave_num=args.wave_num,
                 k=args.k,
                 mode=args.mode,
                 img_width=args.img_width,
                 img_height=args.img_height,
                 weight=weight,
                 step_over=args.step_over) for _ in range(args.workers)
    ]
    envs = SubprocEnv(envs)
    # Create the container that stores and updates the rollout variables
    # produced while the environments are running
    rollout = RolloutStorage(num_steps=args.num_steps,
                             num_processes=args.workers,
                             obs_shape=obs_shape,
                             action_shape=action_shape)
    current_obs = torch.zeros(args.workers, *obs_shape)
    observation, _, _, _ = envs.reset()
    update_current_obs(current_obs, observation, channel_num)
    rollout.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.workers, 1])
    final_rewards = torch.zeros([args.workers, 1])

    if args.cuda.startswith("True"):
        current_obs = current_obs.cuda()
        rollout.cuda()

    start = time.time()
    log_start = time.time()
    total_services = 0  # number of service requests that arrived during the current log_interval
    allocated_services = 0  # number of service requests successfully allocated during the current log_interval
    update_begin = 0
    # Check whether we are resuming a previous training run
    if args.resume:
        pms = torch.load(args.resume)
        actor_critic.load_state_dict(pms['state_dict'])
        optimizer.load_state_dict(pms['optimizer'])
        update_begin = pms['update_i']
        print("resume process from update_i {}, with base_lr {}".format(
            update_begin, args.base_lr))

    for updata_i in range(update_begin, num_updates):
        update_start = time.time()
        for step in range(args.num_steps):
            # Select an action
            inp = Variable(rollout.observations[step], volatile=True)  # no gradient updates
            value, action, action_log_prob = actor_critic.act(
                inputs=inp, deterministic=False)
            # print(action)
            # Squeeze the action dimension and move the actions to the CPU.
            # Since no GPU is used here this has no real effect; it is kept as a reminder.
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            # Observe the reward and the next observation
            envs.step_async(cpu_actions)
            obs, reward, done, info = envs.step_wait()  # reward and done are both (n,) numpy.ndarray vectors
            # if reward == ARRIVAL_NEWPORT_NEWPORT or reward == ARRIVAL_NOPORT_NEWPORT or reward == ARRIVAL_NOPORT_NOPORT:
            #     allocated_services += 1
            print(reward)
            for i in reward:
                if i == ARRIVAL_NEWPORT or i == ARRIVAL_NOPORT:
                    allocated_services += 1
            # allocated_services += (reward==ARRIVAL_NEWPORT_NEWPORT or reward==ARRIVAL_NOPORT_NEWPORT or reward==ARRIVAL_NOPORT_NOPORT).any().sum()  # count the rewards that correspond to successful allocations
            # TODO: not resolved yet
            if args.step_over.startswith('one_service'):
                total_services += (info == True).sum()  # count how many service-arrival events this step contains
            # elif args.step_over.startswith('one_service'):
            #     total_services += args.workers
            else:
                raise NotImplementedError
            reward = torch.from_numpy(np.expand_dims(reward, 1)).float()
            episode_rewards += reward  # accumulate the reward

            # If an episode has finished, restart the accumulation of episode_rewards and
            # final_rewards, using the returned reward as the new starting value.
            masks = torch.FloatTensor([[0.0] if d else [1.0]
                                       for d in done])  # True --> 0, False --> 1
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            # if done[len(done)-1]:
            #     print('final number of ports at the end of the game:', envs.get_all_edges_port())
            if args.cuda.startswith("True"):
                masks = masks.cuda()
            # Add two dimensions to masks and multiply with current_obs, so the observations of
            # finished processes become 0 (an all-black image, i.e. the game-over frame).
            current_obs *= masks.unsqueeze(2).unsqueeze(2)
            update_current_obs(current_obs=current_obs,
                               obs=obs,
                               channel_num=channel_num)
            # Store the results of this step
            rollout.insert(step=step,
                           current_obs=current_obs,
                           action=action.data,
                           action_log_prob=action_log_prob.data,
                           value_pred=value.data,
                           reward=reward,
                           mask=masks)
        # TODO: forced stop
        # envs.close()
        # return
        # Be careful not to reuse variables defined in the for loop above; watch the naming
        # and use of the variables below.
        next_inp = Variable(rollout.observations[-1], volatile=True)  # no gradient updates
        next_value = actor_critic(next_inp)[0].data  # value estimate of the next step
        rollout.compute_returns(next_value=next_value,
                                use_gae=False,
                                gamma=args.gamma,
                                tau=None)
        if args.algo.startswith('a2c'):
            # A2C gradient update
            inps = Variable(rollout.observations[:-1].view(-1, *obs_shape))
            acts = Variable(rollout.actions.view(-1, action_shape))
            # print("a2cs's acts size is {}".format(acts.size()))
            value, action_log_probs, cls_entropy = actor_critic.evaluate_actions(
                inputs=inps, actions=acts)
            print(cls_entropy.data)
            # print("inputs' shape is {}".format(inps.size()))
            # print("value's shape is {}".format(value.size()))
            value = value.view(args.num_steps, args.workers, 1)
            # print("action_log_probs's shape is {}".format(action_log_probs.size()))
            action_log_probs = action_log_probs.view(args.num_steps, args.workers, 1)
            # Compute the losses
            advantages = Variable(rollout.returns[:-1]) - value
            value_loss = advantages.pow(2).mean()  # L2 loss / MSE loss
            action_loss = -(Variable(advantages.data) * action_log_probs).mean()
            total_loss = value_loss * args.value_loss_coef + action_loss - cls_entropy * args.entropy_coef

            optimizer.zero_grad()
            total_loss.backward()
            # Gradient clipping (see https://www.cnblogs.com/lindaxin/p/7998196.html)
            nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
            # average_gradients(actor_critic)
            optimizer.step()
        elif args.algo.startswith('ppo'):
            # PPO gradient update
            advantages = rollout.returns[:-1] - rollout.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
            for e in range(args.ppo_epoch):
                data_generator = rollout.feed_forward_generator(
                    advantages, args.num_mini_batch)
                for sample in data_generator:
                    observations_batch, actions_batch, \
                        return_batch, masks_batch, old_action_log_probs_batch, \
                        adv_targ = sample
                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, cls_entropy = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(actions_batch))
                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1, surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)
                    value_loss = (Variable(return_batch) - values).pow(2).mean()
                    # Apply the minibatch update (same pattern as the other PPO loops in this file)
                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     cls_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()
        # Post-update cleanup
        rollout.after_update()
        update_time = time.time() - update_start
        print("updates {} finished, cost time {}:{}".format(
            updata_i, update_time // 60, update_time % 60))
        # print("total services is {}".format(total_services))
        # Save the model
        if updata_i % args.save_interval == 0:
            save_path = os.path.join(args.save_dir, 'a2c')
            save_path = os.path.join(save_path, args.cnn)
            save_path = os.path.join(save_path, args.step_over)
            save_path = os.path.join(save_path, args.parameter)
            if os.path.exists(save_path) and os.path.isdir(save_path):
                pass
            else:
                os.makedirs(save_path)
            save_file = os.path.join(save_path, str(updata_i) + '.tar')
            save_content = {
                'update_i': updata_i,
                'state_dict': actor_critic.state_dict(),
                'optimizer': optimizer.state_dict(),
                'mean_reward': final_rewards.mean()
            }
            torch.save(save_content, save_file)
        # Write the log
        if updata_i % args.log_interval == 0:
            end = time.time()
            interval = end - log_start
            remaining_seconds = (num_updates - updata_i - 1) / args.log_interval * interval
            remaining_hours = int(remaining_seconds // 3600)
            remaining_minutes = int((remaining_seconds % 3600) / 60)
            total_num_steps = (updata_i + 1) * args.workers * args.num_steps
            blocked_services = total_services - allocated_services
            bp = blocked_services / total_services
            wave_port_num, total_port_num = envs.get_all_edges_port()
            wave_occ_sum, resource_utilization_rate = envs.get_resourceUtilization()
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, \
entropy {:.5f}, value loss {:.5f}, policy loss {:.8f}, remaining time {}:{}, blocking probability {}/{}={}, \
ports per wavelength {}, total ports {}, bandwidth occupancy {}, resource utilization {}".format(
                    updata_i, total_num_steps,
                    int(total_num_steps / (end - start)), final_rewards.mean(),
                    final_rewards.median(), final_rewards.min(),
                    final_rewards.max(), cls_entropy.data, value_loss.data,
                    action_loss.data, remaining_hours, remaining_minutes,
                    blocked_services, total_services, bp, wave_port_num,
                    total_port_num, wave_occ_sum, resource_utilization_rate))
            # raise NotImplementedError
            total_services = 0
            allocated_services = 0
            log_start = time.time()
    envs.close()
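# The script above calls a module-level update_current_obs(current_obs, obs, channel_num)
# helper whose definition is not shown in this file. Below is a minimal sketch of what such
# a frame-stacking update could look like, assuming the same convention as the
# update_current_obs closures in the other main() variants: the newest observation occupies
# the last channel_num channels and older frames are shifted toward the front. Only the
# name and signature come from the call sites above; the body is an illustrative assumption.
import numpy as np
import torch


def update_current_obs(current_obs, obs, channel_num):
    """Shift the stacked frames back by channel_num channels and append obs."""
    obs = torch.from_numpy(np.asarray(obs)).float()
    if current_obs.size(1) > channel_num:  # more than one frame is stacked
        current_obs[:, :-channel_num] = current_obs[:, channel_num:]
    current_obs[:, -channel_num:] = obs  # newest frame goes into the last slots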
    if env_info.local_done[0]:
        break
    if step == 0:
        print('ppo update')
    # do ppo update
    next_value = agent(obs)[0].data
    rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
    advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
    for e in range(args.ppo_epoch):
        data_generator = rollouts.feed_forward_generator(
            advantages, args.num_mini_batch)
        for sample in data_generator:
            observations_batch, actions_batch, \
                return_batch, masks_batch, old_action_log_probs_batch, \
                adv_targ = sample
            # Reshape to do in a single forward pass for all steps
            action_log_probs, dist_entropy, values = agent.evaluate_action(
                Variable(observations_batch), Variable(actions_batch))
            adv_targ = Variable(adv_targ)
            ratio = torch.exp(action_log_probs -
                              Variable(old_action_log_probs_batch))
            surr1 = ratio * adv_targ
            surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:] ) # I guess the obs_shape[0] is channel number if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # args.num_steps should be the length of interactions before each updating/training # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy( ) # returns are state value, sampled action, act_log_prob, hidden states # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert( step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks ) # so the rollout stores one batch of interaction sequences, each sequence has length of args.num_steps next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) # values should be values of observations, states are the hidden states used in rnn module, by pwang8 values = values.view( args.num_steps, args.num_processes, 1) # values are estimated current state values action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) # rollouts.returns are current "Action" value calculted following Bellmans' eqaution gamma * State_value(t+1) + reward(t) advantages = Variable( rollouts.returns[:-1] ) - values # This is also the definition of advantage value (action_value - state_value). value_loss = advantages.pow( 2).mean() # values are estimated current state_value(t) action_loss = -(Variable(advantages.data) * action_log_probs).mean() # If ACKTR is utilized, it is not only a different optimizer is used, they also added some new loss source if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -( values - Variable(sample_values.data) ).pow(2).mean( ) # don't know what is the difference between this and just randomly sample some noise fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[: -1] # calculating the advantage value of an action advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) # The difference from this ppo optimization to the optimization above is that: it updates params for # multiple epochs in ppo optimization. Because of this, it samples from the rollouts storage a minibatch # every time to calculate gradient. Sampling is conducted for optimization purpose. 
for e in range(args.ppo_epoch): if args.recurrent_policy: data_generator = rollouts.recurrent_generator( advantages, args.num_mini_batch) else: data_generator = rollouts.feed_forward_generator( advantages, args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) # For the 1st epoch of updating, I guess the action_log_probls is the same as old_action_log_probs_batch # because params of the NN have not been updated at that time. But later, in other updating epochs, # this ratio will generate some error. The old_action_log_probs_batch will not be updated during # these param updating epochs. # action_log_probs is the log prob of that action taken by the agent. So it's one value here, not # log_prob for all actions with certain input observation/state. By pwang8, Dec 31, 2017 adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) # compared to a2c, the major difference for ppo is that action_loss is calculated in controlled way value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass
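# The comments above describe how the PPO branch compares the current policy with the
# policy that collected the rollout. As a standalone illustration (not part of any of the
# training scripts in this file), the clipped surrogate L^CLIP can be computed from
# log-probabilities and advantages alone; the function and tensor names below are made up
# for the example.
import torch


def ppo_clip_loss(new_log_probs, old_log_probs, advantages, clip_param=0.2):
    """Pessimistic (clipped) PPO surrogate, returned as a loss to minimize."""
    ratio = torch.exp(new_log_probs - old_log_probs)  # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * advantages                        # unclipped objective
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    return -torch.min(surr1, surr2).mean()            # maximize objective -> negate and minimize


# With identical old and new log-probs the ratio is 1, so the loss reduces to
# -advantages.mean():
loss = ppo_clip_loss(torch.zeros(4), torch.zeros(4),
                     torch.tensor([1.0, -1.0, 2.0, 0.0]))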
def main(): num_updates = int( config.max_num_frames) // args.num_steps // config.a2c.num_processes n_times_is_converging = 0 print("num_updates: " + str(num_updates)) print("stop_learning: " + str(config.a2c.stop_learning)) # Initializing evaluation evaluator = Evaluator(evaluation_id) os.environ['OMP_NUM_THREADS'] = '1' envs = [ make_env(config.env_name, args.seed, i, evaluation_id) for i in range(config.a2c.num_processes) ] if config.a2c.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) obs_numel = reduce(operator.mul, obs_shape, 1) actor_critic = Policy(obs_numel, envs.action_space) # Maxime: log some info about the model and its size modelSize = 0 for p in actor_critic.parameters(): pSize = reduce(operator.mul, p.size(), 1) modelSize += pSize print(str(actor_critic)) print('Total model size: %d' % modelSize) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if config.a2c.algorithm == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif config.a2c.algorithm == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif config.a2c.algorithm == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, config.a2c.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(config.a2c.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([config.a2c.num_processes, 1]) final_rewards = torch.zeros([config.a2c.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() send_env_name = False for j in range(num_updates): if n_times_is_converging > 1: print("Converged...") break for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) evaluator.update(done, info) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) elif current_obs.dim() == 3: current_obs *= masks.unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if config.a2c.algorithm in ['a2c', 'acktr']: values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[:-1].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, config.a2c.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, config.a2c.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if config.a2c.algorithm == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if config.a2c.algorithm == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif config.a2c.algorithm == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): if args.recurrent_policy: data_generator = rollouts.recurrent_generator( advantages, args.num_mini_batch) else: data_generator = rollouts.feed_forward_generator( advantages, args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() save_dir = 
"../a2c_trained_model/" if j % config.a2c.save_model_interval == 0: save_path = save_dir try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] if j % config.a2c.save_evaluation_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps # if the environment name and the envelope state was not send if not send_env_name: evaluator.save(j, total_num_steps, final_rewards, dist_entropy, value_loss, action_loss, config.env_name, config.envelope) send_env_name = True else: evaluator.save(j, total_num_steps, final_rewards, dist_entropy, value_loss, action_loss) if evaluator.is_converging: n_times_is_converging += 1 else: n_times_is_converging = 0 print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if config.visdom and j % config.visdom_interval == 0: win = visdom_plot(total_num_steps, final_rewards.mean())
def main(): os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir, args.start_container) for i in range(args.num_processes) ] test_envs = [ make_env(args.env_name, args.seed, i, args.log_dir, args.start_container) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) test_envs = SubprocVecEnv(test_envs) else: envs = DummyVecEnv(envs) test_envs = DummyVecEnv(test_envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if args.saved_encoder_model: obs_shape = (args.num_stack, args.latent_space_size) obs_numel = reduce(operator.mul, obs_shape, 1) if len(obs_shape) == 3 and obs_numel > 1024: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_numel, envs.action_space) modelSize = 0 for p in actor_critic.parameters(): pSize = reduce(operator.mul, p.size(), 1) modelSize += pSize print(str(actor_critic)) print('Total model size: %d' % modelSize) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.resume_experiment: print("\n############## Loading saved model ##############\n") actor_critic, ob_rms = torch.load( os.path.join(save_path, args.env_name + args.save_tag + ".pt")) tr.load(os.path.join(log_path, args.env_name + args.save_tag + ".p")) if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) print(obs_shape) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) rollouts_test = RolloutStorage(args.num_steps_test, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) current_obs_test = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs, test=False): shape_dim0 = envs.observation_space.shape[0] if args.saved_encoder_model: shape_dim0 = 1 obs, _ = vae.encode(Variable(torch.cuda.FloatTensor(obs))) obs = obs.data.cpu().numpy() obs = torch.from_numpy(obs).float() if not test: if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs else: if args.num_stack > 1: current_obs_test[:, : -shape_dim0] = current_obs_test[:, shape_dim0:] current_obs_test[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. 
episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) reward_avg = 0 if args.cuda: current_obs = current_obs.cuda() current_obs_test = current_obs_test.cuda() rollouts.cuda() rollouts_test.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Observation, reward and next obs obs, reward, done, info = envs.step(cpu_actions) # Maxime: clip the reward within [0,1] for more reliable training # This code deals poorly with large reward values reward = np.clip(reward, a_min=0, a_max=None) / 400 reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks tr.episodes_done += args.num_processes - masks.sum() if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) tr.iterations_done += 1 if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): if args.recurrent_policy: data_generator = rollouts.recurrent_generator( advantages, args.num_mini_batch) else: data_generator = rollouts.feed_forward_generator( advantages, args.num_mini_batch) for sample in 
data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save( save_model, os.path.join(save_path, args.env_name + args.save_tag + ".pt")) total_test_reward_list = [] step_test_list = [] for _ in range(args.num_tests): test_obs = test_envs.reset() update_current_obs(test_obs, test=True) rollouts_test.observations[0].copy_(current_obs_test) step_test = 0 total_test_reward = 0 while step_test < args.num_steps_test: value_test, action_test, action_log_prob_test, states_test = actor_critic.act( Variable(rollouts_test.observations[step_test], volatile=True), Variable(rollouts_test.states[step_test], volatile=True), Variable(rollouts_test.masks[step_test], volatile=True)) cpu_actions_test = action_test.data.squeeze( 1).cpu().numpy() # Observation, reward and next obs obs_test, reward_test, done_test, info_test = test_envs.step( cpu_actions_test) # masks here doesn't really matter, but still masks_test = torch.FloatTensor( [[0.0] if done_test_ else [1.0] for done_test_ in done_test]) # Maxime: clip the reward within [0,1] for more reliable training # This code deals poorly with large reward values reward_test = np.clip(reward_test, a_min=0, a_max=None) / 400 total_test_reward += reward_test[0] reward_test = torch.from_numpy( np.expand_dims(np.stack(reward_test), 1)).float() update_current_obs(obs_test) rollouts_test.insert(step_test, current_obs_test, states_test.data, action_test.data, action_log_prob_test.data,\ value_test.data, reward_test, masks_test) step_test += 1 if done_test: break #rollouts_test.reset() # Need to reinitialise with .cuda(); don't forget total_test_reward_list.append(total_test_reward) step_test_list.append(step_test) append_to(tr.test_reward, tr, sum(total_test_reward_list) / args.num_tests) append_to(tr.test_episode_len, tr, sum(step_test_list) / args.num_tests) logger.log_scalar_rl( "test_reward", tr.test_reward[0], args.sliding_wsize, [tr.episodes_done, tr.global_steps_done, tr.iterations_done]) logger.log_scalar_rl( "test_episode_len", tr.test_episode_len[0], args.sliding_wsize, [tr.episodes_done, tr.global_steps_done, tr.iterations_done]) # Saving all the MyContainer variables tr.save( os.path.join(log_path, args.env_name + args.save_tag + ".p")) if j % args.log_interval == 0: reward_avg = 0.99 * reward_avg + 0.01 * final_rewards.mean() end = time.time() tr.global_steps_done = (j + 1) * args.num_processes * args.num_steps print( 
"Updates {}, num timesteps {}, FPS {}, running avg reward {:.3f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, tr.global_steps_done, int(tr.global_steps_done / (end - start)), reward_avg, dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) append_to(tr.pg_loss, tr, action_loss.data[0]) append_to(tr.val_loss, tr, value_loss.data[0]) append_to(tr.entropy_loss, tr, dist_entropy.data[0]) append_to(tr.train_reward_avg, tr, reward_avg) logger.log_scalar_rl( "train_pg_loss", tr.pg_loss[0], args.sliding_wsize, [tr.episodes_done, tr.global_steps_done, tr.iterations_done]) logger.log_scalar_rl( "train_val_loss", tr.val_loss[0], args.sliding_wsize, [tr.episodes_done, tr.global_steps_done, tr.iterations_done]) logger.log_scalar_rl( "train_entropy_loss", tr.entropy_loss[0], args.sliding_wsize, [tr.episodes_done, tr.global_steps_done, tr.iterations_done]) logger.log_scalar_rl( "train_reward_avg", tr.train_reward_avg[0], args.sliding_wsize, [tr.episodes_done, tr.global_steps_done, tr.iterations_done]) """ print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". format( j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0]) ) """ if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass