def __init__(self, envs, hparams):
    self.use_gae = hparams['use_gae']
    self.gamma = hparams['gamma']
    self.tau = hparams['tau']
    self.obs_shape = hparams['obs_shape']
    self.num_steps = hparams['num_steps']
    self.num_processes = hparams['num_processes']
    self.value_loss_coef = hparams['value_loss_coef']
    self.entropy_coef = hparams['entropy_coef']
    self.cuda = hparams['cuda']
    self.opt = hparams['opt']
    self.grad_clip = hparams['grad_clip']

    # Pick the policy network: dropout CNN, plain CNN for image observations,
    # or an MLP for low-dimensional observations.
    if hparams['dropout']:
        print('CNNPolicy_dropout2')
        actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
        # actor_critic = CNNPolicy_dropout(self.obs_shape[0], envs.action_space)
    elif len(envs.observation_space.shape) == 3:
        print('CNNPolicy2')
        actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
        # actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    self.action_shape = action_shape

    # RolloutStorage keeps a state buffer of shape [steps, processes, obs];
    # steps is used to compute the expected reward.
    rollouts = RolloutStorage(self.num_steps, self.num_processes,
                              self.obs_shape, envs.action_space)

    if self.cuda:
        actor_critic.cuda()
        rollouts.cuda()

    if self.opt == 'rms':
        self.optimizer = optim.RMSprop(params=actor_critic.parameters(), lr=hparams['lr'],
                                       eps=hparams['eps'], alpha=hparams['alpha'])
    elif self.opt == 'adam':
        self.optimizer = optim.Adam(params=actor_critic.parameters(), lr=hparams['lr'],
                                    eps=hparams['eps'])
    elif self.opt == 'sgd':
        self.optimizer = optim.SGD(params=actor_critic.parameters(), lr=hparams['lr'],
                                   momentum=hparams['mom'])
    else:
        print('no opt specified')

    self.actor_critic = actor_critic
    self.rollouts = rollouts
    self.rollouts_list = RolloutStorage_list()
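# A minimal usage sketch (not part of the original file): the dict below lists the
# keys this constructor reads; the concrete values and the class name "Agent" are
# assumptions for illustration only.
example_hparams = {
    'use_gae': True, 'gamma': 0.99, 'tau': 0.95,
    'obs_shape': (4, 84, 84), 'num_steps': 5, 'num_processes': 16,
    'value_loss_coef': 0.5, 'entropy_coef': 0.01,
    'cuda': False, 'opt': 'adam', 'grad_clip': 0.5,
    'dropout': False, 'lr': 7e-4, 'eps': 1e-5, 'alpha': 0.99, 'mom': 0.9,
}
# agent = Agent(envs, example_hparams)  # hypothetical class name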
def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    envs = UsbCamEnv(ENV_IMG_W, ENV_IMG_H, env_done_reward)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
    action_shape = envs.action_space.shape[0]

    print('+++++++++++++++++++++++++++++++++++++')
    print('obs_shape:', obs_shape)
    print('action_shape:', action_shape)
    print('+++++++++++++++++++++++++++++++++++++')

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space)
    current_state = torch.zeros(args.num_processes, *obs_shape)

    def update_current_state(state):
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)
    rollouts.states[0].copy_(current_state)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    old_model = copy.deepcopy(actor_critic)

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.cpu().numpy()

            # Observe reward and next state
            state, reward, done, info = envs.step(cpu_actions)
            print('%3d [%3d %3d %3d %3d] %3d' % (
                step,
                int(envs.convert_2_real_action(cpu_actions)[0, 0]),
                int(envs.convert_2_real_action(cpu_actions)[0, 1]),
                int(envs.convert_2_real_action(cpu_actions)[0, 2]),
                int(envs.convert_2_real_action(cpu_actions)[0, 3]),
                reward[0]))

            if reward[0] >= search_done_reward:
                sys.exit()

            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            update_current_state(state)
            rollouts.insert(step, current_state, action.data, value.data, reward, masks)

        next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

        old_model.load_state_dict(actor_critic.state_dict())
        if hasattr(actor_critic, 'obs_filter'):
            old_model.obs_filter = actor_critic.obs_filter

        for _ in range(args.ppo_epoch):
            sampler = BatchSampler(
                SubsetRandomSampler(range(args.num_processes * args.num_steps)),
                args.batch_size * args.num_processes, drop_last=False)
            for indices in sampler:
                indices = torch.LongTensor(indices)
                if args.cuda:
                    indices = indices.cuda()

                states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices]
                actions_batch = rollouts.actions.view(-1, action_shape)[indices]
                return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

                # Reshape to do in a single forward pass for all steps
                values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                    Variable(states_batch), Variable(actions_batch))
                _, old_action_log_probs, _ = old_model.evaluate_actions(
                    Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True))

                ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data))
                adv_targ = Variable(advantages.view(-1, 1)[indices])
                surr1 = ratio * adv_targ
                surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                action_loss = -torch.min(surr1, surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)
                value_loss = (Variable(return_batch) - values).pow(2).mean()

                optimizer.zero_grad()
                (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                optimizer.step()

        rollouts.states[0].copy_(rollouts.states[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            print("Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, "
                  "min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                  format(j, j * args.num_processes * args.num_steps,
                         final_rewards.mean(), final_rewards.median(),
                         final_rewards.min(), final_rewards.max(),
                         -dist_entropy.data[0], value_loss.data[0], action_loss.data[0]))
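# Hedged sketch of what rollouts.compute_returns(next_value, use_gae, gamma, tau)
# computes when use_gae is enabled: standard GAE(lambda) returns. The tensor layout
# (rewards/masks of shape [num_steps, num_processes, 1], masks[t] == 0 when the
# episode ended at step t) is an assumption based on how the buffers are used above.
import torch

def compute_gae_returns(rewards, values, masks, next_value, gamma, tau):
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)
    for step in reversed(range(rewards.size(0))):
        # One-step TD error, zeroed across episode boundaries by the mask.
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns[step] = gae + values[step]
    return returns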
def main(): print("#######") print("WARNING: All rewards are not clipped or normalized ") print("#######") os.environ['OMP_NUM_THREADS'] = '1' envs = rafiki.Envs(args.num_processes, args.num_models, args.policy, args.beta, args.obs_size, args.max_latency, args.tau, args.cycle_len) obs_shape = envs.observation_space.shape actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() info_set = Info(args) for j in range(num_updates): for step in range(args.num_steps): logger.info('------------%d----------------' % j) # Sample actions with torch.no_grad(): action, probs, action_log_prob = actor_critic.act( Variable(rollouts.observations[step])) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs logger.info(probs) obs, reward, info = envs.step(cpu_actions) info_set.insert(info) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() update_current_obs(obs) rollouts.insert(step, current_obs, action.data, action_log_prob.data, reward) if args.algo in ['a2c', 'ppo']: action_log_probs, dist_entropy = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape))) R = rollouts.rewards.detach() optimizer.zero_grad() policy_loss = -R.reshape(args.num_steps, args.num_processes).mul(action_log_probs) policy_loss = sum(policy_loss) / len(policy_loss) policy_loss.backward() # nn.utils.clip_grad_norm_(actor_critic.parameters(), args.max_grad_norm) optimizer.step() with torch.no_grad(): action, probs, action_log_prob = actor_critic.act( Variable(rollouts.observations[-1])) logger.info(probs) rollouts.after_update() if j % args.log_interval == 0: total_num_steps = (j + 1) * args.num_processes * args.num_steps print("Updates {}, num timesteps {}, reward {}, policy loss {}". format(j, total_num_steps, R.data, policy_loss.reshape(-1).data)) logger.info(args) info_set.show()
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() viz_1 = Visdom() win = None win1 = None env_name_1 = 'HalfCheetahSmallFoot-v0' args.env_name = 'HalfCheetahSmallLeg-v0' envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] envs_1 = [ make_env(env_name_1, args.seed, i, args.log_dir_1) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) envs_1 = SubprocVecEnv(envs_1) else: envs = DummyVecEnv(envs) envs_1 = DummyVecEnv(envs_1) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) envs_1 = VecNormalize(envs_1) #same for both tasks obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) actor_critic = MLPPolicy(obs_shape[0], envs.action_space) actor_critic_1 = MLPPolicy(obs_shape[0], envs_1.action_space) #same for both tasks action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() actor_critic_1.cuda() optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) optimizer_1 = optim.RMSprop(actor_critic_1.parameters(), args.lr, eps=args.eps, alpha=args.alpha) #Different for both tasks rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) rollouts_1 = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs_1.action_space, actor_critic_1.state_size) current_obs_1 = torch.zeros(args.num_processes, *obs_shape) #Different update functions def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs def update_current_obs_1(obs): shape_dim0 = envs_1.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs_1[:, :-shape_dim0] = current_obs_1[:, shape_dim0:] current_obs_1[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) obs_1 = envs_1.reset() update_current_obs_1(obs_1) rollouts.observations[0].copy_(current_obs) rollouts_1.observations[0].copy_(current_obs_1) episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) episode_rewards_1 = torch.zeros([args.num_processes, 1]) final_rewards_1 = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() current_obs_1 = current_obs_1.cuda() rollouts_1.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions from branch 1 value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= 
masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) #Sample actions from branch 2 value_1, action_1, action_log_prob_1, states_1 = actor_critic_1.act( Variable(rollouts_1.observations[step], volatile=True), Variable(rollouts_1.states[step], volatile=True), Variable(rollouts_1.masks[step], volatile=True)) cpu_actions_1 = action_1.data.squeeze(1).cpu().numpy() obs_1, reward_1, done_1, info_1 = envs_1.step(cpu_actions_1) reward_1 = torch.from_numpy(np.expand_dims(np.stack(reward_1), 1)).float() episode_rewards_1 += reward_1 masks_1 = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done_1]) final_rewards_1 *= masks_1 final_rewards_1 += (1 - masks_1) * episode_rewards_1 episode_rewards_1 *= masks_1 if args.cuda: masks_1 = masks_1.cuda() if current_obs_1.dim() == 4: current_obs_1 *= masks_1.unsqueeze(2).unsqueeze(2) else: current_obs_1 *= masks_1 update_current_obs_1(obs_1) rollouts_1.insert(step, current_obs_1, states_1.data, action_1.data, action_log_prob_1.data, value_1.data, reward_1, masks_1) #Update for branch 1 next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() #share params branch 1 -> branch 2 actor_critic_1.a_fc1.weight.data = copy.deepcopy( actor_critic.a_fc1.weight.data) actor_critic_1.a_fc1.bias.data = copy.deepcopy( actor_critic.a_fc1.bias.data) actor_critic_1.v_fc1.weight.data = copy.deepcopy( actor_critic.v_fc1.weight.data) actor_critic_1.v_fc1.bias.data = copy.deepcopy( actor_critic.v_fc1.bias.data) #Update for branch 2 next_value_1 = actor_critic_1( Variable(rollouts_1.observations[-1], volatile=True), Variable(rollouts_1.states[-1], volatile=True), Variable(rollouts_1.masks[-1], volatile=True))[0].data rollouts_1.compute_returns(next_value_1, args.use_gae, args.gamma, args.tau) values_1, action_log_probs_1, dist_entropy_1, states_1 = actor_critic_1.evaluate_actions( Variable(rollouts_1.observations[:-1].view(-1, *obs_shape)), Variable(rollouts_1.states[0].view(-1, actor_critic_1.state_size)), Variable(rollouts_1.masks[:-1].view(-1, 1)), Variable(rollouts_1.actions.view(-1, action_shape))) values_1 = values_1.view(args.num_steps, args.num_processes, 1) action_log_probs_1 = action_log_probs_1.view(args.num_steps, args.num_processes, 1) advantages_1 = Variable(rollouts_1.returns[:-1]) - values_1 value_loss_1 = advantages_1.pow(2).mean() action_loss_1 = -(Variable(advantages_1.data) * action_log_probs_1).mean() 
optimizer_1.zero_grad() (value_loss_1 * args.value_loss_coef + action_loss_1 - dist_entropy_1 * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic_1.parameters(), args.max_grad_norm) optimizer_1.step() rollouts_1.after_update() #share params branch 2 -> branch 1 actor_critic.a_fc1.weight.data = copy.deepcopy( actor_critic_1.a_fc1.weight.data) actor_critic.a_fc1.bias.data = copy.deepcopy( actor_critic_1.a_fc1.bias.data) actor_critic.v_fc1.weight.data = copy.deepcopy( actor_critic_1.v_fc1.weight.data) actor_critic.v_fc1.bias.data = copy.deepcopy( actor_critic_1.v_fc1.bias.data) if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo, args.env_name + '_' + env_name_1) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic save_model = actor_critic_1 if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model_1 = copy.deepcopy(actor_critic_1).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] save_model_1 = [ save_model_1, hasattr(envs_1, 'ob_rms') and envs_1.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) torch.save(save_model_1, os.path.join(save_path, env_name_1 + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) print( "Updates_1 {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards_1.mean(), final_rewards_1.median(), final_rewards_1.min(), final_rewards_1.max(), dist_entropy_1.data[0], value_loss_1.data[0], action_loss_1.data[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) win1 = visdom_plot(viz_1, win1, args.log_dir_1, env_name_1, args.algo) except IOError: pass
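# Hedged sketch of the first-layer parameter sharing done between the two branches
# above, factored into a helper. The attribute names a_fc1/v_fc1 come from the
# MLPPolicy used in this file; the function name is an assumption.
def share_first_layer(src_policy, dst_policy):
    """Copy the actor/critic first-layer weights and biases from src_policy to dst_policy."""
    for name in ('a_fc1', 'v_fc1'):
        getattr(dst_policy, name).weight.data.copy_(getattr(src_policy, name).weight.data)
        getattr(dst_policy, name).bias.data.copy_(getattr(src_policy, name).bias.data)

# e.g. share_first_layer(actor_critic, actor_critic_1)  # branch 1 -> branch 2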
def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    observation_space = np.zeros((3, 1))
    action_space = np.zeros((4, 1))

    obs_shape = np.shape(observation_space)
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    # if observation_space == 3:
    #     actor_critic = CNNPolicy(obs_shape[0], action_space, args.recurrent_policy)
    # else:
    #     assert not args.recurrent_policy, \
    #         "Recurrent policy is not implemented for the MLP controller"
    #     actor_critic = MLPPolicy(obs_shape[0], action_space)
    actor_critic = MLPPolicy(obs_shape[0], action_space)
    action_shape = np.shape(action_space)[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = np.shape(observation_space)[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs.reshape(current_obs[:, -shape_dim0:].shape[1:])

    obs = reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            print(action)

            # Observe reward and next obs
            obs, reward, done, info = envstep(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True),
                                  Variable(rollouts.states[-1], volatile=True),
                                  Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()
            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()

        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                        return_batch, masks_batch, old_action_log_probs_batch, \
                        adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch),
                        Variable(states_batch),
                        Variable(masks_batch),
                        Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)
                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None]
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, "
                  "min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                  format(j, total_num_steps, int(total_num_steps / (end - start)),
                         final_rewards.mean(), final_rewards.median(),
                         final_rewards.min(), final_rewards.max(),
                         dist_entropy.data[0], value_loss.data[0], action_loss.data[0]))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo)
            except IOError:
                pass
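# Hedged sketch of the PPO clipped-surrogate update used in the 'ppo' branch above,
# written as a standalone function over plain tensors; the function and argument
# names are assumptions for illustration.
import torch

def ppo_losses(new_log_probs, old_log_probs, advantages, returns, values, clip_param):
    ratio = torch.exp(new_log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    action_loss = -torch.min(surr1, surr2).mean()   # pessimistic surrogate (L^CLIP)
    value_loss = (returns - values).pow(2).mean()   # squared-error value loss
    return action_loss, value_loss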