def evaluate(self, j, dist_entropy, value_loss, action_loss, model_file=None):
    end = time.time()
    total_num_steps = (j + 1) * self.args.num_processes * self.args.num_steps
    print("Updates {}, num timesteps {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
          format(j, total_num_steps,
                 self.final_rewards.mean(),
                 self.final_rewards.median(),
                 self.final_rewards.min(),
                 self.final_rewards.max(),
                 dist_entropy.data[0],
                 value_loss.data[0],
                 action_loss.data[0]))
    try:
        # Sometimes monitor doesn't properly flush the outputs
        self.win = visdom_plot(self.viz, self.win, self.args.log_dir,
                               self.args.env_name, self.args.algo)
    except IOError:
        pass
def train(self, num_updates):
    start = time.time()
    for j in range(num_updates):
        dist_entropy, value_loss, action_loss = self.run()

        if j % self.args.save_interval == 0 and self.args.save_dir != "":
            save_path = os.path.join(self.args.save_dir, self.args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = self.actor_critic
            if self.args.cuda:
                save_model = copy.deepcopy(self.actor_critic).cpu()
            save_model = [save_model,
                          hasattr(self.envs, 'ob_rms') and self.envs.ob_rms or None]
            torch.save(save_model, os.path.join(save_path, self.args.env_name + ".pt"))

        if j % self.args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * self.args.num_processes * self.args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         self.final_rewards.mean(),
                         self.final_rewards.median(),
                         self.final_rewards.min(),
                         self.final_rewards.max(),
                         dist_entropy.data[0],
                         value_loss.data[0],
                         action_loss.data[0]))

        if self.args.vis and j % self.args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                self.win = visdom_plot(self.viz, self.win, self.args.log_dir,
                                       self.args.env_name, self.args.algo)
            except IOError:
                pass
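# The checkpoint written above is a plain [policy, ob_rms] list saved with
# torch.save. A minimal loading sketch (hypothetical helper, assuming the same
# save layout; the policy object itself is pickled, so the class definitions
# must be importable when loading):
def load_checkpoint(save_dir, algo, env_name, vec_envs=None):
    """Load a [policy, ob_rms] pair saved by the training loop above."""
    path = os.path.join(save_dir, algo, env_name + ".pt")
    policy, ob_rms = torch.load(path, map_location=lambda storage, loc: storage)
    # If the target envs are wrapped in VecNormalize, reuse the stored
    # observation statistics so inputs match what the policy was trained on.
    if vec_envs is not None and ob_rms is not None and hasattr(vec_envs, 'ob_rms'):
        vec_envs.ob_rms = ob_rms
    return policy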
                  total_num_steps,
                  int(total_num_steps / (end - start)),  # FPS
                  final_rewards.mean(),
                  final_rewards.median(),
                  final_rewards.min(),
                  final_rewards.max(),
                  dist_entropy, value_loss, action_loss))

        # print('j: ', j, ' args.vis: ', args.vis, ' args.vis_interval: ', args.vis_interval)
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo,
                                  args.num_frames, bullet=True)
            except IOError:
                pass

    save_path = os.path.join(args.save_dir, args.algo)
    with open(save_path + '/Ave_Reward_per_epi(' + args.algo + ').txt', 'w') as f:
        for s in episodic_reward_graph:
            f.write(str(s) + '\n')

    print('Finish!!')
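# The per-episode averages written above are one float per line. A minimal
# sketch of reading the file back for offline plotting (assumes the same
# save_path / algo naming used above; matplotlib is an added dependency here):
import matplotlib.pyplot as plt

def plot_episode_rewards(save_path, algo):
    fname = save_path + '/Ave_Reward_per_epi(' + algo + ').txt'
    with open(fname) as f:
        rewards = [float(line) for line in f if line.strip()]
    plt.plot(range(len(rewards)), rewards)
    plt.xlabel('episode')
    plt.ylabel('average reward')
    plt.savefig(save_path + '/Ave_Reward_per_epi(' + algo + ').png')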
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         len(episode_rewards),
                         np.mean(episode_rewards),
                         np.median(episode_rewards),
                         np.min(episode_rewards),
                         np.max(episode_rewards),
                         dist_entropy, value_loss, action_loss))

        if (args.eval_interval is not None
                and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep,
                                      device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes, actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
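# The evaluation loop above is the same pattern several scripts in this file
# repeat: freeze the VecNormalize statistics, roll the policy out
# deterministically, and average the monitor-reported episode returns. A
# minimal sketch factoring it into a helper (hypothetical function, assuming
# the make_vec_envs / get_vec_normalize utilities used above):
def evaluate_policy(actor_critic, train_envs, args, eval_log_dir, device, num_episodes=10):
    eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes,
                              args.num_processes, args.gamma, eval_log_dir,
                              args.add_timestep, device, True)
    vec_norm = get_vec_normalize(eval_envs)
    if vec_norm is not None:
        vec_norm.eval()  # stop updating running stats during evaluation
        vec_norm.ob_rms = get_vec_normalize(train_envs).ob_rms

    rewards = []
    obs = eval_envs.reset()
    hidden = torch.zeros(args.num_processes,
                         actor_critic.recurrent_hidden_state_size, device=device)
    masks = torch.zeros(args.num_processes, 1, device=device)
    while len(rewards) < num_episodes:
        with torch.no_grad():
            _, action, _, hidden = actor_critic.act(obs, hidden, masks,
                                                    deterministic=True)
        obs, _, done, infos = eval_envs.step(action)
        masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done]).to(device)
        rewards.extend(info['episode']['r'] for info in infos if 'episode' in info)

    eval_envs.close()
    return np.mean(rewards)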
def main(): print("#######") print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards") print("#######") os.environ['OMP_NUM_THREADS'] = '1' #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" #os.environ['CUDA_VISIBLE_DEVICES'] = "9" if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = [make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep) for i in range(args.num_processes)] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space,args.hid_size, args.feat_size,args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.use_cell: hs = HistoryCell(obs_shape[0], actor_critic.feat_size, 2*actor_critic.hidden_size, 1) ft = FutureCell(obs_shape[0], actor_critic.feat_size, 2 * actor_critic.hidden_size, 1) else: hs = History(obs_shape[0], actor_critic.feat_size, actor_critic.hidden_size, 2, 1) ft = Future(obs_shape[0], actor_critic.feat_size, actor_critic.hidden_size, 2, 1) if args.cuda: actor_critic=actor_critic.cuda() hs = hs.cuda() ft = ft.cuda() if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, hs,ft,args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, args.hf_loss_coef,ac_lr=args.lr,hs_lr=args.lr,ft_lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, num_processes=args.num_processes, num_steps=args.num_steps, use_cell=args.use_cell, lenhs=args.lenhs,lenft=args.lenft, plan=args.plan, ac_intv=args.ac_interval, hs_intv=args.hs_interval, ft_intv=args.ft_interval ) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size, feat_size=512) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() rec_x = [] rec_y = [] file = open('./rec/' + args.env_name + '_' + args.method_name + '.txt', 'w') hs_info = torch.zeros(args.num_processes, 2 * actor_critic.hidden_size).cuda() hs_ind = torch.IntTensor(args.num_processes, 1).zero_() epinfobuf = deque(maxlen=100) start_time = time.time() for j in range(num_updates): print('begin sample, time {}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)))) for step in range(args.num_steps): # Sample actions with torch.no_grad(): rollouts.feat[step]=actor_critic.get_feat(rollouts.observations[step]) if 
args.use_cell: for i in range(args.num_processes): h = torch.zeros(1, 2 * actor_critic.hid_size).cuda() c = torch.zeros(1, 2 * actor_critic.hid_size).cuda() start_ind = max(hs_ind[i],step+1-args.lenhs) for ind in range(start_ind,step+1): h,c=hs(rollouts.feat[ind,i].unsqueeze(0),h,c) hs_info[i,:]=h.view(1,2*actor_critic.hid_size) del h,c gc.collect() else: for i in range(args.num_processes): start_ind = max(hs_ind[i], step + 1 - args.lenhs) hs_info[i,:]=hs(rollouts.feat[start_ind:step+1,i]) hidden_feat=actor_critic.cat(rollouts.feat[step],hs_info) value, action, action_log_prob, states = actor_critic.act( hidden_feat, rollouts.states[step]) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, infos = envs.step(cpu_actions) for info in infos: maybeepinfo = info.get('episode') if maybeepinfo: epinfobuf.extend([maybeepinfo['r']]) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) hs_ind = ((1-masks)*(step+1)+masks*hs_ind.float()).int() if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, hs_ind,states.data, action.data, action_log_prob.data, value.data, reward, masks) with torch.no_grad(): rollouts.feat[-1] = actor_critic.get_feat(rollouts.observations[-1]) if args.use_cell: for i in range(args.num_processes): h = torch.zeros(1, 2 * actor_critic.hid_size).cuda() c = torch.zeros(1, 2 * actor_critic.hid_size).cuda() start = max(hs_ind[i], step + 1 - args.lenhs) for ind in range(start, step + 1): h, c = hs(rollouts.feat[ind, i].unsqueeze(0), h, c) hs_info[i, :] = h.view(1, 2 * actor_critic.hid_size) del h,c else: for i in range(args.num_processes): start_ind = max(hs_ind[i], step + 1 - args.lenhs) hs_info[i, :] = hs(rollouts.feat[start_ind:step + 1, i]) hidden_feat = actor_critic.cat(rollouts.feat[-1],hs_info) next_value = actor_critic.get_value(hidden_feat).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) rollouts.compute_ft_ind() print('begin update, time {}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)))) value_loss, action_loss, dist_entropy = agent.update(rollouts) print('end update, time {}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)))) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps v_mean,v_median,v_min,v_max = safe(epinfobuf) print("Updates {}, num timesteps {},time {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". 
                  format(j, total_num_steps,
                         time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                         int(total_num_steps / (end - start_time)),
                         v_mean, v_median, v_min, v_max,
                         dist_entropy, value_loss, action_loss))

            if not np.isnan(v_mean):  # only record valid (non-NaN) episode means
                rec_x.append(total_num_steps)
                rec_y.append(v_mean)
                file.write(str(total_num_steps))
                file.write(' ')
                file.write(str(v_mean))
                file.write('\n')

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass

    plot_line(rec_x, rec_y, './imgs/' + args.env_name + '_' + args.method_name + '.png',
              args.method_name, args.env_name, args.num_frames)
    file.close()
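# safe() above is referenced but not defined in this chunk. A plausible sketch,
# assuming it only summarizes the buffered episode rewards and guards against
# the empty case (this is a guess at the helper, not the original implementation):
def safe(epinfobuf):
    """Return mean/median/min/max of buffered episode rewards, or NaNs if empty."""
    if len(epinfobuf) == 0:
        return np.nan, np.nan, np.nan, np.nan
    arr = np.asarray(epinfobuf, dtype=np.float64)
    return arr.mean(), np.median(arr), arr.min(), arr.max()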
def main(): print("###############################################################") print("#################### VISDOOM LEARNER START ####################") print("###############################################################") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None global envs envs = VecEnv( [make_env(i, args.config_path) for i in range(args.num_processes)], logging=True, log_dir=args.log_dir) obs_shape = envs.observation_space_shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if args.algo == 'a2c' or args.algo == 'acktr': actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape) elif args.algo == 'a2t': source_models = [] files = glob.glob(os.path.join(args.source_models_path, '*.pt')) for file in files: print(file, 'loading model...') source_models.append(torch.load(file)) actor_critic = A2TPolicy(obs_shape[0], envs.action_space_shape, source_models) elif args.algo == 'resnet': # args.num_stack = 3 actor_critic = ResnetPolicy(obs_shape[0], envs.action_space_shape) action_shape = 1 if args.cuda: actor_critic.cuda() if args.algo == 'a2c' or args.algo == 'resnet': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'a2t': a2t_params = [p for p in actor_critic.parameters() if p.requires_grad] optimizer = optim.RMSprop(a2t_params, args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space_shape) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space_shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action = actor_critic.act( Variable(rollouts.observations[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) # print ('Actions:', cpu_actions, 'Rewards:', reward) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, action.data, value.data, reward, masks) next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True))[0].data if hasattr(actor_critic, 'obs_filter'): actor_critic.obs_filter.update(rollouts.observations[:-1].view( -1, *obs_shape)) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr', 'a2t', 'resnet']: values, action_log_probs, dist_entropy = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c' or args.algo == 'resnet': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) elif args.algo == 'a2t': nn.utils.clip_grad_norm(a2t_params, args.max_grad_norm) optimizer.step() rollouts.observations[0].copy_(rollouts.observations[-1]) if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: envs.log() end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), -dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, 'VizDoom', args.algo) except IOError: pass envs.close() time.sleep(5)
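# The episode_rewards / final_rewards bookkeeping above (shared by several of
# the training loops in this file) uses masks that are 0.0 where an environment
# just finished and 1.0 otherwise: completed returns are frozen into
# final_rewards and the running accumulator is reset. A self-contained sketch
# with dummy data, mirroring the statements used in the loop:
def _demo_reward_bookkeeping():
    num_processes = 3
    episode_rewards = torch.zeros(num_processes, 1)
    final_rewards = torch.zeros(num_processes, 1)

    # one environment step's rewards and done flags (dummy values)
    reward = torch.tensor([[1.0], [2.0], [0.5]])
    done = [False, True, False]

    episode_rewards += reward
    masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done])
    final_rewards *= masks                          # keep totals only where still running
    final_rewards += (1 - masks) * episode_rewards  # freeze totals of finished episodes
    episode_rewards *= masks                        # reset accumulators of finished episodes
    return final_rewards, episode_rewards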
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() viz_1 = Visdom() win = None win1 = None env_name_1 = 'HalfCheetahSmallFoot-v0' args.env_name = 'HalfCheetahSmallLeg-v0' envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] envs_1 = [ make_env(env_name_1, args.seed, i, args.log_dir_1) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) envs_1 = SubprocVecEnv(envs_1) else: envs = DummyVecEnv(envs) envs_1 = DummyVecEnv(envs_1) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) envs_1 = VecNormalize(envs_1) #same for both tasks obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) actor_critic = MLPPolicy(obs_shape[0], envs.action_space) actor_critic_1 = MLPPolicy(obs_shape[0], envs_1.action_space) #same for both tasks action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() actor_critic_1.cuda() optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) optimizer_1 = optim.RMSprop(actor_critic_1.parameters(), args.lr, eps=args.eps, alpha=args.alpha) #Different for both tasks rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) rollouts_1 = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs_1.action_space, actor_critic_1.state_size) current_obs_1 = torch.zeros(args.num_processes, *obs_shape) #Different update functions def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs def update_current_obs_1(obs): shape_dim0 = envs_1.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs_1[:, :-shape_dim0] = current_obs_1[:, shape_dim0:] current_obs_1[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) obs_1 = envs_1.reset() update_current_obs_1(obs_1) rollouts.observations[0].copy_(current_obs) rollouts_1.observations[0].copy_(current_obs_1) episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) episode_rewards_1 = torch.zeros([args.num_processes, 1]) final_rewards_1 = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() current_obs_1 = current_obs_1.cuda() rollouts_1.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions from branch 1 value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= 
masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) #Sample actions from branch 2 value_1, action_1, action_log_prob_1, states_1 = actor_critic_1.act( Variable(rollouts_1.observations[step], volatile=True), Variable(rollouts_1.states[step], volatile=True), Variable(rollouts_1.masks[step], volatile=True)) cpu_actions_1 = action_1.data.squeeze(1).cpu().numpy() obs_1, reward_1, done_1, info_1 = envs_1.step(cpu_actions_1) reward_1 = torch.from_numpy(np.expand_dims(np.stack(reward_1), 1)).float() episode_rewards_1 += reward_1 masks_1 = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done_1]) final_rewards_1 *= masks_1 final_rewards_1 += (1 - masks_1) * episode_rewards_1 episode_rewards_1 *= masks_1 if args.cuda: masks_1 = masks_1.cuda() if current_obs_1.dim() == 4: current_obs_1 *= masks_1.unsqueeze(2).unsqueeze(2) else: current_obs_1 *= masks_1 update_current_obs_1(obs_1) rollouts_1.insert(step, current_obs_1, states_1.data, action_1.data, action_log_prob_1.data, value_1.data, reward_1, masks_1) #Update for branch 1 next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() #share params branch 1 -> branch 2 actor_critic_1.a_fc1.weight.data = copy.deepcopy( actor_critic.a_fc1.weight.data) actor_critic_1.a_fc1.bias.data = copy.deepcopy( actor_critic.a_fc1.bias.data) actor_critic_1.v_fc1.weight.data = copy.deepcopy( actor_critic.v_fc1.weight.data) actor_critic_1.v_fc1.bias.data = copy.deepcopy( actor_critic.v_fc1.bias.data) #Update for branch 2 next_value_1 = actor_critic_1( Variable(rollouts_1.observations[-1], volatile=True), Variable(rollouts_1.states[-1], volatile=True), Variable(rollouts_1.masks[-1], volatile=True))[0].data rollouts_1.compute_returns(next_value_1, args.use_gae, args.gamma, args.tau) values_1, action_log_probs_1, dist_entropy_1, states_1 = actor_critic_1.evaluate_actions( Variable(rollouts_1.observations[:-1].view(-1, *obs_shape)), Variable(rollouts_1.states[0].view(-1, actor_critic_1.state_size)), Variable(rollouts_1.masks[:-1].view(-1, 1)), Variable(rollouts_1.actions.view(-1, action_shape))) values_1 = values_1.view(args.num_steps, args.num_processes, 1) action_log_probs_1 = action_log_probs_1.view(args.num_steps, args.num_processes, 1) advantages_1 = Variable(rollouts_1.returns[:-1]) - values_1 value_loss_1 = advantages_1.pow(2).mean() action_loss_1 = -(Variable(advantages_1.data) * action_log_probs_1).mean() 
        optimizer_1.zero_grad()
        (value_loss_1 * args.value_loss_coef + action_loss_1 -
         dist_entropy_1 * args.entropy_coef).backward()
        nn.utils.clip_grad_norm(actor_critic_1.parameters(), args.max_grad_norm)
        optimizer_1.step()
        rollouts_1.after_update()

        # share params branch 2 -> branch 1
        actor_critic.a_fc1.weight.data = copy.deepcopy(actor_critic_1.a_fc1.weight.data)
        actor_critic.a_fc1.bias.data = copy.deepcopy(actor_critic_1.a_fc1.bias.data)
        actor_critic.v_fc1.weight.data = copy.deepcopy(actor_critic_1.v_fc1.weight.data)
        actor_critic.v_fc1.bias.data = copy.deepcopy(actor_critic_1.v_fc1.bias.data)

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo,
                                     args.env_name + '_' + env_name_1)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            save_model_1 = actor_critic_1
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
                save_model_1 = copy.deepcopy(actor_critic_1).cpu()
            save_model = [save_model,
                          hasattr(envs, 'ob_rms') and envs.ob_rms or None]
            save_model_1 = [save_model_1,
                            hasattr(envs_1, 'ob_rms') and envs_1.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
            torch.save(save_model_1, os.path.join(save_path, env_name_1 + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
            print(
                "Updates_1 {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards_1.mean(), final_rewards_1.median(),
                        final_rewards_1.min(), final_rewards_1.max(),
                        dist_entropy_1.data[0], value_loss_1.data[0],
                        action_loss_1.data[0]))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo)
                win1 = visdom_plot(viz_1, win1, args.log_dir_1, env_name_1, args.algo)
            except IOError:
                pass
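# The two branches above share only their first actor/value layers by copying
# tensors field-by-field after each update. A minimal sketch of the same idea
# written against the state_dict, so the list of shared layers lives in one
# place (hypothetical helper; layer names follow the MLPPolicy fields used above):
SHARED_LAYERS = ('a_fc1.weight', 'a_fc1.bias', 'v_fc1.weight', 'v_fc1.bias')

def copy_shared_layers(src_policy, dst_policy, shared=SHARED_LAYERS):
    """Copy the shared-layer tensors from src_policy into dst_policy in place."""
    src_state = src_policy.state_dict()
    dst_state = dst_policy.state_dict()
    for name in shared:
        dst_state[name].copy_(src_state[name])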
def train_a_gym_model(env, config): """We train gym-type RL problem using ppo given environment and configuration""" torch.set_num_threads(1) seed = config.get('seed', 1) log_dir = config.get('log_dir', '/tmp/gym') log_interval = config.get('log_interval', 10) save_interval = config.get('save_interval', 100) save_dir = config.get('save_dir', 'trained_models/ppo') add_timestep = config.get('add_timestep', False) num_processes = config.get('num_processes', 4) gamma = config.get('gamma', 0.99) num_stack = config.get('num_stack', 1) recurrent_policy = config.get('recurrent_policy', False) cuda = config.get('cuda', True) vis = config.get('vis', True) vis_interval = config.get('vis_interval', 100) env_name = config['env_name'] save_step = config.get('save_step', None) if save_step is not None: next_save_step = save_step # clean the log folder, if necessary try: os.makedirs(log_dir) except OSError: files = glob.glob(os.path.join(log_dir, '*.monitor.csv')) for f in files: os.remove(f) torch.manual_seed(seed) if cuda: torch.cuda.manual_seed(seed) if vis: from visdom import Visdom port = config.get('port', 8097) viz = Visdom(port=port) win = None envs = [make_env(env, seed, i, log_dir, add_timestep) for i in range(num_processes)] if num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs, gamma=gamma) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]) actor_critic = Policy(obs_shape, envs.action_space, recurrent_policy) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if cuda: actor_critic.cuda() clip_param = config.get('clip_param', 0.2) ppo_epoch = config.get('ppo_epoch', 4) num_mini_batch = config.get('num_mini_batch', 32) value_loss_coef = config.get('value_loss_coef', 0.5) entropy_coef = config.get('entropy_coef', 0.01) lr = config.get('lr', 1e-3) eps = config.get('eps', 1e-5) max_grad_norm = config.get('max_grad_norm', 0.5) use_gae = config.get('use_gae', False) tau = config.get('tau', 0.95) num_steps = config.get('num_steps', 100) num_frames = config.get('num_frames', 1e6) num_updates = int(num_frames) // num_steps // num_processes agent = algo.PPO(actor_critic, clip_param, ppo_epoch, num_mini_batch, value_loss_coef, entropy_coef, lr=lr, eps=eps, max_grad_norm=max_grad_norm) rollouts = RolloutStorage(num_steps, num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(num_processes, *obs_shape) obs = envs.reset() update_current_obs(obs, current_obs, obs_shape, num_stack) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. 
    episode_rewards = torch.zeros([num_processes, 1])
    final_rewards = torch.zeros([num_processes, 1])

    if cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    def save_the_model(num=None):
        """num is additional information"""
        # save it after training
        save_path = save_dir
        try:
            os.makedirs(save_path)
        except OSError:
            pass
        # A really ugly way to save a model to CPU
        save_model = actor_critic
        if cuda:
            save_model = copy.deepcopy(actor_critic).cpu()
        save_model = [save_model,
                      hasattr(envs, 'ob_rms') and envs.ob_rms or None]
        if num is None:
            save_name = '%s.pt' % env_name
        else:
            save_name = '%s_at_%d.pt' % (env_name, int(num))
        torch.save(save_model, os.path.join(save_path, save_name))

    start = time.time()
    for j in range(1, 1 + num_updates):
        for step in range(num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step],
                    rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs, current_obs, obs_shape, num_stack)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, use_gae, gamma, tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % save_interval == 0 and save_dir != "":
            save_the_model()

        if save_step is not None:
            total_num_steps = j * num_processes * num_steps
            if total_num_steps > next_save_step:
                save_the_model(total_num_steps)
                next_save_step += save_step

        if j % log_interval == 0:
            end = time.time()
            total_num_steps = j * num_processes * num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         final_rewards.mean(), final_rewards.median(),
                         final_rewards.min(), final_rewards.max(),
                         dist_entropy, value_loss, action_loss))

        if vis and j % vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, log_dir, env_name, 'ppo', num_frames)
            except IOError:
                pass

    # finally save model again
    save_the_model()
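# update_current_obs(obs, current_obs, obs_shape, num_stack) is called above but
# defined elsewhere. A plausible sketch based on the inline versions in the other
# training scripts in this file (frame stacking by shifting along the channel
# axis); treat it as an assumption, not the original helper:
def update_current_obs(obs, current_obs, obs_shape, num_stack):
    """Shift the stacked frames and write the newest observation in place."""
    shape_dim0 = obs_shape[0] // num_stack
    obs = torch.from_numpy(obs).float()
    if num_stack > 1:
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
    current_obs[:, -shape_dim0:] = obs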
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) print('args.lr') print(args.lr) # print('args.stat_decay') # print(args.stat_decay) # sys.exit() if args.algo == 'a2c': # print('args.eps') # print(args.eps) # sys.exit() agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo in ['acktr']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, acktr=True, stat_decay=args.stat_decay) elif args.algo in ['acktr-h**o']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, acktr=True, if_homo=True, stat_decay=args.stat_decay) elif args.algo in ['acktr-h**o-noEigen']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, acktr=True, if_homo=True, stat_decay=args.stat_decay, if_eigen=False) elif args.algo in ['kbfgs']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, kbfgs=True, stat_decay=args.stat_decay) elif args.algo in ['kbfgs-h**o']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, kbfgs=True, if_homo=True, stat_decay=args.stat_decay) elif args.algo in ['kbfgs-h**o-invertA']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, kbfgs=True, if_homo=True, stat_decay=args.stat_decay, if_invert_A=True) elif args.algo in ['kbfgs-h**o-invertA-decoupledDecay']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, kbfgs=True, if_homo=True, stat_decay_A=args.stat_decay_A, stat_decay_G=args.stat_decay_G, if_invert_A=True, if_decoupled_decay=True) elif args.algo in ['kbfgs-h**o-momentumGrad']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, kbfgs=True, if_homo=True, if_momentumGrad=True, stat_decay=args.stat_decay) elif args.algo in ['kbfgs-h**o-noClip']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, kbfgs=True, if_homo=True, if_clip=False, stat_decay=args.stat_decay) else: print('unknown args.algo for ' + args.algo) sys.exit() rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) record_rewards = [] record_num_steps = [] print('num_updates') print(num_updates) total_num_steps = 0 start = time.time() for j in range(num_updates): print('j') print(j) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], 
rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: # print('info.keys()') # print(info.keys()) if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) print('info[episode][r]') print(info['episode']['r']) record_rewards.append(info['episode']['r']) # print('total_num_steps') # print(total_num_steps) # print('total_num_steps + (step + 1) * args.num_processes') # print(total_num_steps + (step + 1) * args.num_processes) record_num_steps.append(total_num_steps + (step + 1) * args.num_processes) # sys.exit() # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy, update_signal = agent.update( rollouts) if update_signal == -1: # sys.exit() break rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) 
except IOError: pass print('record_rewards') print(record_rewards) dir_with_params = args.env_name + '/' +\ args.algo + '/' +\ 'eps_' + str(args.eps) + '/' +\ 'lr_' + str(args.lr) + '/' +\ 'stat_decay_' + str(args.stat_decay) + '/' # saving_dir = './result/' + args.env_name + '/' + args.algo + '/' saving_dir = './result/' + dir_with_params if not os.path.isdir(saving_dir): os.makedirs(saving_dir) import pickle with open(saving_dir + 'result.pkl', 'wb') as handle: pickle.dump( { 'record_rewards': record_rewards, 'record_num_steps': record_num_steps }, handle) print('args.log_dir') print(args.log_dir) print('os.listdir(args.log_dir)') print(os.listdir(args.log_dir)) # saving_dir_monitor = './result_monitor/' + args.env_name + '/' + args.algo + '/' saving_dir_monitor = './result_monitor/' + dir_with_params if os.path.isdir(saving_dir_monitor): import shutil shutil.rmtree(saving_dir_monitor) if not os.path.isdir(saving_dir_monitor): os.makedirs(saving_dir_monitor) print('saving_dir_monitor') print(saving_dir_monitor) import shutil for file_name in os.listdir(args.log_dir): full_file_name = os.path.join(args.log_dir, file_name) print('full_file_name') print(full_file_name) print('os.path.isfile(full_file_name)') print(os.path.isfile(full_file_name)) if os.path.isfile(full_file_name): shutil.copy(full_file_name, saving_dir_monitor) # print('os.listdir(saving_dir_monitor)') # print(os.listdir(saving_dir_monitor)) # print('len(os.listdir(saving_dir_monitor))') # print(len(os.listdir(saving_dir_monitor))) # print('args.num_processes') # print(args.num_processes) assert len(os.listdir(saving_dir_monitor)) == args.num_processes
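# The pickle written above stores two parallel lists. A minimal sketch of
# loading it back and plotting reward against environment steps (matplotlib is
# an added dependency; the path must match the dir_with_params layout above):
import pickle
import matplotlib.pyplot as plt

def plot_result(result_dir):
    with open(os.path.join(result_dir, 'result.pkl'), 'rb') as handle:
        result = pickle.load(handle)
    plt.plot(result['record_num_steps'], result['record_rewards'])
    plt.xlabel('num timesteps')
    plt.ylabel('episode reward')
    plt.savefig(os.path.join(result_dir, 'result.png'))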
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") experiment_name = args.env_name + '-' + args.algo + '-' + datetime.datetime.now( ).strftime("%Y-%m-%d-%H-%M-%S-%f") log_dir, eval_log_dir, save_dir = setup_dirs(experiment_name, args.log_dir, args.save_dir) if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, log_dir, args.add_timestep, device, False, frame_skip=args.frame_skip) if args.load_path: actor_critic, _ob_rms = torch.load(args.load_path) vec_norm = get_vec_normalize(envs) if vec_norm is not None: vec_norm.train() vec_norm.ob_rms = _ob_rms actor_critic.train() else: actor_critic = Policy(envs.observation_space.shape, envs.action_space, beta=args.beta_dist, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo.startswith('a2c'): agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, lr_schedule=args.lr_schedule, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo.startswith('ppo'): agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, lr_schedule=args.lr_schedule, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.algo.endswith('sil'): agent = algo.SIL(agent, update_ratio=args.sil_update_ratio, epochs=args.sil_epochs, batch_size=args.sil_batch_size, beta=args.sil_beta, value_loss_coef=args.sil_value_loss_coef, entropy_coef=args.sil_entropy_coef) replay = ReplayStorage(10000, num_processes=args.num_processes, gamma=args.gamma, prio_alpha=args.sil_alpha, obs_shape=envs.observation_space.shape, action_space=envs.action_space, recurrent_hidden_state_size=actor_critic. recurrent_hidden_state_size, device=device) else: replay = None action_high = torch.from_numpy(envs.action_space.high).to(device) action_low = torch.from_numpy(envs.action_space.low).to(device) action_mid = 0.5 * (action_high + action_low) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) benchmark_rewards = deque(maxlen=10) start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): # sample actions value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) if args.clip_action and isinstance(envs.action_space, gym.spaces.Box): clipped_action = action.clone() if args.shift_action: # FIXME experimenting with this, so far resulting in # faster learning when clipping guassian continuous # output (vs leaving centred at 0 and unscaled) clipped_action = 0.5 * clipped_action + action_mid clipped_action = torch.max( torch.min(clipped_action, action_high), action_low) else: clipped_action = action # act in environment and observe obs, reward, done, infos = envs.step(clipped_action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) if 'rb' in info['episode']: benchmark_rewards.append(info['episode']['rb']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) if replay is not None: replay.insert(rollouts.obs[step], rollouts.recurrent_hidden_states[step], action, reward, done) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update( rollouts, j, replay) rollouts.after_update() total_num_steps = (j + 1) * args.num_processes * args.num_steps train_eprew = np.mean(episode_rewards) if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} episodes: mean/med {:.1f}/{:.1f}, min/max reward {:.2f}/{:.2f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), train_eprew, np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss), end='') if len(benchmark_rewards): print(", benchmark {:.1f}/{:.1f}, {:.1f}/{:.1f}".format( np.mean(benchmark_rewards), np.median(benchmark_rewards), np.min(benchmark_rewards), np.max(benchmark_rewards)), end='') print() if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) clipped_action = action if args.clip_action and isinstance(envs.action_space, gym.spaces.Box): if args.shift_action: clipped_action = 0.5 * clipped_action + action_mid clipped_action = torch.max( torch.min(clipped_action, action_high), action_low) obs, reward, done, infos = eval_envs.step(clipped_action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() eval_eprew = np.mean(eval_episode_rewards) print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), eval_eprew)) if len(episode_rewards ) and j % args.save_interval == 0 and save_dir != "": # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] ep_rewstr = ("%d" % train_eprew).replace("-", "n") save_filename = os.path.join( save_dir, './checkpoint-%d-%s.pt' % (j, ep_rewstr)) torch.save(save_model, save_filename) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
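# The clip/shift logic above maps a roughly zero-centred Gaussian sample into
# the environment's Box bounds before stepping. A self-contained sketch of the
# same transform on dummy bounds (illustration only; mirrors the clipped_action
# statements above):
def _demo_clip_and_shift():
    action_low = torch.tensor([-2.0, -2.0])
    action_high = torch.tensor([2.0, 2.0])
    action_mid = 0.5 * (action_high + action_low)

    def clip_and_shift(action, shift=True):
        out = action.clone()
        if shift:
            # recentre and rescale the raw sample, as args.shift_action does above
            out = 0.5 * out + action_mid
        # clamp element-wise into [low, high]
        return torch.max(torch.min(out, action_high), action_low)

    raw = torch.tensor([3.5, -0.4])
    return clip_and_shift(raw)  # rescaled then clamped into [-2, 2]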
def main():
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    # Instantiate the environment
    config = getattr(configs, args.config)()

    # We make this in order to get the shapes.
    dummy_env = make_env(args, config, -1,
                         [config['agent'](game_type=config['game_type'])])()
    envs_shape = dummy_env.observation_space.shape[1:]
    obs_shape = (envs_shape[0], *envs_shape[1:])
    action_space = dummy_env.action_space

    if len(envs_shape) == 3:
        if args.model == 'convnet':
            actor_critic = lambda saved_model: PommeCNNPolicySmall(
                obs_shape[0], action_space, args)
        elif args.model == 'resnet':
            actor_critic = lambda saved_model: PommeResnetPolicy(
                obs_shape[0], action_space, args)
    else:
        actor_critic = lambda saved_model: MLPPolicy(obs_shape[0], action_space)

    # We need to get the agent = config.agent(agent_id, config.game_type) and then
    # pass that agent into the agent.PPOAgent
    training_agents = []
    saved_models = args.saved_models
    saved_models = saved_models.split(',') if saved_models else [None] * args.nagents
    assert len(saved_models) == args.nagents
    for saved_model in saved_models:
        # TODO: implement the model loading.
        model = actor_critic(saved_model)
        agent = config['agent'](game_type=config['game_type'])
        agent = ppo_agent.PPOAgent(agent, model)
        training_agents.append(agent)

    if args.how_train == 'simple':
        # Simple trains a single agent against three SimpleAgents.
        assert args.nagents == 1, "Simple training should have a single agent."
        num_training_per_episode = 1
    elif args.how_train == 'homogenous':
        # Homogenous trains a single agent against itself (self-play).
        assert args.nagents == 1, "Homogenous training should have a single agent."
        num_training_per_episode = 4
    elif args.how_train == 'heterogenous':
        assert args.nagents > 1, "Heterogenous training should have more than one agent."
        print("Heterogenous training is not implemented yet.")
        return

    # NOTE: Does this work correctly? Will the threads operate independently?
    envs = [make_env(args, config, i, training_agents)
            for i in range(args.num_processes)]
    envs = SubprocVecEnv(envs) if args.num_processes > 1 else DummyVecEnv(envs)
    # TODO: Figure out how to render this for testing purposes. The following link may help:
    # https://github.com/MG2033/A2C/blob/master/envs/subproc_vec_env.py

    for agent in training_agents:
        agent.initialize(args, obs_shape, action_space, num_training_per_episode)

    current_obs = torch.zeros(num_training_per_episode, args.num_processes,
                              *obs_shape)

    def update_current_obs(obs):
        current_obs = torch.from_numpy(obs).float()

    obs = envs.reset()
    update_current_obs(obs)
    if args.how_train == 'simple':
        training_agents[0].update_rollouts(obs=current_obs, timestep=0)
    elif args.how_train == 'homogenous':
        training_agents[0].update_rollouts(obs=current_obs, timestep=0)

    # These variables are used to compute average rewards for all processes.
episode_rewards = torch.zeros( [num_training_per_episode, args.num_processes, 1]) final_rewards = torch.zeros( [num_training_per_episode, args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() for agent in training_agents: agent.cuda() stats = utils.init_stats(args) start = time.time() for j in range(num_updates): for step in range(args.num_steps): value_agents = [] action_agents = [] action_log_prob_agents = [] states_agents = [] episode_reward = [] cpu_actions_agents = [] if args.how_train == 'simple': value, action, action_log_prob, states = training_agents[ 0].act_pytorch(step, 0) value_agents.append(value) action_agents.append(action) action_log_prob_agents.append(action_log_prob) states_agents.append(states) cpu_actions = action.data.squeeze(1).cpu().numpy() cpu_actions_agents = cpu_actions elif args.how_train == 'homogenous': cpu_actions_agents = [[] for _ in range(args.num_processes)] for i in range(4): value, action, action_log_prob, states = training_agents[ 0].act_pytorch(step, i) value_agents.append(value) action_agents.append(action) action_log_prob_agents.append(action_log_prob) states_agents.append(states) cpu_actions = action.data.squeeze(1).cpu().numpy() for num_process in range(args.num_processes): cpu_actions_agents[num_process].append( cpu_actions[num_process]) obs, reward, done, info = envs.step(cpu_actions_agents) reward = torch.from_numpy(np.stack(reward)).float().transpose(0, 1) episode_rewards += reward # import pdb; pdb.set_trace() if args.how_train == 'simple': masks = torch.FloatTensor( [[0.0] * num_training_per_episode if done_ else [1.0] * num_training_per_episode for done_ in done]) elif args.how_train == 'homogenous': masks = torch.FloatTensor( [[0.0] * num_training_per_episode if done_ else [1.0] * num_training_per_episode for done_ in done]).transpose(0, 1) final_rewards *= masks # nagents x nprocesses x 1 final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() reward_all = reward.unsqueeze(2) if args.how_train == 'simple': masks_all = masks.transpose(0, 1).unsqueeze(2) elif args.how_train == 'homogenous': masks_all = masks.unsqueeze(2) current_obs *= masks_all.unsqueeze(2).unsqueeze(2) update_current_obs(obs) states_all = torch.from_numpy( np.stack([x.data for x in states_agents])).float() action_all = torch.from_numpy( np.stack([x.data for x in action_agents])).float() action_log_prob_all = torch.from_numpy( np.stack([x.data for x in action_log_prob_agents])).float() value_all = torch.from_numpy( np.stack([x.data for x in value_agents])).float() if args.how_train in ['simple', 'homogenous']: training_agents[0].insert_rollouts(step, current_obs, states_all, action_all, action_log_prob_all, value_all, reward_all, masks_all) next_value_agents = [] if args.how_train == 'simple': agent = training_agents[0] next_value_agents.append(agent.run_actor_critic(-1, 0)) advantages = [ agent.compute_advantages(next_value_agents, args.use_gae, args.gamma, args.tau) ] elif args.how_train == 'homogenous': agent = training_agents[0] next_value_agents = [ agent.run_actor_critic(-1, num_agent) for num_agent in range(4) ] advantages = [ agent.compute_advantages(next_value_agents, args.use_gae, args.gamma, args.tau) ] final_action_losses = [] final_value_losses = [] final_dist_entropies = [] for num_agent, agent in enumerate(training_agents): for _ in range(args.ppo_epoch): data_generator = agent.feed_forward_generator( advantages[num_agent], args) for sample in data_generator: observations_batch, states_batch, 
actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = agent.evaluate_actions( Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min(surr1, surr2).mean() value_loss = (Variable(return_batch) - values).pow(2).mean() agent.optimize(value_loss, action_loss, dist_entropy, args.entropy_coef, args.max_grad_norm) final_action_losses.append(action_loss) final_value_losses.append(value_loss) final_dist_entropies.append(dist_entropy) agent.after_update() ##### # Save model. ##### if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir) try: os.makedirs(save_path) except OSError: pass # XXX: new way for saving model # XXX: we should also add the optimizer along with the state_dict for num_agent, agent in enumerate(training_agents): save_model = agent.get_model() save_optimizer = agent.get_optimizer() torch.save( { 'epoch': j, 'arch': args.model, 'state_dict': save_model.state_dict(), 'optimizer': save_optimizer.state_dict(), }, os.path.join( save_path, "train={}-config={}-model={}-agent={}.pt".format( args.how_train, args.config, args.model, num_agent))) ##### # Log to console. ##### if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, avg entropy {:.5f}, avg value loss {:.5f}, avg policy loss {:.5f}" .format( j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), np.mean([ dist_entropy.data[0] for dist_entropy in final_dist_entropies ]), np.mean([ value_loss.data[0] for value_loss in final_value_losses ]), np.mean([ action_loss.data[0] for action_loss in final_action_losses ]))) # save stats to h5 file # TODO: need to fix this error # stats = utils.log_stats(args, stats, j, int(total_num_steps / (end - start)), \ # final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), \ # np.mean([action_loss.data[0] for action_loss in final_action_losses]), \ # np.mean([value_loss.data[0] for value_loss in final_value_losses]), \ # np.mean([dist_entropy.data[0] for dist_entropy in final_dist_entropies])) # # log_path = os.path.join(args.log_dir) # filename_stats = '%s/stats.h5' % log_path # utils.save_dict(filename_stats, stats) ##### # Log to Visdom. ##### if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, 'ppo') except IOError: pass
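# Added sketch (not part of the original script): the PPO clipped surrogate used in the
# epoch loop above, reduced to plain tensors so it can be read or unit-tested on its own.
# Function and argument names here are illustrative, not this repo's API.
import torch

def ppo_losses(new_log_probs, old_log_probs, advantages, returns, values,
               clip_param=0.2):
    """Return (policy_loss, value_loss) for one minibatch of transitions."""
    ratio = torch.exp(new_log_probs - old_log_probs)           # pi_new / pi_old
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()              # pessimistic (clipped) bound
    value_loss = (returns - values).pow(2).mean()              # squared error to the returns
    return policy_loss, value_loss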
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") torch.set_num_threads(1) if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None if args.num_processes > 1: if args.retro_contest == True: import json sonic_env_confs = json.load(open(args.sonic_config_file, 'r')) sonic_env_confs = sonic_env_confs['Train'] sonic_env_confs = [v for _, v in sonic_env_confs.items()] envs = SubprocVecSonicEnv(sonic_env_confs, args.num_processes) else: envs = [ make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep) for i in range(args.num_processes) ] envs = SubprocVecEnv(envs) else: envs = [ make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep) for i in range(args.num_processes) ] envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) prev_saved_rew_median = float('-inf') actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy) if args.load_model: model_path = os.path.join(args.save_dir, args.algo, args.env_name) + ".pt" actor_critic, ob_rms, prev_saved_rew_median = torch.load(model_path) print("Loaded actor_critic model from:", model_path, "which got a median score of:", prev_saved_rew_median) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() prev_reward = 0.0 start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, states = actor_critic.act( rollouts.observations[step], rollouts.states[step], rollouts.masks[step]) cpu_actions = action.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.observations[-1], rollouts.states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and final_rewards.median( ) > prev_saved_rew_median and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None, final_rewards.median() ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) prev_saved_rew_median = final_rewards.median() # Save a separate copy just in case the main saved model ends up being worser. # Helps to have a few saved models to choose from at test/runtime torch.save( save_model, os.path.join( save_path, args.env_name + str(final_rewards.median()) + '.pt')) print("Saved the state which got a median reward of", prev_saved_rew_median) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy, value_loss, action_loss)) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
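# Added sketch (not part of the original script): "save only when the median reward
# improves", as the save block above does, but storing a CPU state_dict instead of
# pickling the whole module. `ob_rms` stands in for the VecNormalize statistics; the
# helper name and checkpoint layout are assumptions, not this repo's format.
import os
import torch

def maybe_save(actor_critic, ob_rms, median_reward, best_so_far, save_dir, env_name):
    """Write a CPU checkpoint if median_reward beats best_so_far; return the new best."""
    if median_reward <= best_so_far:
        return best_so_far
    os.makedirs(save_dir, exist_ok=True)
    checkpoint = {
        'state_dict': {k: v.detach().cpu() for k, v in actor_critic.state_dict().items()},
        'ob_rms': ob_rms,
        'median_reward': float(median_reward),
    }
    torch.save(checkpoint, os.path.join(save_dir, env_name + '.pt'))
    return float(median_reward)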
def main(): print("#######") print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards") print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None # envs = [make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes)] # env = get_test_env("001") envs = [lambda: get_test_env("000") for _ in range(args.num_processes)] # num_states = len(env.all_possible_states()) if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = OptionCritic(num_options, obs_shape[0], envs.action_space, args.recurrent_policy) else: # assert not args.recurrent_policy, \ # "Recurrent policy is not implemented for the MLP controller" # actor_critic = MLPPolicy(obs_shape[0], envs.action_space) raise NotImplementedError() if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': # optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) raise NotImplementedError() elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps = args.eps) elif args.algo == 'acktr': # optimizer = KFACOptimizer(actor_critic) raise NotImplementedError() rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size, num_options) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. 
episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) optionSelection = 0 if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() #print(options) #print(options[0]) for j in range(num_updates): options = [-1] * args.num_processes for step in range(args.num_steps): # Choose Option t0 = time.time() selection_value, new_option, option_log_prob, states = actor_critic.get_option(Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) # print(new_option) for i in range(args.num_processes): if options[i] == -1: options[i] = new_option[i].data[0] #print("option is:") #print(options) t1 = time.time() # Sample actions value, action, action_log_prob, states = actor_critic.get_output( options, Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() t2 = time.time() # Termination term_value, termination, termination_log_prob, _ = actor_critic.get_termination( options, Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) termination = torch.LongTensor([termination[i].data[0] for i in range(termination.shape[0])]) t3 = time.time() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # newIndex = obs_to_int(obs) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks, options, termination) for i in range(termination.shape[0]): if termination[i] == 1: options[i] = -1 t4 = time.time() #print("part1") #print(t1 - t0) #print("part2") #print(t2-t1) #print("part3") #print(t3-t2) #print("part4") #print(t4-t3) for i in range(args.num_processes): if options[i]== -1: selection_value, new_option, option_log_prob, states = actor_critic.get_option(Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) # print(new_option) options[i] = new_option[i].data[0] rollouts.options[step+1].copy_(torch.LongTensor(options)) next_value = actor_critic.get_output(options,Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: raise NotImplementedError() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): for i in range(args.num_steps): # Get the ith step during exploration options = rollouts.options[i] #print(options) adv_targ = Variable(advantages[i]) old_action_log_probs = rollouts.action_log_probs[i] termination = rollouts.optionSelection[i] 
# Use critic value of option nn to update option parameters values, action_log_probs, dist_entropy, states = actor_critic.evaluate_option( Variable(rollouts.observations[i]), Variable(rollouts.states[i]), Variable(rollouts.masks[i]), Variable(rollouts.actions[i]), options) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(rollouts.returns[i]) - values).pow(2).mean() selection_log_prob = actor_critic.evaluate_selection( Variable(rollouts.observations[i]), Variable(rollouts.states[i]), Variable(rollouts.masks[i]), Variable(termination), Variable(rollouts.options[i].type(torch.cuda.LongTensor))) V_Omega = selection_log_prob * values # Update termination parameters termination_log_prob = actor_critic.evaluate_termination( Variable(rollouts.observations[i]), Variable(rollouts.states[i]), Variable(rollouts.masks[i]), Variable(termination.type(torch.cuda.LongTensor)), rollouts.options[i+1]) left_values = [] right_values = [] for p in range(args.num_processes): if int(termination[p]) == 1: left_values.append(V_Omega[p]) right_values.append(values[p]) elif int(termination[p]) == 0: left_values.append(values[p]) right_values.append(V_Omega[p]) left_values = torch.cat(left_values) right_values = torch.cat(right_values) termination_loss = (- torch.exp(termination_log_prob) * left_values - (1 - torch.exp(termination_log_prob)) * right_values).mean() optimizer.zero_grad() (action_loss + value_loss + termination_loss - V_Omega.mean()).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) writer.add_scalar("final_reward_max", final_rewards.max(), plot_index) plot_index += 1 if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
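# Added sketch (not part of the original script): the textbook option-critic termination
# objective (Bacon et al., 2017), for comparison with the combined PPO + termination loss
# above. `beta` is the termination probability of the current option at the next state and
# `xi` is an optional deliberation cost; all names are illustrative, not from this codebase.
import torch

def termination_loss(beta, q_omega, v_omega, xi=0.01):
    """beta, q_omega, v_omega: tensors of shape [batch]. The advantage is detached
    so only the termination head is trained by this term."""
    advantage = (q_omega - v_omega + xi).detach()
    # Minimizing beta * advantage lowers beta when the option is still better than
    # average (advantage > 0) and raises it otherwise.
    return (beta * advantage).mean()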
def train_maml(self, num_updates): start = time.time() theta_list = [] num_tasks = 1000 sample_size = 10 # episode_id: episode_id%10==0) # env = gym.wrappers.Monitor(self.envs, self.args.save_dir, video_callable=lambda episode_id: episode_id%10==0) # Create the variations needed task_list = [] for i in range(num_tasks): friction = np.random.randint(low=1, high=10, size=3).astype('float32')/10. friction_1 = np.random.uniform(low=0.1, high=0.8, size=3).astype('float32') task = {'default/geom': ['', 'friction', '{0:.1f} {1:.1f} {2:.1f}'.format( friction[0], friction[1], friction[2])], 'worldbody/body/body/geom': [[['name', 'fthigh'], ['type', 'capsule']], 'friction', '{0:.1f} {1:.1f} {2:.1f}'.format( friction_1[0], friction_1[1], friction_1[2])] } # task2 = {'option': ['gravity', '{0:.2f} {1:.2f} {2:.2f}'.format(0,0,gravity_z)]} task_list.append(task) for j in range(num_updates): sample_indexes = np.random.randint(0, num_tasks, size=sample_size) # Get the theta if j == 0: theta = self.get_weights() # Inner loop # First gradient for i, sample_index in enumerate(sample_indexes): # Get the task task = task_list[sample_index] env = self.envs.venv.envs[0] # env = gym.wrappers.Monitor(env.env, './videos2/', video_callable=lambda episode_id: episode_id%10==0) _tag_names = [] _tag_identifiers = [] _attributes = [] _values = [] for k in task.keys(): v = task[k] _tag_names.append(k) _tag_identifiers.append(v[0]) _attributes.append(v[1]) _values.append(v[2]) env.env.env.my_init(_tag_names, \ _tag_identifiers, _attributes, \ _values, None) # Set the model weights to theta before training self.set_weights(theta) dist_entropy, value_loss, action_loss = self.run() if j == 0: theta_list.append(self.get_weights()) else: print(i) theta_list[i] = self.get_weights() # Second gradiet theta_copy = deepcopy(theta) for k1, sample_index in enumerate(sample_indexes): # Get the task task = task_list[sample_index] env = self.envs.venv.envs[0] _tag_names = [] _tag_identifiers = [] _attributes = [] _values = [] for k in task.keys(): v = task[k] _tag_names.append(k) _tag_identifiers.append(v[0]) _attributes.append(v[1]) _values.append(v[2]) env.env.env.my_init(_tag_names, \ _tag_identifiers, _attributes, \ _values, None) # Get the network loss for this task for 1 episode # TODO: There should be no while loop # while self.a2c.n_episodes < 1: dist_entropy, value_loss, action_loss = self.meta_run(theta_list[k1],theta_copy) theta = self.get_weights() # Set the model weights to theta # self.set_weights(theta) # Update theta # Change the update network function # theta['state_dict'] = self.agent.update_net(theta['state_dict'],dist_entropy,value_loss,action_loss) # env = gym.wrappers.Monitor(env, './videos/', video_callable=lambda episode_id: episode_id%10==0,force=True) if j % self.args.save_interval == 0 and self.args.save_dir != "": save_path = os.path.join(self.args.save_dir, self.args.algo) try: os.makedirs(save_path) except OSError: pass model_state = {'num_updates': j, 'state_dict': self.actor_critic.state_dict(), 'optimizer': self.meta_optimizer.state_dict() } model_state = [model_state,hasattr(self.envs, 'ob_rms') and self.envs.ob_rms or None] torch.save(model_state, os.path.join(save_path, self.args.env_name + 'update_'+ str(j) +".pt")) # # A really ugly way to save a model to CPU # save_model = self.actor_critic # if self.args.cuda: # save_model = copy.deepcopy(self.actor_critic).cpu() # save_model = [save_model, # hasattr(self.envs, 'ob_rms') and self.envs.ob_rms or None] # torch.save(save_model, 
os.path.join(save_path, self.args.env_name + ".pt")) if j % self.args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * self.args.num_processes * self.args.num_steps print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". format(j, total_num_steps, int(total_num_steps / (end - start)), self.final_rewards.mean(), self.final_rewards.median(), self.final_rewards.min(), self.final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if self.args.vis and j % self.args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs self.win = visdom_plot(self.viz, self.win, self.args.log_dir, self.args.env_name, self.args.algo) except IOError: pass
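# Added sketch (not part of the original script): a first-order meta-update in the spirit
# of train_maml() above. The script's helpers (get_weights, set_weights, meta_run) are not
# shown here, so this uses a toy model and a hypothetical make_task_batch() sampler, and it
# applies a Reptile-style update (move the meta-weights toward the per-task adapted weights)
# rather than the full second-order MAML gradient. Assumes all parameters are floating point.
import copy
import torch
import torch.nn as nn

def reptile_meta_step(meta_model, make_task_batch, inner_lr=1e-2, meta_lr=0.1,
                      inner_steps=5, num_tasks=4):
    """One meta-update: adapt a clone per task, then interpolate the meta-weights."""
    meta_state = copy.deepcopy(meta_model.state_dict())
    adapted_states = []
    for _ in range(num_tasks):
        model = copy.deepcopy(meta_model)               # start from the current theta
        opt = torch.optim.SGD(model.parameters(), lr=inner_lr)
        for _ in range(inner_steps):                    # inner-loop adaptation on one task
            x, y = make_task_batch()                    # hypothetical task sampler
            loss = nn.functional.mse_loss(model(x), y)
            opt.zero_grad()
            loss.backward()
            opt.step()
        adapted_states.append(model.state_dict())
    # theta <- theta + meta_lr * mean(theta_adapted - theta)
    new_state = {}
    for k in meta_state:
        delta = torch.stack([s[k] - meta_state[k] for s in adapted_states]).mean(0)
        new_state[k] = meta_state[k] + meta_lr * delta
    meta_model.load_state_dict(new_state)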
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if len(envs.observation_space.shape) == 3: actor_critic = Actor(obs_shape[0], envs.action_space, args.recurrent_policy, envs.action_space.n) target_actor = Actor(obs_shape[0], envs.action_space, args.recurrent_policy, envs.action_space.n) critic = Critic(in_channels=4, num_actions=envs.action_space.n) critic_target = Critic(in_channels=4, num_actions=envs.action_space.n) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if args.cuda: actor_critic.cuda() critic.cuda() critic_target.cuda() target_actor.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) critic_optim = optim.Adam(critic.parameters(), lr=1e-4) gamma = 0.99 tau = 0.001 #memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length) mem_buffer = ReplayBuffer() rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size, envs.action_space.n) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) value = critic.forward( Variable(rollouts.observations[step], volatile=True), action_log_prob) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks pre_state = rollouts.observations[step].cpu().numpy() update_current_obs(obs) mem_buffer.add((pre_state, current_obs, action_log_prob.data.cpu().numpy(), reward, done)) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True)) #[0].data next_value = critic.forward( Variable(rollouts.observations[-1], volatile=True), action_log_prob).data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if True: state, next_state, action, reward, done = mem_buffer.sample(5) next_state = next_state.reshape([-1, *obs_shape]) state = state.reshape([-1, *obs_shape]) action = action.reshape([-1, 6]) next_q_values = critic_target( to_tensor(next_state, volatile=True), target_actor(to_tensor(next_state, volatile=True), to_tensor(next_state, volatile=True), to_tensor(next_state, volatile=True))[0]) next_q_values.volatile = False target_q_batch = to_tensor(reward) + args.gamma * to_tensor( done.astype(np.float)) * next_q_values critic.zero_grad() q_batch = critic(to_tensor(state), to_tensor(action)) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() critic_optim.step() actor_critic.zero_grad() policy_loss = -critic( to_tensor(state), actor_critic(to_tensor(state), to_tensor(state), to_tensor(state))[0]) policy_loss = policy_loss.mean() policy_loss.backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() soft_update(target_actor, actor_critic, tau) soft_update(critic_target, critic, tau) ''' if args.algo in ['a2c', 'acktr']: action_log_probs, probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = critic.forward(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), probs).data values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) #advantages = Variable(rollouts.returns[:-1]) - values advantages = rollouts.returns[:-1] - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages) * action_log_probs).mean() #action_loss = -(Variable(advantages.data) * action_log_probs).mean() optimizer.zero_grad() critic_optim.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() critic_optim.step() ''' rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] 
torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), value_loss.data.cpu().numpy()[0], policy_loss.data.cpu().numpy()[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass
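# Added sketch (not part of the original script): the Polyak averaging behind the
# soft_update() calls in the loop above, i.e. target <- tau * source + (1 - tau) * target,
# applied parameter-wise.
import torch

def soft_update(target, source, tau):
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)

# Usage mirrors the calls above, e.g. soft_update(critic_target, critic, tau=0.001).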
next_state, reward, done, _ = env.step(use_action) episode_reward += reward next_state = torch.Tensor([next_state]) state = next_state if done: break #writer.add_scalar('reward/test', episode_reward, i_episode) ''' end = time.time() total_num_steps = step * args.num_processes rewards.append(episode_reward) #print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(i_episode, total_numsteps, rewards[-1], np.mean(rewards[-10:]))) print( "Num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}, entropy {:.5f}" .format(total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), value_loss, policy_loss, entropy.item())) if args.vis and step % args.log_interval == 0 and len( memory) > args.warmup: try: win = visdom_plot(viz, win, args.log_dir, args.env_name, 'disc_ddpg', args.num_frames) except IOError: pass env.close()
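# Added sketch (not part of the original script): a minimal ReplayBuffer compatible with
# the mem_buffer.add((state, next_state, action, reward, done)) / .sample(n) calls in the
# DDPG-style loop above. The real class is not shown here, so the field order and the
# numpy stacking are assumptions.
import random
from collections import deque
import numpy as np

class ReplayBuffer:
    def __init__(self, capacity=100000):
        self.storage = deque(maxlen=capacity)

    def __len__(self):
        return len(self.storage)

    def add(self, transition):
        # transition = (state, next_state, action, reward, done)
        self.storage.append(transition)

    def sample(self, batch_size):
        batch = random.sample(list(self.storage), batch_size)
        state, next_state, action, reward, done = map(np.array, zip(*batch))
        return state, next_state, action, reward, done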
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") torch.set_num_threads(1) if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs, gamma=args.gamma) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if args.load_model is not None: actor_critic = torch.load(args.load_model)[0] else: actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy, args.hidden_size, args) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm, pop_art=args.pop_art) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) obs = envs.reset() update_current_obs(obs, current_obs, obs_shape, args.num_stack) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() scale = 1. current_pdrr = [0., 0.] last_update = 0 ### parameters for adaptive reward scaling ### t_stop = 0 beta = .99 R_prev = -1e9 m_max = -1e9 m_t = 0 reverse = False last_scale_t = -1e9 ### for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, states = actor_critic.act( rollouts.observations[step], rollouts.states[step], rollouts.masks[step]) cpu_actions = action.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) # reward *= args.reward_scaling reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs, current_obs, obs_shape, args.num_stack) rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.observations[-1], rollouts.states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) t = j // args.adaptive_interval if args.pop_art: value_loss, action_loss, dist_entropy = agent.pop_art_update( rollouts) else: if t - last_scale_t > 100: value_loss, action_loss, dist_entropy = agent.update( rollouts, update_actor=True) else: value_loss, action_loss, dist_entropy = agent.update( rollouts, update_actor=False) if agent.max_grad_norm < .5 and t - last_scale_t < 100: agent.max_grad_norm += 0.00001 if j % args.adaptive_interval == 0 and j and t - last_scale_t > 100: t = j // args.adaptive_interval R_t = float('{}'.format(final_rewards.mean())) R_ts.append(R_t) assert type(R_t) == float t_stop += 1 m_t = beta * m_t + (1 - beta) * R_t m_hat = m_t / (1 - beta**t) print('m_hat :{}, t_stop: {}'.format(m_hat, t_stop)) print('agent.max_grad_norm, ', agent.max_grad_norm) if m_hat > m_max: m_max = m_hat t_stop = 0 if t_stop > args.tolerance: if reverse and m_max <= R_prev: break elif reverse and m_max > R_prev: agent.max_grad_norm = args.max_grad_norm_after actor_critic.rescale(args.cdec) scale *= args.cdec agent.reinitialize() last_scale_t = t elif not reverse and m_max <= R_prev: agent.max_grad_norm = args.max_grad_norm_after actor_critic.rescale(args.cdec) scale *= args.cdec agent.reinitialize() reverse = True last_scale_t = t else: agent.max_grad_norm = args.max_grad_norm_after actor_critic.rescale(args.cinc) scale *= args.cinc agent.reinitialize() last_scale_t = t R_prev = m_max j = t_stop = m_t = 0 m_max = -1e9 # if j % args.log_interval == 0: # this is used for testing saturation # relus = actor_critic.base_forward( # rollouts.observations[:-1].view(-1, *rollouts.observations.size()[2:])) rollouts.after_update() if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps # relus = log_saturation(fname=args.saturation_log, # first=(j==0), # relus=[relu.cpu().detach().numpy() for relu in relus]) # print("saturation", relus) # if j > 0: # current_pdrr = incremental_update(current_pdrr, relus) print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}, scale {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy, value_loss, action_loss, scale)) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.plot_title, args.algo, args.num_frames) except IOError: pass
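# Added sketch (not part of the original script): the bias-corrected exponential moving
# average that the adaptive reward-scaling block above uses to decide when learning has
# plateaued (m_t, m_hat, t_stop), packaged as a small class so the update rule is easy to
# test in isolation. Names are illustrative.
class PlateauDetector:
    def __init__(self, beta=0.99, tolerance=20):
        self.beta = beta
        self.tolerance = tolerance
        self.m_t = 0.0
        self.t = 0
        self.m_max = float('-inf')
        self.steps_since_best = 0

    def update(self, reward):
        """Feed one reward measurement; return True once progress has stalled."""
        self.t += 1
        self.m_t = self.beta * self.m_t + (1.0 - self.beta) * reward
        m_hat = self.m_t / (1.0 - self.beta ** self.t)   # bias correction
        if m_hat > self.m_max:
            self.m_max = m_hat
            self.steps_since_best = 0
        else:
            self.steps_since_best += 1
        return self.steps_since_best > self.tolerance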
def main(): print("#######") print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards") print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None names = getListOfGames("train") envs = [make_env_train(names[i], args.seed, i, args.log_dir) for i in range(len(names))] # TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO args.num_processes = len(envs) # REMEMBER YOU CHENGED IT if len(envs) > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape #print(obs_shape) obs_shape = (obs_shape[0], *obs_shape[1:]) #print(obs_shape) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) # Making it paralel actor_critic = torch.nn.parallel.DataParallel(actor_critic).module if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) # Make agent DataParallel agent = torch.nn.parallel.DataParallel(agent).module elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) # Make rollouts DataParallel rollouts = torch.nn.parallel.DataParallel(RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size)).module current_obs = torch.nn.parallel.DataParallel(torch.zeros(envs.nenvs, *obs_shape)).module def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() # if args.num_stack > 1: # current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) next_value = actor_critic.get_value(Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True)).data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
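# Added sketch (not part of the original script): what rollouts.compute_returns(next_value,
# use_gae, gamma, tau) computes when GAE is enabled. Shapes follow the storage used above
# ([num_steps, num_processes, 1]); here masks[t] is 1 when the episode continues after step
# t, and `tau` plays the role of the GAE lambda. Illustrative only; the storage class may
# index its masks differently.
import torch

def gae_returns(rewards, values, masks, next_value, gamma=0.99, tau=0.95):
    """rewards/masks/values: [T, N, 1]; next_value: [N, 1]. Returns [T, N, 1]."""
    T = rewards.size(0)
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)   # append bootstrap value
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * tau * masks[t] * gae
        returns[t] = gae + values[t]
    return returns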
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monit`or (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' # logger = Logger(algorithm_name = args.algo, environment_name = args.env_name, folder = args.folder) # logger.save_args(args) # print ("---------------------------------------") # print ('Saving to', logger.save_folder) # print ("---------------------------------------") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) target_actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: actor_critic = MLPPolicy(obs_shape[0], envs.action_space) target_actor_critic = MLPPolicy(obs_shape[0], envs.action_space) for param, target_param in zip(actor_critic.parameters(), target_actor_critic.parameters()): target_param.data.copy_(param.data) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() actor_regularizer_criterion = nn.KLDivLoss() optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) """ Used for KL Constraint in case of Continuous Action Stochastic Policies """ # target_values, target_action_log_probs, target_dist_entropy, target_states, target_action_mean, target_action_std = target_actor_critic.evaluate_actions_mean_and_std(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), # Variable(rollouts.states[0].view(-1, actor_critic.state_size)), # Variable(rollouts.masks[:-1].view(-1, 1)), # Variable(rollouts.actions.view(-1, action_shape))) # actor_regularizer_loss = (torch.log(action_std/target_action_std) + (action_std.pow(2) + (action_mean - target_action_mean).pow(2))/(2*target_action_std.pow(2)) - 0.5) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() ### Loss with regularizer added ##action_loss = -(Variable(advantages.data) * action_log_probs).mean() + args.actor_lambda * actor_regularizer_loss.mean(0).sum() action_loss = -(Variable(advantages.data) * action_log_probs).mean() optimizer.zero_grad() total_loss = value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef total_loss.backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() ## Exponential average for target updates #if (j%args.target_update_interval == 0): # for param, target_param in zip(actor_critic.parameters(), target_actor_critic.parameters()): # target_param.data.copy_(args.target_tau * param.data + (1 - args.target_tau) * target_param.data) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) 
final_rewards_mean = [final_rewards.mean()] final_rewards_median = [final_rewards.median()] final_rewards_min = [final_rewards.min()] final_rewards_max = [final_rewards.max()] # logger.record_data(final_rewards_mean, final_rewards_median, final_rewards_min, final_rewards_max) # logger.save() if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass
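# Added sketch (not part of the original script): the frame-stacking pattern behind
# update_current_obs() in the scripts above, factored into a standalone helper. The buffer
# keeps the last num_stack observations along the channel axis: shift left, then write the
# newest frame into the tail channels. The names and the .clone() guard are additions of
# this sketch, not the repo's code.
import torch

def stack_obs(current_obs, new_obs, frame_channels):
    """current_obs: [N, num_stack * C, H, W] tensor; new_obs: numpy array [N, C, H, W]."""
    new_obs = torch.from_numpy(new_obs).float()
    if current_obs.size(1) > frame_channels:   # only shift when num_stack > 1
        current_obs[:, :-frame_channels] = current_obs[:, frame_channels:].clone()
    current_obs[:, -frame_channels:] = new_obs
    return current_obs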
def distil(teacher, student, optimizer, envs_teacher, envs_student_train, envs_student_test): ''' Trains the student on the teachers soft targets Note assumes that we are just trying to match the actions of the teacher not the values of the critic? ''' losses = [] if args.vis: from visdom import Visdom viz = Visdom() win1 = [None] * args.num_heads #student reward plots win2 = None #loss plots obs_shape = envs_teacher[0].observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if envs_teacher[0].action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs_teacher[0].action_space.shape[0] teacher_storage = [] student_storage_train = [] student_storage_test = [] for i in range(args.num_heads): teacher_storage.append( get_storage(envs_teacher[i], args.num_steps, args.num_processes, obs_shape, envs_teacher[i].action_space)) student_storage_train.append( get_storage(envs_student_train[i], args.num_steps, args.num_processes, obs_shape, envs_student_train[i].action_space)) student_storage_test.append( get_storage(envs_student_test[i], args.num_steps, args.num_processes, obs_shape, envs_student_test[i].action_space)) if args.cuda: for i in range(args.num_heads): teacher_storage[i]['current_obs'] = teacher_storage[i][ 'current_obs'].cuda() student_storage_train[i]['current_obs'] = student_storage_train[i][ 'current_obs'].cuda() student_storage_test[i]['current_obs'] = student_storage_test[i][ 'current_obs'].cuda() teacher_storage[i]['rollouts'].cuda() student_storage_train[i]['rollouts'].cuda() student_storage_test[i]['rollouts'].cuda() start = time.time() teacher_student_prob = [ 1 - args.frac_student_rollouts, args.frac_student_rollouts ] for j in range(num_updates): head = np.random.randint(args.num_heads) roll = np.random.choice(2, p=teacher_student_prob) #print('j: %d, Head: %d, Roll: %d'%(j,head, roll)) if roll == 1: # use student trajectory sample_rollouts(student, envs_student_train[head], student_storage_train[head], head) rollouts = student_storage_train[head]['rollouts'] else: # use teacher trajectory sample_rollouts(teacher[head], envs_teacher[head], teacher_storage[head]) rollouts = teacher_storage[head]['rollouts'] next_value = teacher[head]( Variable(rollouts.observations[-1], volatile=True))[0].data # value function # no clue what this does if hasattr(teacher[head], 'obs_filter'): teacher[head].obs_filter.update(rollouts.observations[:-1].view( -1, *obs_shape)) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) # get loss and take grad step on student params loss = get_loss(student, teacher[head], rollouts, obs_shape, head) losses.append(loss.data.cpu().numpy()) optimizer.zero_grad() loss.backward() optimizer.step() if (j + 1) % args.save_interval == 0 and args.save_dir != "": save_checkpoint(student, optimizer, j) save_data(losses) # collect test trajectories sample_rollouts(student, envs_student_test[head], student_storage_test[head], head) student_next_value_test = student( Variable(student_storage_test[head]['rollouts'].observations[-1], volatile=True))[0].data # value function if hasattr(student, 'obs_filter'): student.obs_filter.update( student_storage_test[head]['rollouts'].observations[:-1].view( -1, *obs_shape)) student_storage_test[head]['rollouts'].compute_returns( student_next_value_test, args.use_gae, args.gamma, args.tau) # log student performance if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps for head in range(args.num_heads): 
print( "Head {} : Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, loss {:.5f}" .format( head, j, total_num_steps, int(total_num_steps / (end - start)), student_storage_test[head]['final_rewards'].mean(), student_storage_test[head]['final_rewards'].median(), student_storage_test[head]['final_rewards'].min(), student_storage_test[head]['final_rewards'].max(), loss.data[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs print('visualizing') for head in range(args.num_heads): win1[head] = visdom_plot(viz, win1[head], log_dir_student_test[head], args.env_name[head], 'Distillation Reward for Student') win2 = visdom_data_plot(viz, win2, args.env_name, 'Distillation Loss Plot', losses, 'loss') except IOError: pass
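# Added sketch (not part of the original script): one common form of the distillation
# objective that get_loss() above could be computing: a temperature-scaled KL divergence
# between the teacher's and student's action distributions. get_loss() itself is not shown
# here, so treat this as an assumption about its shape rather than its actual implementation.
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, temperature=1.0):
    """KL(teacher || student), averaged over the batch."""
    teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)
    student_log_probs = F.log_softmax(student_logits / temperature, dim=-1)
    return F.kl_div(student_log_probs, teacher_probs,
                    reduction='batchmean') * (temperature ** 2)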
def main(): print("#######") print( "WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = SubprocVecEnv([ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ]) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space) else: actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space) current_state = torch.zeros(args.num_processes, *obs_shape) def update_current_state(state): shape_dim0 = envs.observation_space.shape[0] state = torch.from_numpy(state).float() if args.num_stack > 1: current_state[:, :-shape_dim0] = current_state[:, shape_dim0:] current_state[:, -shape_dim0:] = state state = envs.reset() update_current_state(state) rollouts.states[0].copy_(current_state) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_state = current_state.cuda() rollouts.cuda() if args.algo == 'ppo': old_model = copy.deepcopy(actor_critic) for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action = actor_critic.act( Variable(rollouts.states[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next state state, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_state.dim() == 4: current_state *= masks.unsqueeze(2).unsqueeze(2) else: current_state *= masks update_current_state(state) rollouts.insert(step, current_state, action.data, value.data, reward, masks) next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data if hasattr(actor_critic, 'obs_filter'): actor_critic.obs_filter.update(rollouts.states[:-1].view( -1, *obs_shape)) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy = actor_critic.evaluate_actions( Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) old_model.load_state_dict(actor_critic.state_dict()) if hasattr(actor_critic, 'obs_filter'): old_model.obs_filter = actor_critic.obs_filter for _ in range(args.ppo_epoch): sampler = BatchSampler(SubsetRandomSampler( range(args.num_processes * args.num_steps)), args.batch_size * args.num_processes, drop_last=False) for indices in sampler: indices = torch.LongTensor(indices) if args.cuda: indices = indices.cuda() states_batch = rollouts.states[:-1].view( -1, *obs_shape)[indices] actions_batch = rollouts.actions.view( -1, action_shape)[indices] return_batch = rollouts.returns[:-1].view(-1, 1)[indices] # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy = actor_critic.evaluate_actions( Variable(states_batch), Variable(actions_batch)) _, old_action_log_probs, _ = old_model.evaluate_actions( Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True)) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data)) adv_targ = Variable(advantages.view(-1, 1)[indices]) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() optimizer.step() 
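# Added sketch (not part of the original script): the minibatch index sampling that the PPO
# branch above builds with BatchSampler(SubsetRandomSampler(...)): shuffle a flat index over
# the num_steps * num_processes transitions and yield it in chunks. The commented usage
# lines show how the flattened rollout tensors would be sliced.
import torch

def minibatch_indices(num_steps, num_processes, batch_size):
    total = num_steps * num_processes
    perm = torch.randperm(total)
    for start in range(0, total, batch_size):
        yield perm[start:start + batch_size]

# for idx in minibatch_indices(args.num_steps, args.num_processes, 256):
#     states_batch = rollouts.states[:-1].view(-1, *obs_shape)[idx]
#     actions_batch = rollouts.actions.view(-1, action_shape)[idx]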
rollouts.states[0].copy_(rollouts.states[-1]) if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: print( "Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, (j + 1) * args.num_processes * args.num_steps, final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), -dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if args.vis and j % args.vis_interval == 0: win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo)
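A minimal, self-contained sketch of the frame-stacking and episode-mask bookkeeping used in the loop above; the shapes and names (`num_stack`, `frame_channels`) are illustrative, not taken from the training script.

import numpy as np
import torch

num_processes, num_stack, frame_channels = 4, 4, 1
# Stacked observation buffer: the newest frame occupies the last `frame_channels` slots.
current_state = torch.zeros(num_processes, num_stack * frame_channels, 84, 84)

def update_current_state(state_np):
    """Shift the stack left by one frame and write the newest frame at the end."""
    state = torch.from_numpy(state_np).float()
    if num_stack > 1:
        current_state[:, :-frame_channels] = current_state[:, frame_channels:]
    current_state[:, -frame_channels:] = state

def apply_done_masks(done):
    """Zero the whole stack for processes whose episode just ended."""
    masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done])
    if current_state.dim() == 4:
        current_state.mul_(masks.unsqueeze(2).unsqueeze(2))
    else:
        current_state.mul_(masks)
    return masks

# Example step:
update_current_state(np.random.rand(num_processes, frame_channels, 84, 84))
masks = apply_done_masks([False, True, False, False])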
def eval_pomme( saved_models='train=simple-config=ffa_v0-model=convnet-agent=0.pt'): os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom(server=args.server, port=8097) # viz = Visdom(port=args.port) win = None # Instantiate the environment config = getattr(configs, args.config)() # We make this in order to get the shapes. dummy_env = make_env(args, config, -1, [config['agent'](game_type=config['game_type'])])() envs_shape = dummy_env.observation_space.shape[1:] obs_shape = (envs_shape[0], *envs_shape[1:]) action_space = dummy_env.action_space if len(envs_shape) == 3: if args.model == 'convnet': actor_critic = lambda saved_model: PommeCNNPolicySmall( obs_shape[0], action_space, args) elif args.model == 'resnet': actor_critic = lambda saved_model: PommeResnetPolicy( obs_shape[0], action_space, args) else: actor_critic = lambda saved_model: MLPPolicy(obs_shape[0], action_space ) # TODO: this only works for simple - need a list of checkpoints for self-play # We need to get the agent = config.agent(agent_id, config.game_type) and then # pass that agent into the agent.PPOAgent training_agents = [] # TODO: this is a bit hacky and doesn't work for more than 1 model # saved_models = args.saved_models save_path = os.path.join(args.save_dir) saved_models = [os.path.join(save_path, saved_models)] # saved_models = saved_models.split(',') if saved_models else [None]*args.nagents assert (len(saved_models)) == args.nagents if len(envs_shape) == 3: if args.model == 'convnet': actor_critic_model = PommeCNNPolicySmall(obs_shape[0], action_space, args) elif args.model == 'resnet': actor_critic_model = PommeResnetPolicy(obs_shape[0], action_space, args) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic_model = MLPPolicy(obs_shape[0], action_space) print("****") for saved_model in saved_models: # TODO: implement the model loading. loaded_model = torch.load(saved_model) print("epoch of model {} is: {}".format(saved_model, loaded_model['epoch'])) loaded_actor_critic_model = actor_critic_model.load_state_dict( loaded_model['state_dict']) model = actor_critic(loaded_actor_critic_model) model.eval() agent = config['agent'](game_type=config['game_type']) agent = ppo_agent.PPOAgent(agent, model) training_agents.append(agent) print("****") if args.how_train == 'simple': # Simple trains a single agent against three SimpleAgents. assert ( args.nagents == 1), "Simple training should have a single agent." num_training_per_episode = 1 elif args.how_train == 'homogenous': # Homogenous trains a single agent against itself (self-play). assert (args.nagents == 1 ), "Homogenous toraining should have a single agent." num_training_per_episode = 4 elif args.how_train == 'heterogenous': assert (args.nagents > 1), "Heterogenous training should have more than one agent." print("Heterogenous training is not implemented yet.") return # NOTE: Does this work correctly? Will the threads operate independently? 
envs = [ make_env(args, config, i, training_agents) for i in range(args.num_processes) ] envs = SubprocVecEnv(envs) if args.num_processes > 1 else DummyVecEnv(envs) for agent in training_agents: agent.initialize(args, obs_shape, action_space, num_training_per_episode) current_obs = torch.zeros(num_training_per_episode, args.num_processes, *obs_shape) def update_current_obs(obs): current_obs = torch.from_numpy(obs).float() obs = envs.reset() update_current_obs(obs) if args.how_train == 'simple': training_agents[0].update_rollouts(obs=current_obs, timestep=0) elif args.how_train == 'homogenous': training_agents[0].update_rollouts(obs=current_obs, timestep=0) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros( [num_training_per_episode, args.num_processes, 1]) final_rewards = torch.zeros( [num_training_per_episode, args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() for agent in training_agents: agent.cuda() start = time.time() for j in range(args.num_steps_eval): for step in range(args.num_steps): value_agents = [] action_agents = [] action_log_prob_agents = [] states_agents = [] episode_reward = [] cpu_actions_agents = [] if args.how_train == 'simple': value, action, action_log_prob, states = training_agents[ 0].act_pytorch(step, 0) value_agents.append(value) action_agents.append(action) action_log_prob_agents.append(action_log_prob) states_agents.append(states) cpu_actions = action.data.squeeze(1).cpu().numpy() cpu_actions_agents = cpu_actions elif args.how_train == 'homogenous': cpu_actions_agents = [[] for _ in range(args.num_processes)] for i in range(4): value, action, action_log_prob, states = training_agents[ 0].act_pytorch(step, i) value_agents.append(value) action_agents.append(action) action_log_prob_agents.append(action_log_prob) states_agents.append(states) cpu_actions = action.data.squeeze(1).cpu().numpy() for num_process in range(args.num_processes): cpu_actions_agents[num_process].append( cpu_actions[num_process]) obs, reward, done, info = envs.step(cpu_actions_agents) reward = torch.from_numpy(np.stack(reward)).float().transpose(0, 1) episode_rewards += reward if args.how_train == 'simple': masks = torch.FloatTensor( [[0.0] * num_training_per_episode if done_ else [1.0] * num_training_per_episode for done_ in done]) elif args.how_train == 'homogenous': masks = torch.FloatTensor( [[0.0] * num_training_per_episode if done_ else [1.0] * num_training_per_episode for done_ in done]).transpose(0, 1) masks = torch.FloatTensor( [[0.0] * num_training_per_episode if done_ else [1.0] * num_training_per_episode for done_ in done]).transpose(0, 1) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() reward_all = reward.unsqueeze(2) masks_all = masks.unsqueeze(2) if args.how_train == 'simple': masks_all = masks.transpose(0, 1).unsqueeze(2) elif args.how_train == 'homogenous': masks_all = masks.unsqueeze(2) current_obs *= masks_all.unsqueeze(2).unsqueeze(2) update_current_obs(obs) states_all = torch.from_numpy( np.stack([x.data for x in states_agents])).float() action_all = torch.from_numpy( np.stack([x.data for x in action_agents])).float() action_log_prob_all = torch.from_numpy( np.stack([x.data for x in action_log_prob_agents])).float() value_all = torch.from_numpy( np.stack([x.data for x in value_agents])).float() if args.how_train in ['simple', 'homogenous']: training_agents[0].insert_rollouts(step, current_obs, states_all, action_all, 
action_log_prob_all, value_all, reward_all, masks_all) if step % args.log_interval == 0: print("step ", step) end = time.time() total_num_steps = (step + 1) * args.num_processes * args.num_steps_eval final_rewards_tr = torch.zeros( [args.num_processes, args.nagents, 1]) final_rewards_tr.copy_(final_rewards) final_rewards_tr = final_rewards_tr.view(args.num_processes, args.nagents).transpose( 0, 1) for i in range(args.nagents): print("agent # ", i) print( "Updates {}, Agent {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}" .format(step, i, total_num_steps, int(total_num_steps / (end - start)), final_rewards_tr[i].mean(), final_rewards_tr[i].median(), final_rewards_tr[i].min(), final_rewards_tr[i].max()), "\n") print("\n") if args.vis and step % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name) except IOError: pass
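The `episode_rewards` / `final_rewards` bookkeeping used above (and in the other training loops in this file) is the usual mask trick; a small standalone sketch of just that logic, with illustrative shapes.

import torch

num_processes = 3
episode_rewards = torch.zeros(num_processes, 1)  # running return of the current episode
final_rewards = torch.zeros(num_processes, 1)    # return of the last finished episode

def update_reward_stats(reward, done):
    """reward: (num_processes, 1) tensor; done: list of bools, one per process."""
    global episode_rewards, final_rewards
    masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done])
    episode_rewards += reward
    final_rewards *= masks                          # keep the old value only while an episode is still running
    final_rewards += (1 - masks) * episode_rewards  # copy the finished return where done
    episode_rewards *= masks                        # reset the running return for finished episodes
    return masks

update_reward_stats(torch.tensor([[1.0], [2.0], [0.5]]), [False, True, False])
print(final_rewards)  # only process 1 has a finished return recorded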
def main(): # Tensorboard Setup import tensorflow as tf import datetime # limit tf memory physical_devices = tf.config.list_physical_devices('GPU') try: tf.config.experimental.set_memory_growth(physical_devices[0], True) except: # Invalid device or cannot modify virtual devices once initialized. pass # setup tensorboard if args.tb_dir == 'tb': tb_log_dir = os.path.join( args.tb_dir, args.algo, datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) else: tb_log_dir = os.path.join('tb', args.tb_dir) tb_summary_writer = tf.summary.create_file_writer(tb_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) meta = True if args.algo == 'ppometa' else False actor_critic = Policy(envs.observation_space.shape, envs.action_space, meta, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) elif args.algo == 'ppometa': agent = algo.PPOMeta(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=100) start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step], rollouts.actions[step], rollouts.prev_rewards[step], rollouts.prev_actions[step], rollouts.infos[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) """ for info in infos: if 'episode' in info.keys(): print(reward) episode_rewards.append(info['episode']['r']) """ # FIXME: works only for environments with sparse rewards for idx, eps_done in enumerate(done): if eps_done: episode_rewards.append(reward[idx]) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts_infos = None if len(infos) > 0 and 'box' in infos[0].keys( ) and 'agent' in infos[0].keys(): rollouts_infos = [] for info in infos: rollouts_infos.append( np.concatenate([info['box'].pos, info['agent'].pos])) rollouts_infos = torch.tensor(rollouts_infos) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, rollouts_infos) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1], rollouts.actions[-1], rollouts.prev_rewards[-1], rollouts.prev_actions[-1], rollouts.infos[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j > 1 and j % args.save_interval == 0 and args.save_dir != "": print('Saving model') print() save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None ] torch.save( save_model, os.path.join(save_path, args.env_name + "_" + str(j) + ".pt")) # torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j > 1 and j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n" .format( j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), np.count_nonzero(np.greater(episode_rewards, 0)) / len(episode_rewards))) with tb_summary_writer.as_default(): tf.summary.scalar('mean reward', np.mean(episode_rewards), step=total_num_steps) tf.summary.scalar('median reward', np.median(episode_rewards), step=total_num_steps) tf.summary.scalar( 'success rate', np.count_nonzero(np.greater(episode_rewards, 0)) / len(episode_rewards), step=total_num_steps) if args.eval_interval is not None and len( episode_rewards) > 1 and j % args.eval_interval == 0: eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) if eval_envs.venv.__class__.__name__ == "VecNormalize": eval_envs.venv.ob_rms = envs.venv.ob_rms # An ugly hack to remove updates def _obfilt(self, obs): if self.ob_rms: obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) return obs else: return obs eval_envs.venv._obfilt = types.MethodType(_obfilt, envs.venv) eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 
'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) with tb_summary_writer.as_default(): tf.summary.scalar('eval mean reward', np.mean(eval_episode_rewards), step=total_num_steps) tf.summary.scalar('eval median reward', np.median(eval_episode_rewards), step=total_num_steps) tf.summary.scalar( 'eval success rate', np.count_nonzero(np.greater(eval_episode_rewards, 0)) / len(eval_episode_rewards), step=total_num_steps) if j > 1 and args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, total_num_steps) except IOError: pass envs.close()
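The TensorBoard logging above uses the TF2 summary API; a stripped-down sketch of that pattern, with dummy values and a hypothetical log directory (the metric names mirror the ones logged above).

import datetime
import os
import numpy as np
import tensorflow as tf

log_dir = os.path.join('tb', 'example', datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
writer = tf.summary.create_file_writer(log_dir)

episode_rewards = [1.0, -1.0, 1.0, 1.0]   # dummy data
total_num_steps = 10000

with writer.as_default():
    tf.summary.scalar('mean reward', np.mean(episode_rewards), step=total_num_steps)
    tf.summary.scalar('median reward', np.median(episode_rewards), step=total_num_steps)
    tf.summary.scalar('success rate',
                      np.count_nonzero(np.greater(episode_rewards, 0)) / len(episode_rewards),
                      step=total_num_steps)
writer.flush()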
def main(): print("#######") print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards") print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = [] win_dic ={} for i in range(len(mt_env_id_dic_selected)): win += [None] win_afs_per_m = None win_afs_loss = None win_basic_loss = None plot_dic = {} envs = [] ''' Because the oral program has only one game per model, so Song add loop i So whatever you wanna run , just put in SubprocVecEnvMt! ''' for i in range(len(mt_env_id_dic_selected)): log_dir = args.log_dir+mt_env_id_dic_selected[i]+'/' for j in range(args.num_processes): envs += [make_env(mt_env_id_dic_selected[i], args.seed, j, log_dir)] ''' This envs is an intergration of all the running env''' envs = SubprocVecEnvMt(envs) num_processes_total = args.num_processes * len(mt_env_id_dic_selected) '''(1,128,128)''' obs_shape = envs.observation_space.shape #num_stack :number of frames to stack obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) from arguments import is_restore if is_restore and args.save_dir: load_path = os.path.join(args.save_dir, args.algo) actor_critic =torch.load(os.path.join(load_path, args.env_name + ".pt")) # print ("restored previous model!") # print (actor_critic.Variable) # print (sss) else: if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space) else: actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) #'args.num_steps: number of forward steps in A2C #rollouts is an intergration of state\ reward\ next state\action and so on rollouts = RolloutStorage(args.num_steps, num_processes_total, obs_shape, envs.action_space) current_state = torch.zeros(num_processes_total, *obs_shape) ''' not sure about it''' def update_current_state(state): shape_dim0 = envs.observation_space.shape[0] # print (shape_dim0) # print (sss) state = torch.from_numpy(state).float() if args.num_stack > 1: current_state[:, :-shape_dim0] = current_state[:, shape_dim0:] current_state[:, -shape_dim0:] = state state = envs.reset() update_current_state(state) rollouts.states[0].copy_(current_state) # These variables are used to compute average rewards for all processes. 
episode_rewards = torch.zeros([num_processes_total, 1]) final_rewards = torch.zeros([num_processes_total, 1]) if args.cuda: current_state = current_state.cuda() rollouts.cuda() if args.algo == 'ppo': old_model = copy.deepcopy(actor_critic) from arguments import ewc, ewc_lambda, ewc_interval afs_per_m = [] afs_offset = [0.0]*gtn_M afs_loss_list = [] basic_loss_list = [] episode_reward_rec = 0.0 one = torch.FloatTensor([1]).cuda() mone = one * -1 '''for one whole game ''' for j in range(num_updates): for step in range(args.num_steps): if ewc == 1: try: states_store = torch.cat([states_store, rollouts.states[step].clone()], 0) except Exception as e: states_store = rollouts.states[step].clone() # Sample actions '''act fun refer to "observe it!"''' value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next state state, reward, done = envs.step(cpu_actions) '''record the last 100 episodes rewards''' episode_reward_rec += reward episode_reward_rec = rec_last_100_epi_reward(episode_reward_rec,done) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() '''reward is shape of process_num_total, not batch-size''' # print ((reward).size()) # print (done) # print (sss) episode_rewards += reward ################ # rec_last_100_epi_reward(reward,done) # episode_reward_ppo += reward[0] # If done then clean the history of observations. final_rewards is used for compute after one whole num_step masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_state.dim() == 4: current_state *= masks.unsqueeze(2).unsqueeze(2) else: current_state *= masks update_current_state(state) rollouts.insert(step, current_state, action.data, value.data, reward, masks) next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data if hasattr(actor_critic, 'obs_filter'): actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape)) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: # reset gradient optimizer.zero_grad() # forward values, action_log_probs, dist_entropy, conv_list = actor_critic.evaluate_actions(Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape))) # pre-process values = values.view(args.num_steps, num_processes_total, 1) action_log_probs = action_log_probs.view(args.num_steps, num_processes_total, 1) # compute afs loss afs_per_m_temp, afs_loss = actor_critic.get_afs_per_m( action_log_probs=action_log_probs, conv_list=conv_list, ) if len(afs_per_m_temp)>0: afs_per_m += [afs_per_m_temp] if (afs_loss is not None) and (afs_loss.data.cpu().numpy()[0]!=0.0): afs_loss.backward(mone, retain_graph=True) afs_loss_list += [afs_loss.data.cpu().numpy()[0]] advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() final_loss_basic = value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef ewc_loss = None if j != 0: if ewc == 1: ewc_loss = actor_critic.get_ewc_loss(lam=ewc_lambda) if ewc_loss is None: final_loss = final_loss_basic else: final_loss = final_loss_basic + ewc_loss # print (final_loss_basic.data.cpu().numpy()[0]) # final_loss_basic basic_loss_list += [final_loss_basic.data.cpu().numpy()[0]] 
final_loss.backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) old_model.load_state_dict(actor_critic.state_dict()) if hasattr(actor_critic, 'obs_filter'): old_model.obs_filter = actor_critic.obs_filter for _ in range(args.ppo_epoch): sampler = BatchSampler(SubsetRandomSampler(range(num_processes_total * args.num_steps)), args.batch_size * num_processes_total, drop_last=False) for indices in sampler: indices = torch.LongTensor(indices) if args.cuda: indices = indices.cuda() states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices] actions_batch = rollouts.actions.view(-1, action_shape)[indices] return_batch = rollouts.returns[:-1].view(-1, 1)[indices] # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, conv_list = actor_critic.evaluate_actions(Variable(states_batch), Variable(actions_batch)) _, old_action_log_probs, _, old_conv_list= old_model.evaluate_actions(Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True)) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data)) adv_targ = Variable(advantages.view(-1, 1)[indices]) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() final_loss_basic = (value_loss + action_loss - dist_entropy * args.entropy_coef) basic_loss_list += [final_loss_basic.data.cpu().numpy()[0]] final_loss_basic.backward() optimizer.step() rollouts.states[0].copy_(rollouts.states[-1]) # if j % int(num_updates/2-10) == 0 and args.save_dir != "": if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) import pickle with open(os.path.join(save_path, args.env_name + "_last_100_reward"), "wb") as f: pickle.dump(reward_dict, f) if j % args.log_interval == 0: print("Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". format(j, (j + 1) * args.num_processes * args.num_steps, final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), -dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) try: print("ewc loss {:.5f}". 
format(ewc_loss.data.cpu().numpy()[0])) except Exception as e: pass if j > 5 and j % args.vis_interval == 0 and args.vis: ''' load from the folder''' for ii in range(len(mt_env_id_dic_selected)): log_dir = args.log_dir+mt_env_id_dic_selected[ii]+'/' win[ii] = visdom_plot(viz, win[ii], log_dir, mt_env_id_dic_selected[ii], args.algo) plot_dic = reward_dict for plot_name in plot_dic.keys(): # if plot_name not in win_dic: # win_dic[plot_name] = None if plot_name in win_dic.keys(): if len(plot_dic[plot_name]) > 0: win_dic[plot_name] = viz.line( torch.from_numpy(np.asarray(plot_dic[plot_name])), win=win_dic[plot_name], opts=dict(title=break_line_html(exp+'>>'+plot_name)) ) else: win_dic[plot_name] = None if len(afs_per_m)>0: win_afs_per_m = viz.line( torch.from_numpy(np.asarray(afs_per_m)), win=win_afs_per_m, opts=dict(title=title_html+'>>afs') ) # print (basic_loss_list) '''a2c:len(basic_loss_list) is vis_interval+1. because j start from 0 ppo:len(basic_loss_list) is (vis_interval+1)*ppo_epoch_4*len(BatchSampler) ''' # print (len(basic_loss_list)) # print (ss) win_basic_loss = viz.line( torch.from_numpy(np.asarray(basic_loss_list)), win=win_basic_loss, opts=dict(title=title_html+'>>basic_loss') ) if len(afs_loss_list) > 0: win_afs_loss = viz.line( torch.from_numpy(np.asarray(afs_loss_list)), win=win_afs_loss, opts=dict(title=title_html+'>>afs_loss') ) from arguments import parameter_noise, parameter_noise_interval if parameter_noise == 1: if j % parameter_noise_interval == 0: actor_critic.parameter_noise() if ewc == 1: if j % ewc_interval == 0 or j==0: actor_critic.compute_fisher(states_store) states_store = None actor_critic.star()
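The `compute_fisher` / `get_ewc_loss` / `star` methods called above are not shown in this listing; the sketch below is a generic elastic-weight-consolidation penalty in the same spirit (diagonal Fisher estimated from squared gradients of the policy log-probability, then a quadratic penalty around the anchored parameters). All class and function names here are illustrative, not the repository's API.

import torch
import torch.nn as nn

class EWC:
    """Generic diagonal-Fisher EWC penalty for any nn.Module."""

    def __init__(self, model, lam):
        self.model = model
        self.lam = lam
        self.fisher = {}
        self.anchor = {}

    def compute_fisher(self, log_prob_fn, states):
        """log_prob_fn(state) must return a scalar log-probability to differentiate."""
        fisher = {n: torch.zeros_like(p) for n, p in self.model.named_parameters()}
        for s in states:
            self.model.zero_grad()
            log_prob_fn(s).backward()
            for n, p in self.model.named_parameters():
                if p.grad is not None:
                    fisher[n] += p.grad.detach() ** 2
        self.fisher = {n: f / max(len(states), 1) for n, f in fisher.items()}

    def star(self):
        """Anchor the current parameters as theta*, after finishing a task."""
        self.anchor = {n: p.detach().clone() for n, p in self.model.named_parameters()}

    def penalty(self):
        """Quadratic penalty lam/2 * sum_i F_i (theta_i - theta*_i)^2."""
        loss = 0.0
        for n, p in self.model.named_parameters():
            if n in self.fisher and n in self.anchor:
                loss = loss + (self.fisher[n] * (p - self.anchor[n]) ** 2).sum()
        return self.lam / 2.0 * loss

Usage mirrors the loop above: estimate the Fisher from stored rollout states at the EWC interval, call star(), and add penalty() to the basic A2C/PPO loss on later tasks.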
def main(): saved_model = os.path.join(args.save_dir, args.env_name + '.pt') if os.path.exists(saved_model) and not args.overwrite: actor_critic, ob_rms = \ torch.load(saved_model) agent = \ torch.load(os.path.join(args.save_dir, args.env_name + '_agent.pt')) for i in agent.optimizer.state_dict(): print(dir(agent.optimizer)) print(getattr(agent.optimizer, 'steps')) print(agent.optimizer.state_dict()[i]) past_steps = agent.optimizer.steps else: actor_critic = False agent = False past_steps = 0 try: os.makedirs(args.log_dir) except OSError: files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv')) for f in files: os.remove(f) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None win_eval = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False, None, args=args) if actor_critic: pass # vec_norm = get_vec_normalize(envs) # if vec_norm is not None: # vec_norm.eval() # vec_norm.ob_rms = ob_rms else: actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'map_width': args.map_width, 'num_actions': 18, 'recurrent': args.recurrent_policy}, curiosity=args.curiosity, algo=args.algo, model=args.model, args=args) actor_critic.to(device) evaluator = None if not agent: if args.algo == 'a2c': agent = algo.A2C_ACKTR_NOREWARD(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm, curiosity=args.curiosity, args=args) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR_NOREWARD(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm, acktr=True, curiosity=args.curiosity, args=args) if args.curiosity: rollouts = CuriosityRolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, actor_critic.base.feature_state_size(), args=args) else: rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, args=args) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() for j in range(num_updates - past_steps): if args.drop_path: actor_critic.base.get_drop_path() player_act = None for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_probs, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step], player_act=player_act, icm_enabled=args.curiosity) # Observe reward and next obs obs, reward, done, infos = envs.step(action) player_act = None if args.render: if infos[0]: if 'player_move' in infos[0].keys(): player_act = infos[0]['player_move'] if args.curiosity: # run icm with torch.no_grad(): feature_state, feature_state_pred, action_dist_pred = actor_critic.icm_act( (rollouts.obs[step], obs, action_bin) ) intrinsic_reward = args.eta * ((feature_state - feature_state_pred).pow(2)).sum() / 2. 
if args.no_reward: reward = 0 reward += intrinsic_reward.cpu() for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) if args.curiosity: rollouts.insert(obs, recurrent_hidden_states, action, action_log_probs, value, reward, masks, feature_state, feature_state_pred, action_bin, action_dist_pred) else: rollouts.insert(obs, recurrent_hidden_states, action, action_log_probs, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.curiosity: value_loss, action_loss, dist_entropy, fwd_loss, inv_loss = agent.update(rollouts) else: value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model, getattr(get_vec_normalize(envs), 'ob_rms', None)] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) save_agent = copy.deepcopy(agent) torch.save(save_agent, os.path.join(save_path, args.env_name + '_agent.pt')) #torch.save(actor_critic.state_dict(), os.path.join(save_path, args.env_name + "_weights.pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if not dist_entropy: dist_entropy = 0 if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n \ dist entropy {:.1f}, val/act loss {:.1f}/{:.1f},". format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if args.curiosity: print("fwd/inv icm loss {:.1f}/{:.1f}\n". format( fwd_loss, inv_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): if evaluator is None: evaluator = Evaluator(args, actor_critic, device) if args.model == 'fractal': n_cols = evaluator.actor_critic.base.n_cols for i in range(-1, n_cols): evaluator.evaluate(column=i) #num_eval_frames = (args.num_frames // (args.num_steps * args.eval_interval * args.num_processes)) * args.num_processes * args.max_step win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, args.env_name, args.algo, args.num_frames, n_graphs=args.n_recs) else: evaluator.evaluate(column=None) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
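The curiosity bonus above is the usual forward-model prediction error, scaled as eta * ||phi(s') - phi_hat(s')||^2 / 2; below is a compact sketch of that computation with made-up module names (the repository's `icm_act` internals are not shown here).

import torch
import torch.nn as nn

class ForwardModel(nn.Module):
    """Predicts the next feature vector from the current features and the action."""
    def __init__(self, feature_size, num_actions, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(feature_size + num_actions, hidden),
            nn.ReLU(),
            nn.Linear(hidden, feature_size))

    def forward(self, phi, action_onehot):
        return self.net(torch.cat([phi, action_onehot], dim=1))

def intrinsic_reward(forward_model, phi, phi_next, action_onehot, eta=0.01):
    """Per-process curiosity bonus: eta/2 * squared feature prediction error."""
    with torch.no_grad():
        phi_pred = forward_model(phi, action_onehot)
        return eta * 0.5 * (phi_next - phi_pred).pow(2).sum(dim=1, keepdim=True)

# toy usage
fm = ForwardModel(feature_size=32, num_actions=6)
phi = torch.randn(4, 32)
phi_next = torch.randn(4, 32)
a = torch.eye(6)[torch.randint(0, 6, (4,))]
print(intrinsic_reward(fm, phi, phi_next, a).shape)  # torch.Size([4, 1])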
def run(number_of_workers, log_dir, vis_title): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") print("#######") print("num_updates: {}".format(num_updates)) print("#######") try: os.makedirs(log_dir) except OSError: files = glob.glob(os.path.join(log_dir, '*.monitor.csv')) for f in files: os.remove(f) torch.set_num_threads(1) if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None # Done: change make_env behaviour such that simple env is created; see custom_envs.py # args.env_name has to start with ng_ currently only WorkerMaintenanceEnv is working env_config = ENV_CONFIG.copy() # env_config['path_to_keras_expert_model'] = args.path_to_keras_expert_model env_config['number_of_workers'] = number_of_workers env_config['enable_0action_boost'] = args.enable_0action_boost envs = [ make_env(args.env_name, args.seed, i, log_dir, args.add_timestep, env_config) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs, ob=not args.disable_env_normalize_ob, ret=not args.disable_env_normalize_rw, gamma=args.gamma) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) # Done: 2018/06/24. change Model in Policy to LSTM/GRU model (ref. CNN with gru); see model.py print("#######") print("action space.n : {}".format(envs.action_space.n)) print("#######") actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. 
episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, states = actor_critic.act( rollouts.observations[step], rollouts.states[step], rollouts.masks[step]) cpu_actions = action.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks) if args.enable_debug_info_print: print("#####") print("cpu_action: {}".format(cpu_actions)) print("envs reward: {}".format(reward)) print("info stats reward: {}".format( info[0]["stats_relative_reward_regret"] + info[0]["stats_relative_reward_penalty"])) print("final_rewards after masks: {}".format(final_rewards)) print( "episode_rewards after masks: {}".format(episode_rewards)) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.observations[-1], rollouts.states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] model_name = "{}-{}-{}_w{}-{}.pt".format(args.env_name, args.algo, args.save_model_postfix, number_of_workers, j) torch.save(save_model, os.path.join(save_path, model_name)) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy, value_loss, action_loss)) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, log_dir, vis_title, args.algo, args.num_frames) except IOError: pass # save final policy save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None] model_name = "{}-{}-{}_w{}-final.pt".format(args.env_name, args.algo, args.save_model_postfix, number_of_workers) torch.save(save_model, os.path.join(save_path, model_name)) return True
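The `VecNormalize` wrapper used above keeps running statistics of the observations (and optionally the returns); a small sketch of the running-mean/std filter it relies on, with an illustrative clipping range.

import numpy as np

class RunningMeanStd:
    """Tracks the mean and variance of a stream of observation batches."""
    def __init__(self, shape):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = 1e-4

    def update(self, batch):
        batch_mean, batch_var, batch_count = batch.mean(axis=0), batch.var(axis=0), batch.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        self.mean += delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        self.var = (m_a + m_b + delta ** 2 * self.count * batch_count / total) / total
        self.count = total

def normalize_obs(obs, ob_rms, clipob=10.0, eps=1e-8):
    """Standardize and clip, the same shape of computation as _obfilt above."""
    return np.clip((obs - ob_rms.mean) / np.sqrt(ob_rms.var + eps), -clipob, clipob)

rms = RunningMeanStd(shape=(8,))
batch = np.random.randn(32, 8)
rms.update(batch)
print(normalize_obs(batch, rms).shape)  # (32, 8)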
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) obs_numel = reduce(operator.mul, obs_shape, 1) if len(obs_shape) == 3 and obs_numel > 1024: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_numel, envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): if args.recurrent_policy: data_generator = rollouts.recurrent_generator( advantages, args.num_mini_batch) else: data_generator = rollouts.feed_forward_generator( advantages, args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) 
except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass
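The PPO branch above computes the clipped surrogate inline; as a standalone reference, here is a sketch of that loss given log-probabilities under the new and old policies (tensor shapes are illustrative).

import torch

def ppo_losses(new_log_probs, old_log_probs, advantages, returns, values, clip_param=0.2):
    """Pessimistic clipped surrogate (L^CLIP) plus a squared-error value loss."""
    ratio = torch.exp(new_log_probs - old_log_probs.detach())
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    action_loss = -torch.min(surr1, surr2).mean()
    value_loss = (returns - values).pow(2).mean()
    return action_loss, value_loss

# toy check: identical policies give ratio == 1, so the surrogate reduces to -mean(advantage)
lp = torch.randn(16, 1)
adv = torch.randn(16, 1)
action_loss, value_loss = ppo_losses(lp, lp.clone(), adv, torch.randn(16, 1), torch.randn(16, 1))
assert torch.allclose(action_loss, -adv.mean())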
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") torch.set_num_threads(1) with open(args.eval_env_seeds_file, 'r') as f: eval_env_seeds = json.load(f) if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep) for i in range(args.num_processes) ] eval_dir = os.path.join(args.log_dir, "eval/") if not os.path.exists(eval_dir): os.makedirs(eval_dir) eval_env = [ make_env(args.env_name, args.seed, 0, eval_dir, args.add_timestep, early_resets=True) ] eval_env = DummyVecEnv(eval_env) if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs, gamma=args.gamma) if len(envs.observation_space.shape) == 1: # Don't touch rewards for evaluation eval_env = VecNormalize(eval_env, ret=False) # set running filter to be the same eval_env.ob_rms = envs.ob_rms obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, states = actor_critic.act( rollouts.observations[step], rollouts.states[step], rollouts.masks[step]) cpu_actions = action.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.observations[-1], rollouts.states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy, value_loss, action_loss)) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass validation_returns = evaluate_with_seeds(eval_env, actor_critic, args.cuda, eval_env_seeds) report_results([ dict(name='validation_return', type='objective', value=np.mean(validation_returns)) ])
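`evaluate_with_seeds` is not shown in this listing; below is a plausible sketch of deterministic evaluation over a fixed list of seeds, assuming an old-gym-style single environment (seed/reset/step) and an actor_critic whose act() mirrors the signature used in the training loop above. Treat it as an illustration, not the repository's implementation.

import numpy as np
import torch

def evaluate_with_seeds_sketch(env, actor_critic, cuda, seeds):
    """Run one deterministic episode per seed and return the list of episode returns."""
    returns = []
    device = torch.device("cuda:0" if cuda else "cpu")
    for seed in seeds:
        env.seed(seed)
        obs = env.reset()
        done, ep_return = False, 0.0
        states = torch.zeros(1, actor_critic.state_size, device=device)
        masks = torch.ones(1, 1, device=device)
        while not done:
            obs_t = torch.from_numpy(np.asarray(obs)).float().unsqueeze(0).to(device)
            with torch.no_grad():
                _, action, _, states = actor_critic.act(obs_t, states, masks,
                                                        deterministic=True)
            obs, reward, done, _ = env.step(action.squeeze(0).cpu().numpy())
            ep_return += float(reward)
        returns.append(ep_return)
    return returns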
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") torch.set_num_threads(1) if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() lmdb_idx = 0 try: os.makedirs(os.path.join(args.lmdb_path, args.env_name)) os.makedirs(os.path.join(args.lmdb_path, args.env_name, 'test')) except: print('Directory already exists.') for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, states = actor_critic.act( rollouts.observations[step], rollouts.states[step], rollouts.masks[step]) cpu_actions = action.squeeze(1).cpu().numpy() # Observe reward and next obs # obs, reward, done, info = envs.step(cpu_actions) '''unwrapped obs, reward''' obs, reward, done, info, wr_obs, wr_reward = envs.step(cpu_actions) # sample images # img = np.squeeze(np.transpose(obs[3], (1, 2, 0)), 2) for img, rwd in zip(wr_obs, wr_reward): if rwd > 0: lmdb_idx += 1 convert_to_lmdb( img, rwd, os.path.join(args.lmdb_path, args.env_name), lmdb_idx) # Evaluate unwrapped rewards # model = Model() # model.load(args.digit_checkpoint) # model.cuda() # accuracy = digit_eval(image, length_labels, digits_labels, model) # img.show() reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.observations[-1], rollouts.states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy, value_loss, action_loss)) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
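`convert_to_lmdb` is not part of this listing; the sketch below shows one plausible way such a helper could append an (image, reward) sample to an LMDB store with the `lmdb` package. The key scheme and value encoding are assumptions for illustration only.

import os
import pickle
import lmdb
import numpy as np

def convert_to_lmdb_sketch(img, reward, lmdb_dir, idx, map_size=1 << 30):
    """Store one sample under a zero-padded integer key; values are pickled dicts."""
    os.makedirs(lmdb_dir, exist_ok=True)
    # Opening/closing per sample keeps the sketch simple; a real helper would keep the env open.
    env = lmdb.open(lmdb_dir, map_size=map_size)
    sample = {'image': np.asarray(img), 'reward': float(reward)}
    with env.begin(write=True) as txn:
        txn.put('{:08d}'.format(idx).encode('ascii'), pickle.dumps(sample))
    env.close()

# usage: convert_to_lmdb_sketch(obs_frame, 1.0, 'data/env_name', lmdb_idx)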
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    train_envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                               args.gamma, args.no_norm, args.num_stack,
                               args.log_dir, args.add_timestep, device,
                               allow_early_resets=False)

    if args.eval_interval:
        eval_seed = args.seed if args.seed is None else args.seed + args.num_processes
        eval_envs = make_vec_envs(args.env_name, eval_seed,
                                  args.num_processes // 4, args.gamma,
                                  args.no_norm, args.num_stack,
                                  eval_log_dir, args.add_timestep,
                                  device=device, allow_early_resets=True,
                                  eval=True, rank_offset=args.num_processes)
        if eval_envs.venv.__class__.__name__ == "VecNormalize":
            eval_envs.venv.ob_rms = train_envs.venv.ob_rms
    else:
        eval_envs = None

    print(train_envs.observation_space.shape)

    noisy_net = True
    actor_critic = create_policy(
        train_envs.observation_space,
        train_envs.action_space,
        name='basic',
        nn_kwargs={
            # 'batch_norm': False if args.algo == 'acktr' else True,
            'recurrent': 'lstm' if args.recurrent_policy else '',
            'hidden_size': 512,
        },
        noisy_net=noisy_net,
        train=True)

    if args.resume and os.path.isfile(args.resume):
        print('Resuming from checkpoint (%s)' % args.resume)
        state_dict, ob_rms = torch.load(args.resume, map_location='cpu')
        actor_critic.load_state_dict(state_dict)

    actor_critic.to(device)

    if args.algo.startswith('a2c'):
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               lr_schedule=lr_update_schedule, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo.startswith('ppo'):
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr,
                         lr_schedule=lr_update_schedule, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    if args.algo.endswith('sil'):
        # Wrap the base agent with self-imitation learning and keep a replay
        # buffer of past transitions for the SIL updates.
        agent = algo.SIL(agent,
                         update_ratio=args.sil_update_ratio,
                         epochs=args.sil_epochs,
                         batch_size=args.sil_batch_size,
                         value_loss_coef=args.sil_value_loss_coef or args.value_loss_coef,
                         entropy_coef=args.sil_entropy_coef or args.entropy_coef)
        replay = ReplayStorage(1e5, args.num_processes, args.gamma, 0.1,
                               train_envs.observation_space.shape,
                               train_envs.action_space,
                               actor_critic.recurrent_hidden_state_size,
                               device=device)
    else:
        replay = None

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              train_envs.observation_space.shape,
                              train_envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = train_envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        if noisy_net:
            # Resample the NoisyNet layers' noise once per update.
            actor_critic.reset_noise()

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = train_envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in done],
                                 device=device)
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)
            if replay is not None:
                # Store the transition for later self-imitation updates.
                replay.insert(rollouts.obs[step],
                              rollouts.recurrent_hidden_states[step],
                              action, reward, done)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy, other_metrics = agent.update(
            rollouts, j, replay)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model.state_dict(),
                hasattr(train_envs.venv, 'ob_rms') and train_envs.venv.ob_rms or None
            ]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * update_factor

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {}, last {} mean/median reward {:.1f}/{:.1f}, "
                "min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value/action loss {:.5f}/{:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards),
                        np.mean(episode_rewards), np.median(episode_rewards),
                        np.min(episode_rewards), np.max(episode_rewards),
                        dist_entropy, value_loss, action_loss),
                end=', ' if other_metrics else '\n')
            if 'sil_value_loss' in other_metrics:
                print("SIL value/action loss {:.1f}/{:.1f}.".format(
                    other_metrics['sil_value_loss'],
                    other_metrics['sil_action_loss']))

        if (args.eval_interval and len(episode_rewards) > 1 and j > 0
                and j % args.eval_interval == 0):
            actor_critic.eval()

            eval_episode_rewards = []
            num_eval_processes = args.num_processes // 4

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                2, num_eval_processes,
                actor_critic.recurrent_hidden_state_size, device=device)
            eval_masks = torch.zeros(num_eval_processes, 1, device=device)

            while len(eval_episode_rewards) < 50:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in done],
                                          device=device)
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

            actor_critic.train()

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
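# --------------------------------------------------------------------------
# NOTE: the noisy_net path above relies on actor_critic.reset_noise(), which is
# provided by the policy built in create_policy() and is not shown in this
# file. The layer below is a minimal sketch of a factorised NoisyNet linear
# layer (Fortunato et al., 2018) that such a policy could be assembled from;
# the class name, sigma_init value, and initialisation scheme are illustrative
# assumptions, not the project's implementation.
import math
import torch
import torch.nn as nn


class NoisyLinear(nn.Module):
    """Linear layer with learnable, factorised Gaussian parameter noise."""

    def __init__(self, in_features, out_features, sigma_init=0.5):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.sigma_init = sigma_init
        # Learnable mean and noise-scale parameters.
        self.weight_mu = nn.Parameter(torch.empty(out_features, in_features))
        self.weight_sigma = nn.Parameter(torch.empty(out_features, in_features))
        self.bias_mu = nn.Parameter(torch.empty(out_features))
        self.bias_sigma = nn.Parameter(torch.empty(out_features))
        # Non-learnable noise samples, refreshed by reset_noise().
        self.register_buffer('weight_eps', torch.zeros(out_features, in_features))
        self.register_buffer('bias_eps', torch.zeros(out_features))
        self.reset_parameters()
        self.reset_noise()

    def reset_parameters(self):
        bound = 1.0 / math.sqrt(self.in_features)
        self.weight_mu.data.uniform_(-bound, bound)
        self.bias_mu.data.uniform_(-bound, bound)
        self.weight_sigma.data.fill_(self.sigma_init / math.sqrt(self.in_features))
        self.bias_sigma.data.fill_(self.sigma_init / math.sqrt(self.in_features))

    @staticmethod
    def _scale_noise(size):
        # f(x) = sign(x) * sqrt(|x|), as in the factorised NoisyNet formulation.
        x = torch.randn(size)
        return x.sign() * x.abs().sqrt()

    def reset_noise(self):
        # Factorised noise: one vector per input and output dimension.
        eps_in = self._scale_noise(self.in_features)
        eps_out = self._scale_noise(self.out_features)
        self.weight_eps.copy_(eps_out.ger(eps_in))
        self.bias_eps.copy_(eps_out)

    def forward(self, x):
        if self.training:
            weight = self.weight_mu + self.weight_sigma * self.weight_eps
            bias = self.bias_mu + self.bias_sigma * self.bias_eps
        else:
            # Use the mean parameters when evaluating deterministically.
            weight, bias = self.weight_mu, self.bias_mu
        return nn.functional.linear(x, weight, bias)
# --------------------------------------------------------------------------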