def main(): args = get_args() if not args.training: print("Warning! Training is turned off!") torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True # Set up the directories and names for saving stuff session_name = utils.datetimenow(subseconds=True) if args.load_id: loading_id = args.load_id cutoff_idx = loading_id.find('_s') model_name = loading_id[1:cutoff_idx] else: model_name = session_name #i.e. if new model, model_name is same as session_name unique_id = 'm'+ model_name + '_s' + session_name print("The unique ID for this model and session combination " + \ "is %s" % str(unique_id)) # Make dirs to log experimental data, models exp_dir = '../exps' if not os.path.isdir(exp_dir): os.mkdir(exp_dir) data_logs_dir = os.path.join(exp_dir, 'data_logs') if not os.path.isdir(data_logs_dir): os.mkdir(data_logs_dir) data_logs_dir_uniq = os.path.join(data_logs_dir, unique_id) if not os.path.isdir(data_logs_dir_uniq): os.mkdir(data_logs_dir_uniq) print("Data will be logged to %s. " % data_logs_dir_uniq + \ "Ignore the tmp/openai logging statement below.") models_dir = os.path.join(exp_dir, 'models') if not os.path.isdir(models_dir): os.mkdir(models_dir) print("New model will be saved at %s" % models_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") # Save args into csv for record keeping utils.save_configs_to_csv(args, session_name=session_name, model_name=model_name, unique_id=unique_id) # Set up envs and model etc envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, data_logs_dir_uniq, device, False) if args.load_id: loaded_id = str(args.load_id) path = '../exps/models/' + loaded_id + '.pt' actor_critic = torch.load(path) else: actor_critic = Policy( obs_shape=envs.observation_space.shape, action_space=envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C( args, actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO( args, # todo change ppo script to include args (which was added # to help make the training fully episodic and with overlapping # segments) actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(200, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_episodes = int(10e9) for j in range(num_episodes): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_episodes, args.lr) for step in range(200): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = \ actor_critic.act(rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs # envs.render() obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): # Only added at the end of an epi episode_rewards.append(info['episode']['r']) p_dists = torch.FloatTensor( [info['p_dist'] if 'p_dist' in info.keys() else 
np.zeros(2) for info in infos]) # If done then clean the history of observations. masks = torch.FloatTensor( [[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks, p_dists) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) if args.training: value_loss, action_loss, dist_entropy = agent.update(rollouts) if args.save_experimental_data: rollouts.save_experimental_data(save_dir=data_logs_dir_uniq) if "Bandit" in args.env_name: reset_hxs_every_episode = True else: reset_hxs_every_episode = False rollouts.after_update(reset_hxs_every_episode) # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_episodes - 1): torch.save(actor_critic, os.path.join(models_dir, unique_id + ".pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1 and args.training: total_num_steps = (j + 1) * args.num_processes * 200 end = time.time() print( "Episodes {}, num timesteps {}, FPS {}. Entropy: {:.4f} , Value loss: {:.4f}, Policy loss: {:.4f}, \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), dist_entropy, value_loss, action_loss, len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards))) elif j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * 200 end = time.time() print( "Episodes {}, num timesteps {}. \n" .format(j, total_num_steps)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, data_logs_dir_uniq, device)
def pg(envs, printout, use_gail=False): if use_gail: assert len(envs.observation_space.shape) == 1 discr = gail_util.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( '/home/paperspace/repos/pytorch-a2c-ppo-acktr-gail/gail_experts', "trajs_reacher.pt") gail_train_loader = torch.utils.data.DataLoader( gail_util.ExpertDataset(file_name, num_trajectories=4, subsample_step=4), batch_size=ppo_args.gail_batchsize, shuffle=True, drop_last=True) actor_critic = Policy(envs.observation_space.shape, envs.action_space) actor_critic.to(device) agent = algo.PPO(actor_critic=actor_critic, clip_param=ppo_args.clip_param, ppo_epoch=ppo_args.ppo_epoch, num_mini_batch=ppo_args.num_mb, value_loss_coef=ppo_args.vloss_coef, entropy_coef=ppo_args.entropy_coef, lr=ppo_args.lr, eps=ppo_args.adam_eps, max_grad_norm=.5) rollouts = storage.RolloutStorage(ppo_args.num_steps, ppo_args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) num_updates = int( ppo_args.total_steps) // ppo_args.num_steps // ppo_args.num_processes episode_rewards = deque(maxlen=10) scores = np.zeros((ppo_args.num_envs, 1)) final_scores = np.zeros((ppo_args.num_envs, 1)) start = timer() for j in range(num_updates): utils.update_linear_schedule(agent.optimizer, j, num_updates, ppo_args.lr) for step in range(ppo_args.num_steps): with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.ones_like(masks) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) if use_gail: if j >= 10: envs.venv.eval() gail_epoch = ppo_args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(ppo_args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], ppo_args.gamma, rollouts.masks[step]) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, ppo_args.use_gae, ppo_args.gamma, ppo_args.gae_lambda, ppo_args.time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() save_path = 'saved_models' save_interval = 100 # save for every interval-th update or for the last epoch if (j % save_interval == 0 or j == num_updates - 1): torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, "ppo" + env_name + ".pt")) log_interval = 10 if j % log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * ppo_args.num_processes * ppo_args.num_steps end = timer() printout( f'Updates {j}, num timesteps {total_num_steps}, FPS { int(total_num_steps / (end - start))} \n ' f'Last {len(episode_rewards)} training episodes: mean/median reward {np.mean(episode_rewards):.1f}/{ np.median(episode_rewards):.1f}, ' f'min/max reward {np.min(episode_rewards):.1f}/{np.max(episode_rewards):.1f}' )
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) save_path = os.path.join(args.save_dir, args.algo) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) if args.load: # actor_critic,ob_rms2=torch.load(os.path.join(save_path, args.env_name + ".pt")) # evaluate(actor_critic, ob_rms2, args.env_name, args.seed, # args.num_processes, eval_log_dir, device) #actor_critic.eval() #exit() #.state_dict() actor_critic, agent.optimizer, start_epoch = load_checkpoint( actor_critic, agent.optimizer, os.path.join(save_path, args.env_name + ".pt")) actor_critic = actor_critic.to(device) for state in agent.optimizer.state.values(): for k, v in state.items(): if isinstance(v, torch.Tensor): state[k] = v.to(device) else: start_epoch = 0 rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) # ob_rms=ob_rms2 start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(start_epoch, num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": try: os.makedirs(save_path) except OSError: pass state = { 'epoch': j + 1, 'state_dict': actor_critic.state_dict(), 'optimizer': agent.optimizer.state_dict() } torch.save(state, os.path.join(save_path, args.env_name + ".pt")) # torch.save([ # actor_critic, # getattr(utils.get_vec_normalize(envs), 'ob_rms', None) # ], os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
def main(): chrono = exp.chrono() envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(args.repeat): with chrono.time('train') as t: for n in range(args.number): if args.use_linear_lr_decay: utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() # --- rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) exp.log_batch_loss(action_loss) exp.log_metric('value_loss', value_loss) rollouts.after_update() total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) # -- number # -- chrono exp.show_eta(j, t) # -- epoch exp.report() envs.close()
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") # coinrun environments need to be treated differently. coinrun_envs = { 'CoinRun': 'standard', 'CoinRun-Platforms': 'platform', 'Random-Mazes': 'maze' } envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, coin_run_level=args.num_levels, difficulty=args.high_difficulty, coin_run_seed=args.seed) if args.env_name in coinrun_envs.keys(): observation_space_shape = (3, 64, 64) args.save_dir = args.save_dir + "/NUM_LEVELS_{}".format( args.num_levels) # Save the level info in the else: observation_space_shape = envs.observation_space.shape # trained model name if args.continue_ppo_training: actor_critic, _ = torch.load(os.path.join(args.check_point, args.env_name + ".pt"), map_location=torch.device(device)) elif args.cor_gail: embed_size = args.embed_size actor_critic = Policy(observation_space_shape, envs.action_space, hidden_size=args.hidden_size, embed_size=embed_size, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) correlator = Correlator(observation_space_shape, envs.action_space, hidden_dim=args.hidden_size, embed_dim=embed_size, lr=args.lr, device=device) correlator.to(device) embeds = torch.zeros(1, embed_size) else: embed_size = 0 actor_critic = Policy(observation_space_shape, envs.action_space, hidden_size=args.hidden_size, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) embeds = None if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, use_clipped_value_loss=True, ftrl_mode=args.cor_gail or args.no_regret_gail, correlated_mode=args.cor_gail) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail or args.no_regret_gail or args.cor_gail: file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset( file_name, num_trajectories=50, subsample_frequency=1) #if subsample set to a different number, # grad_pen might need adjustment drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) if args.gail: discr = gail.Discriminator(observation_space_shape, envs.action_space, device=device) if args.no_regret_gail or args.cor_gail: queue = deque( maxlen=args.queue_size ) # Strategy Queues: Each element of a queue is a dicr strategy agent_queue = deque( maxlen=args.queue_size ) # Strategy Queues: Each element of a queue is an agent strategy pruning_frequency = 1 if args.no_regret_gail: discr = regret_gail.NoRegretDiscriminator(observation_space_shape, envs.action_space, device=device) if args.cor_gail: discr = 
cor_gail.CorDiscriminator(observation_space_shape, envs.action_space, hidden_size=args.hidden_size, embed_size=embed_size, device=device) discr.to(device) rollouts = RolloutStorage(args.num_steps, args.num_processes, observation_space_shape, envs.action_space, actor_critic.recurrent_hidden_state_size, embed_size) obs = envs.reset() rollouts.obs[0].copy_(obs) if args.cor_gail: rollouts.embeds[0].copy_(embeds) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions # Roll-out with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step], rollouts.embeds[step]) obs, reward, done, infos = envs.step(action.to('cpu')) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) # Sample mediating/correlating actions # Correlated Roll-out if args.cor_gail: embeds, embeds_log_prob, mean = correlator.act( rollouts.obs[step], rollouts.actions[step]) rollouts.insert_embedding(embeds, embeds_log_prob) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1], rollouts.embeds[-1]).detach() if args.gail or args.no_regret_gail or args.cor_gail: if args.env_name not in {'CoinRun', 'Random-Mazes'}: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if args.gail: if j < 10: gail_epoch = 100 # Warm up # no need for gail epoch or warm up in the no-regret case and cor_gail. 
for _ in range(gail_epoch): if utils.get_vec_normalize(envs): obfilt = utils.get_vec_normalize(envs)._obfilt else: obfilt = None if args.gail: discr.update(gail_train_loader, rollouts, obfilt) if args.no_regret_gail or args.cor_gail: last_strategy = discr.update(gail_train_loader, rollouts, queue, args.max_grad_norm, obfilt, j) for step in range(args.num_steps): if args.gail: rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) if args.no_regret_gail: rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step], queue) if args.cor_gail: rollouts.rewards[ step], correlator_reward = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], rollouts.embeds[step], args.gamma, rollouts.masks[step], queue) rollouts.correlated_reward[step] = correlator_reward rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) if args.gail: value_loss, action_loss, dist_entropy = agent.update(rollouts, j) elif args.no_regret_gail or args.cor_gail: value_loss, action_loss, dist_entropy, agent_gains, agent_strategy = \ agent.mixed_update(rollouts, agent_queue, j) if args.cor_gail: correlator.update(rollouts, agent_gains, args.max_grad_norm) if args.no_regret_gail or args.cor_gail: queue, _ = utils.queue_update(queue, pruning_frequency, args.queue_size, j, last_strategy) agent_queue, pruning_frequency = utils.queue_update( agent_queue, pruning_frequency, args.queue_size, j, agent_strategy) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass if not args.cor_gail: torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) else: print("saving models in {}".format( os.path.join(save_path, args.env_name))) torch.save( correlator.state_dict(), os.path.join(save_path, args.env_name + "correlator.pt")) torch.save([ actor_critic.state_dict(), getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + "actor.pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}," " value loss/action loss {:.1f}/{}".format( j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
def inner_loop_ppo(args, learning_rate, num_steps, num_updates, inst_on, visualize, save_dir): torch.set_num_threads(1) log_writer = SummaryWriter(save_dir, max_queue=1, filename_suffix="log") device = torch.device("cpu") env_name = ENV_NAME # "Safexp-PointGoal1-v0" envs = make_vec_envs(env_name, np.random.randint(2**32), NUM_PROC, args.gamma, None, device, allow_early_resets=True, normalize=args.norm_vectors) eval_envs = make_vec_envs(env_name, np.random.randint(2**32), 1, args.gamma, None, device, allow_early_resets=True, normalize=args.norm_vectors) actor_critic_policy = init_default_ppo(envs, log(args.init_sigma)) # Prepare modified observation shape for instinct obs_shape = envs.observation_space.shape inst_action_space = deepcopy(envs.action_space) inst_obs_shape = list(obs_shape) inst_obs_shape[0] = inst_obs_shape[0] + envs.action_space.shape[0] # Prepare modified action space for instinct inst_action_space.shape = list(inst_action_space.shape) inst_action_space.shape[0] = inst_action_space.shape[0] + 1 inst_action_space.shape = tuple(inst_action_space.shape) actor_critic_instinct = Policy(tuple(inst_obs_shape), inst_action_space, init_log_std=log(args.init_sigma), base_kwargs={'recurrent': False}) actor_critic_policy.to(device) actor_critic_instinct.to(device) agent_policy = algo.PPO(actor_critic_policy, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=learning_rate, eps=args.eps, max_grad_norm=args.max_grad_norm) agent_instinct = algo.PPO(actor_critic_instinct, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=learning_rate, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts_rewards = RolloutStorage( num_steps, NUM_PROC, envs.observation_space.shape, envs.action_space, actor_critic_policy.recurrent_hidden_state_size) rollouts_cost = RolloutStorage( num_steps, NUM_PROC, inst_obs_shape, inst_action_space, actor_critic_instinct.recurrent_hidden_state_size) obs = envs.reset() i_obs = torch.cat( [obs, torch.zeros((NUM_PROC, envs.action_space.shape[0]))], dim=1) # Add zero action to the observation rollouts_rewards.obs[0].copy_(obs) rollouts_rewards.to(device) rollouts_cost.obs[0].copy_(i_obs) rollouts_cost.to(device) fitnesses = [] best_fitness_so_far = float("-Inf") is_instinct_training = False for j in range(num_updates): is_instinct_training_old = is_instinct_training is_instinct_training = phase_shifter( j, PHASE_LENGTH, len(TrainPhases)) == TrainPhases.INSTINCT_TRAIN_PHASE.value is_instinct_deterministic = not is_instinct_training is_policy_deterministic = not is_instinct_deterministic for step in range(num_steps): # Sample actions with torch.no_grad(): # (value, action, action_log_probs, rnn_hxs), (instinct_value, instinct_action, instinct_outputs_log_prob, i_rnn_hxs), final_action value, action, action_log_probs, recurrent_hidden_states = actor_critic_policy.act( rollouts_rewards.obs[step], rollouts_rewards.recurrent_hidden_states[step], rollouts_rewards.masks[step], deterministic=is_policy_deterministic) instinct_value, instinct_action, instinct_outputs_log_prob, instinct_recurrent_hidden_states = actor_critic_instinct.act( rollouts_cost.obs[step], rollouts_cost.recurrent_hidden_states[step], rollouts_cost.masks[step], deterministic=is_instinct_deterministic, ) # Combine two networks final_action, i_control = policy_instinct_combinator( action, instinct_action) obs, reward, done, infos = envs.step(final_action) # envs.render() reward, violation_cost = reward_cost_combinator( 
reward, infos, NUM_PROC, i_control) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts_rewards.insert(obs, recurrent_hidden_states, action, action_log_probs, value, reward, masks, bad_masks) i_obs = torch.cat([obs, action], dim=1) rollouts_cost.insert(i_obs, instinct_recurrent_hidden_states, instinct_action, instinct_outputs_log_prob, instinct_value, violation_cost, masks, bad_masks) with torch.no_grad(): next_value_policy = actor_critic_policy.get_value( rollouts_rewards.obs[-1], rollouts_rewards.recurrent_hidden_states[-1], rollouts_rewards.masks[-1]).detach() next_value_instinct = actor_critic_instinct.get_value( rollouts_cost.obs[-1], rollouts_cost.recurrent_hidden_states[-1], rollouts_cost.masks[-1].detach()) rollouts_rewards.compute_returns(next_value_policy, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) rollouts_cost.compute_returns(next_value_instinct, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) if not is_instinct_training: print("training policy") # Policy training phase p_before = deepcopy(agent_instinct.actor_critic) value_loss, action_loss, dist_entropy = agent_policy.update( rollouts_rewards) val_loss_i, action_loss_i, dist_entropy_i = 0, 0, 0 p_after = deepcopy(agent_instinct.actor_critic) assert compare_two_models( p_before, p_after), "policy changed when it shouldn't" else: print("training instinct") # Instinct training phase value_loss, action_loss, dist_entropy = 0, 0, 0 p_before = deepcopy(agent_policy.actor_critic) val_loss_i, action_loss_i, dist_entropy_i = agent_instinct.update( rollouts_cost) p_after = deepcopy(agent_policy.actor_critic) assert compare_two_models( p_before, p_after), "policy changed when it shouldn't" rollouts_rewards.after_update() rollouts_cost.after_update() ob_rms = utils.get_vec_normalize(envs) if ob_rms is not None: ob_rms = ob_rms.ob_rms fits, info = evaluate(EvalActorCritic(actor_critic_policy, actor_critic_instinct), ob_rms, eval_envs, NUM_PROC, reward_cost_combinator, device, instinct_on=inst_on, visualise=visualize) instinct_reward = info['instinct_reward'] eval_hazard_collisions = info['hazard_collisions'] print( f"Step {j}, Fitness {fits.item()}, value_loss = {value_loss}, action_loss = {action_loss}, " f"dist_entropy = {dist_entropy}") print( f"Step {j}, Instinct reward {instinct_reward}, value_loss instinct = {val_loss_i}, action_loss instinct= {action_loss_i}, " f"dist_entropy instinct = {dist_entropy_i} hazard_collisions = {eval_hazard_collisions}" ) print( "-----------------------------------------------------------------" ) # Tensorboard logging log_writer.add_scalar("fitness", fits.item(), j) log_writer.add_scalar("value loss", value_loss, j) log_writer.add_scalar("action loss", action_loss, j) log_writer.add_scalar("dist entropy", dist_entropy, j) log_writer.add_scalar("cost/instinct_reward", instinct_reward, j) log_writer.add_scalar("cost/hazard_collisions", eval_hazard_collisions, j) log_writer.add_scalar("value loss instinct", val_loss_i, j) log_writer.add_scalar("action loss instinct", action_loss_i, j) log_writer.add_scalar("dist entropy instinct", dist_entropy_i, j) fitnesses.append(fits) if fits.item() > best_fitness_so_far: best_fitness_so_far = fits.item() torch.save(actor_critic_policy, join(save_dir, "model_rl_policy.pt")) torch.save(actor_critic_instinct, join(save_dir, 
"model_rl_instinct.pt")) if is_instinct_training != is_instinct_training_old: torch.save(actor_critic_policy, join(save_dir, f"model_rl_policy_update_{j}.pt")) torch.save(actor_critic_instinct, join(save_dir, f"model_rl_instinct_update_{j}.pt")) torch.save(actor_critic_policy, join(save_dir, "model_rl_policy_latest.pt")) torch.save(actor_critic_instinct, join(save_dir, "model_rl_instinct_latest.pt")) return (fitnesses[-1]), 0, 0
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True args_dir, logs_dir, models_dir, samples_dir = get_all_save_paths( args, 'pretrain', combine_action=args.combine_action) eval_log_dir = logs_dir + "_eval" utils.cleanup_log_dir(logs_dir) utils.cleanup_log_dir(eval_log_dir) _, _, intrinsic_models_dir, _ = get_all_save_paths(args, 'learn_reward', load_only=True) if args.load_iter != 'final': intrinsic_model_file_name = os.path.join( intrinsic_models_dir, args.env_name + '_{}.pt'.format(args.load_iter)) else: intrinsic_model_file_name = os.path.join( intrinsic_models_dir, args.env_name + '.pt'.format(args.load_iter)) intrinsic_arg_file_name = os.path.join(args_dir, 'command.txt') # save args to arg_file with open(intrinsic_arg_file_name, 'w') as f: json.dump(args.__dict__, f, indent=2) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, logs_dir, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) else: raise NotImplementedError if args.use_intrinsic: obs_shape = envs.observation_space.shape if len(obs_shape) == 3: action_dim = envs.action_space.n elif len(obs_shape) == 1: action_dim = envs.action_space.shape[0] if 'NoFrameskip' in args.env_name: file_name = os.path.join( args.experts_dir, "trajs_ppo_{}.pt".format( args.env_name.split('-')[0].replace('NoFrameskip', '').lower())) else: file_name = os.path.join( args.experts_dir, "trajs_ppo_{}.pt".format(args.env_name.split('-')[0].lower())) rff = RewardForwardFilter(args.gamma) intrinsic_rms = RunningMeanStd(shape=()) if args.intrinsic_module == 'icm': print('Loading pretrained intrinsic module: %s' % intrinsic_model_file_name) inverse_model, forward_dynamics_model, encoder = torch.load( intrinsic_model_file_name) icm = IntrinsicCuriosityModule(envs, device, inverse_model, forward_dynamics_model, \ inverse_lr=args.intrinsic_lr, forward_lr=args.intrinsic_lr,\ ) if args.intrinsic_module == 'vae': print('Loading pretrained intrinsic module: %s' % intrinsic_model_file_name) vae = torch.load(intrinsic_model_file_name) icm = GenerativeIntrinsicRewardModule(envs, device, \ vae, lr=args.intrinsic_lr, \ ) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" 
else args.lr) for step in range(args.num_steps): with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) obs, reward, done, infos = envs.step(action) next_obs = obs for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, next_obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.use_intrinsic: for step in range(args.num_steps): state = rollouts.obs[step] action = rollouts.actions[step] next_state = rollouts.next_obs[step] if args.intrinsic_module == 'icm': state = encoder(state) next_state = encoder(next_state) with torch.no_grad(): rollouts.rewards[ step], pred_next_state = icm.calculate_intrinsic_reward( state, action, next_state, args.lambda_true_action) if args.standardize == 'True': buf_rews = rollouts.rewards.cpu().numpy() intrinsic_rffs = np.array( [rff.update(rew) for rew in buf_rews.T]) rffs_mean, rffs_std, rffs_count = mpi_moments( intrinsic_rffs.ravel()) intrinsic_rms.update_from_moments(rffs_mean, rffs_std**2, rffs_count) mean = intrinsic_rms.mean std = np.asarray(np.sqrt(intrinsic_rms.var)) rollouts.rewards = rollouts.rewards / torch.from_numpy(std).to( device) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(models_dir, args.algo) policy_file_name = os.path.join(save_path, args.env_name + '.pt') try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], policy_file_name) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "{} Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(args.env_name, j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
def main(): args = get_args() if comet_loaded and len(args.comet) > 0: comet_credentials = args.comet.split("/") experiment = Experiment(api_key=comet_credentials[2], project_name=comet_credentials[1], workspace=comet_credentials[0]) for key, value in vars(args).items(): experiment.log_parameter(key, value) else: experiment = None torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, args.custom_gym, args.navi) base = None if args.navi: base = NaviBase obs_shape = envs.observation_space.shape actor_critic = Policy( obs_shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}, navi=args.navi, base=base, ) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'random': agent = algo.RANDOM_AGENT(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) actor_critic = RandomPolicy( obs_shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}, navi=args.navi, base=base, ) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) gail_train_loader = torch.utils.data.DataLoader( gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20), batch_size=args.gail_batch_size, shuffle=True, drop_last=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) episode_length = deque(maxlen=10) episode_success_rate = deque(maxlen=100) episode_total = 0 start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) print("args.num_steps: " + str(args.num_steps)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Observe reward and next obs obs, reward, done, infos = envs.step(action) for idx, info in enumerate(infos): if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) episode_length.append(info['episode']['l']) if "Pacman" 
not in args.env_name: episode_success_rate.append( info['was_successful_trajectory']) episode_total += 1 # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() if experiment is not None: experiment.log_metric("Reward Mean", np.mean(episode_rewards), step=total_num_steps) experiment.log_metric("Reward Min", np.min(episode_rewards), step=total_num_steps) experiment.log_metric("Reward Max", np.max(episode_rewards), step=total_num_steps) experiment.log_metric("Episode Length Mean ", np.mean(episode_length), step=total_num_steps) experiment.log_metric("Episode Length Min", np.min(episode_length), step=total_num_steps) experiment.log_metric("Episode Length Max", np.max(episode_length), step=total_num_steps) experiment.log_metric("# Trajectories (Total)", j, step=total_num_steps) if "Pacman" not in args.env_name: experiment.log_metric("Episodic Success Rate", np.mean(episode_success_rate), step=total_num_steps) print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
def main(): #wandb.run = config.tensorboard.run wandb.init(settings=wandb.Settings(start_method="fork"), project='growspaceenv_baselines', entity='growspace') #torch.manual_seed(config.seed) #torch.cuda.manual_seed_all(config.seed) if config.cuda and torch.cuda.is_available() and config.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(config.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if config.cuda else "cpu") envs = make_vec_envs(config.env_name, config.seed, config.num_processes, config.gamma, config.log_dir, device, False, config.custom_gym) if "Mnist" in config.env_name: base = 'Mnist' else: base = None actor_critic = Policy(envs.observation_space.shape, envs.action_space, base, base_kwargs={'recurrent': config.recurrent_policy}) actor_critic.to(device) if config.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, config.value_loss_coef, config.entropy_coef, lr=config.lr, eps=config.eps, alpha=config.alpha, max_grad_norm=config.max_grad_norm) elif config.algo == 'ppo': agent = algo.PPO(actor_critic, config.clip_param, config.ppo_epoch, config.num_mini_batch, config.value_loss_coef, config.entropy_coef, lr=config.lr, eps=config.eps, max_grad_norm=config.max_grad_norm, optimizer=config.optimizer, momentum=config.momentum) elif config.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, config.value_loss_coef, config.entropy_coef, acktr=True) if config.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( config.gail_experts_dir, "trajs_{}.pt".format(config.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20) drop_last = len(expert_dataset) > config.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=config.gail_batch_size, shuffle=True, drop_last=drop_last) rollouts = RolloutStorage(config.num_steps, config.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = [] episode_length = [] episode_branches = [] episode_branch1 = [] episode_branch2 = [] episode_light_width = [] episode_light_move = [] episode_success = [] episode_plantpixel = [] start = time.time() num_updates = int( config.num_env_steps) // config.num_steps // config.num_processes x = 0 action_space_type = envs.action_space for j in range(num_updates): if isinstance(action_space_type, Discrete): action_dist = np.zeros(envs.action_space.n) total_num_steps = (j + 1) * config.num_processes * config.num_steps if config.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if config.algo == "acktr" else config.lr) #new_branches = [] for step in range(config.num_steps): with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) if isinstance(action_space_type, Discrete): action_dist[action] += 1 for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) 
episode_length.append(info['episode']['l']) wandb.log({"Episode_Reward": info['episode']['r']}, step=total_num_steps) if 'new_branches' in info.keys(): episode_branches.append(info['new_branches']) if 'new_b1' in info.keys(): episode_branch1.append(info['new_b1']) if 'new_b2' in info.keys(): episode_branch2.append(info['new_b2']) if 'light_width' in info.keys(): episode_light_width.append(info['light_width']) if 'light_move' in info.keys(): episode_light_move.append(info['light_move']) if 'success' in info.keys(): episode_success.append(info['success']) if 'plant_pixel' in info.keys(): episode_plantpixel.append(info['plant_pixel']) if j == x: if 'img' in info.keys(): img = info['img'] path = './hittiyas/growspaceenv_braselines/scripts/imgs/' cv2.imwrite( os.path.join(path, 'step' + str(step) + '.png'), img) x += 1000 # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if config.gail: if j >= 10: envs.venv.eval() gail_epoch = config.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(config.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], config.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, config.use_gae, config.gamma, config.gae_lambda, config.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % config.save_interval == 0 or j == num_updates - 1) and config.save_dir != "": save_path = os.path.join(config.save_dir, config.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, config.env_name + ".pt")) if j % config.log_interval == 0 and len(episode_rewards) > 1: if isinstance(action_space_type, Discrete): np_hist = np.histogram(np.arange(action_dist.shape[0]), weights=action_dist) wandb.log( { "Discrete Actions": wandb.Histogram(np_histogram=np_hist) }, step=total_num_steps) wandb.log({"Reward Min": np.min(episode_rewards)}, step=total_num_steps) wandb.log({"Summed Reward": np.sum(episode_rewards)}, step=total_num_steps) wandb.log({"Reward Mean": np.mean(episode_rewards)}, step=total_num_steps) wandb.log({"Reward Max": np.max(episode_rewards)}, step=total_num_steps) wandb.log( {"Number of Mean New Branches": np.mean(episode_branches)}, step=total_num_steps) wandb.log({"Number of Max New Branches": np.max(episode_branches)}, step=total_num_steps) wandb.log({"Number of Min New Branches": np.min(episode_branches)}, step=total_num_steps) wandb.log( { "Number of Mean New Branches of Plant 1": np.mean(episode_branch1) }, step=total_num_steps) wandb.log( { "Number of Mean New Branches of Plant 2": np.mean(episode_branch2) }, step=total_num_steps) wandb.log( { "Number of Total Displacement of Light": np.sum(episode_light_move) }, step=total_num_steps) wandb.log({"Mean Light Displacement": episode_light_move}, step=total_num_steps) wandb.log({"Mean Light 
Width": episode_light_width}, step=total_num_steps) wandb.log( { "Number of Steps in Episode with Tree is as close as possible": np.sum(episode_success) }, step=total_num_steps) wandb.log({"Entropy": dist_entropy}, step=total_num_steps) wandb.log( { "Displacement of Light Position": wandb.Histogram(episode_light_move) }, step=total_num_steps) wandb.log( { "Displacement of Beam Width": wandb.Histogram(episode_light_width) }, step=total_num_steps) wandb.log({"Mean Plant Pixel": np.mean(episode_plantpixel)}, step=total_num_steps) wandb.log({"Summed Plant Pixel": np.sum(episode_plantpixel)}, step=total_num_steps) wandb.log( {"Plant Pixel Histogram": wandb.Histogram(episode_plantpixel)}, step=total_num_steps) episode_rewards.clear() episode_length.clear() episode_branches.clear() episode_branch2.clear() episode_branch1.clear() episode_light_move.clear() episode_light_width.clear() episode_success.clear() episode_plantpixel.clear() if (config.eval_interval is not None and len(episode_rewards) > 1 and j % config.eval_interval == 0): ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None) evaluate(actor_critic, ob_rms, config.env_name, config.seed, config.num_processes, eval_log_dir, device, config.custom_gym) ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None) evaluate(actor_critic, ob_rms, config.env_name, config.seed, config.num_processes, eval_log_dir, device, config.custom_gym, gif=True)
def main(): args = get_args() args.env_name = "Torcs-v1" args.algo = 'ppo' args.use_gae = True args.log_interval = 1 args.num_steps = 2048 args.num_processes = 1 args.lr = 3e-4 args.entropy_coef = 0 args.value_loss_coef = 0.5 args.ppo_epoch = 10 args.num_mini_batch = 32 args.gamma = 0.99 args.gae_lambda = 0.95 args.num_env_steps = 1000000 args.use_linear_lr_decay = True args.use_proper_time_limits = True args.save_dir = "saved" args.seed = 0 args.cuda = False torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") # envs = make_vec_envs(args.env_name, args.seed, args.num_processes, # args.gamma, args.log_dir, device, False) # envs = gym.make(args.env_name) # envs.seed(args.seed) actor_critic = Policy(24, 3, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) # actor_critic = torch.load("saved/ppo/Torcs-v0_new_mp.pt") # print(actor_critic) agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) # rollouts = RolloutStorage(args.num_steps, args.num_processes, # envs.observation_space.shape, envs.action_space, # actor_critic.recurrent_hidden_state_size) # obs = envs.reset() # obs = torch.from_numpy(obs) # rollouts.obs[0].copy_(obs) # rollouts.to(device) acc_r = 0 episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes done = [False] for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) print(j, "update") os.system("pkill torcs") p_job = partial(job, args=args, device=device, shared_model=actor_critic) pool = mp.Pool() res = pool.map(p_job, range(12)) pool.close() pool.join() for rollouts in res: with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # print(getattr(utils.get_vec_normalize(envs), 'ob_rms', None)) torch.save( [ actor_critic #,getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + "_new_mp.pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() s = "{},{:.2f},{:.2f}\n".format(j, np.mean(episode_rewards), np.median(episode_rewards)) with open("logs/{}_new_mp.csv".format(args.env_name), 'a') as fl: fl.write(s) print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), 
np.max(episode_rewards), dist_entropy, value_loss, action_loss))
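# --- Illustrative sketch of the parallel rollout collection used above: a shared
# policy is broadcast to worker processes through functools.partial and
# multiprocessing.Pool.map, and the per-worker results are reduced in the main
# process. `collect_rollout` is a hypothetical stand-in for `job`.
import multiprocessing as mp
from functools import partial

def collect_rollout(worker_id, shared_params):
    # Each worker would run its own env instance and return a trajectory;
    # here we only echo back the id and the parameters it received.
    return worker_id, shared_params

if __name__ == "__main__":
    p_job = partial(collect_rollout, shared_params={"lr": 3e-4})
    with mp.Pool(processes=4) as pool:
        results = pool.map(p_job, range(4))
    print(results)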
def train(args): torch.manual_seed(args.seed) torch.set_num_threads(1) device = torch.device('cpu') os.makedirs(args.save_dir, exist_ok=True) training_log_path = os.path.join(args.save_dir, 'logs.txt') fp_log = open(training_log_path, 'w') fp_log.close() envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, None, device, False, args=args) render_env = gym.make(args.env_name, args=args) render_env.seed(args.seed) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) else: raise NotImplementedError rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) episode_lens = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) episode_lens.append(info['episode']['l']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": model_save_dir = os.path.join(args.save_dir, 'models') os.makedirs(model_save_dir, exist_ok=True) torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join( model_save_dir, args.env_name + '_iter{}'.format(j) + ".pt")) # save logs of every episode fp_log = open(training_log_path, 'a') total_num_steps = (j + 1) * args.num_processes * args.num_steps len_mean, len_min, len_max = np.mean(episode_lens), np.min( episode_lens), np.max(episode_lens) reward_mean, reward_min, reward_max = np.mean(episode_rewards), np.min( episode_rewards), np.max(episode_rewards) fp_log.write( 'iterations: {}, mean(len): {:.1f}, min(len): {}, max(len): {}, mean(reward): {:.3f}, min(reward): {:.3f}, max(reward): {:.3f}, value_loss: {:.3f}, action_loss: {:.3f}\n' .format(total_num_steps, len_mean, len_min, len_max, reward_mean, reward_min, reward_max, value_loss, action_loss)) fp_log.close() # logging to console if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {}, time {} minutes \n Last {} training episodes: mean/median length {:1f}/{}, min/max length {}/{} mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), (end - start) / 60., len(episode_rewards), np.mean(episode_lens), np.median(episode_lens), np.min(episode_lens), np.max(episode_lens), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(args, actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, device) if (args.render_interval is not None and args.render_interval > 0 and j % args.render_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms render(render_env, actor_critic, ob_rms, deterministic=True) render_env.close() envs.close()
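# --- Illustrative sketch (assumed behaviour of utils.update_linear_schedule):
# the learning rate is annealed linearly from its initial value to zero over the
# total number of updates, which is what the call inside the loop above does.
import torch

def update_linear_schedule(optimizer, update_idx, total_updates, initial_lr):
    lr = initial_lr - (initial_lr * (update_idx / float(total_updates)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

model = torch.nn.Linear(4, 2)
opt = torch.optim.Adam(model.parameters(), lr=3e-4)
update_linear_schedule(opt, update_idx=50, total_updates=100, initial_lr=3e-4)
print(opt.param_groups[0]['lr'])  # roughly 1.5e-4 halfway through training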
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) save_name = '%s_%s' % (args.env_name, args.algo) if args.postfix != '': save_name += ('_' + args.postfix) logger_filename = os.path.join(log_dir, save_name) logger = utils.create_logger(logger_filename) torch.set_num_threads(1) device = torch.device("cuda:%d" % args.gpu if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, 4, obs_type="grid" if args.grid else "image", skip_frames=args.num_skip_frames) if args.load_dir != None: actor_critic, ob_rms = \ torch.load(os.path.join(args.load_dir), map_location=lambda storage, loc: storage) vec_norm = utils.get_vec_normalize(envs) if vec_norm is not None: vec_norm.ob_rms = ob_rms print("load pretrained...") else: actor_critic = Policy(envs.observation_space.shape, envs.action_space, base="grid" if args.grid else None, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) gail_train_loader = torch.utils.data.DataLoader( gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20), batch_size=args.gail_batch_size, shuffle=True, drop_last=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) lines = deque(maxlen=10) start = time.time() kk = 0 num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes # learning_start = 1000 learning_start = 0 best_reward = -100 for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) explore = exploration_rate(j - learning_start, 'exp') # print(j) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # if j < learning_start: # action[0, 0] = random.randint(0, envs.action_space.n - 1) # elif random.uniform(0, 1) < explore: # action[0, 0] = random.randint(0, envs.action_space.n - 1) # else: # pass # Obser reward and next obs # action[0, 0] = 1 # envs.take_turns() obs, reward, done, infos = envs.step(action) # 
print(obs) # im = Image.fromarray(obs[0].reshape(224 * 4, -1).cpu().numpy().astype(np.uint8)) # im.save("samples/%d.png" % kk) # kk += 1 # info = infos[0] # if len(info) > 0: # print(info) # print(done) # print(infos) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) if 'sent' in info.keys(): lines.append(info['sent']) # kk += 1 # print(action.shape) # print(obs.shape) # print(done.shape) # if done[0]: # print(time.time() - start) # print(kk) # exit() # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "" \ and np.mean(episode_rewards) > best_reward: save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass best_reward = np.mean(episode_rewards) torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, save_name + ".pt")) # print(episode_rewards) if j % args.log_interval == 0 and len(episode_rewards) > 1: if j < learning_start: logger.info("random action") total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() logger.info( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) logger.info( ' lines sent: mean/median lines {:.1f}/{:.1f}, min/max lines {:.1f}/{:.1f}\n' .format(np.mean(lines), np.median(lines), np.min(lines), np.max(lines))) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
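# --- Illustrative sketch of how the masks and bad_masks tensors built above are
# derived from the vectorized env outputs: a mask of 0.0 cuts the recurrent state
# and the return bootstrap at an episode boundary, while bad_masks flags
# time-limit ('bad_transition') terminations so returns can treat them separately.
import torch

done = [False, True, False]
infos = [{}, {'bad_transition': True}, {}]
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
bad_masks = torch.FloatTensor([[0.0] if 'bad_transition' in info else [1.0]
                               for info in infos])
print(masks.squeeze(1).tolist())      # [1.0, 0.0, 1.0]
print(bad_masks.squeeze(1).tolist())  # [1.0, 0.0, 1.0]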
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}, dimh=args.dimh) actor_critic.to(device) exp_name = "%s_%s_seed%d_dimh%d_" % (args.env_name, args.algo, args.seed, args.dimh) if args.gail: exp_name += '_gail_' if args.split: exp_name += 'splitevery' + str(args.split_every) if args.random_split: exp_name += '_rsplit' else: exp_name += 'baseline' writer = SummaryWriter('./runs/' + exp_name) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) gail_train_loader = torch.utils.data.DataLoader( gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20), batch_size=args.gail_batch_size, shuffle=True, drop_last=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes print(num_updates) stats = { 'seed': args.seed, 'experiment': exp_name, 'env': args.env_name, 'dimh': args.dimh, 'split every': args.split_every, 'random split': args.random_split, 'steps': [], 'mean reward': [], 'actor neurons': [], 'critic neurons': [], } save_dir = './experiment_results/%s/' % args.env_name stats_save_path = save_dir + exp_name check_path(save_dir) print('start') count = -1 num_updates = 488 * 2 meanreward = [] for j in range(num_updates): #if j % 50 == 0: # print('STEP', j) if args.use_linear_lr_decay: # decrease learning rate linearly count += 1 if j % 488 == 0: count = 0 total = 488 * 2 else: total = 488 * 2 if args.split: utils.update_linear_schedule( agent.optimizer, count, total, agent.optimizer.lr if args.algo == "acktr" else args.lr) else: utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = 
envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) # splitting if args.split and (j + 1) % args.split_every == 0 and j < 200: print("[INFO] split on iteration %d..." % j) agent.split(rollouts, args.random_split) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) meanreward.append(np.mean(episode_rewards)) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() if True: print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) stats['mean reward'].append(np.mean(episode_rewards)) stats['steps'].append(j) if args.split: a, c = agent.actor_critic.get_num_params() stats['actor neurons'].append(a) stats['critic neurons'].append(c) if j % 10 == 0: print("[INFO] saving to ", stats_save_path) np.save(stats_save_path, stats) if j % 5 == 0: s = (j + 1) * args.num_processes * args.num_steps if args.split: a, c = agent.actor_critic.get_num_params() writer.add_scalar('A neurons', a, s) writer.add_scalar('C neurons', c, s) writer.add_scalar('mean reward', np.mean(episode_rewards), s) writer.add_scalar('entropy loss', dist_entropy, s) writer.add_scalar('value loss', value_loss, s) writer.add_scalar('action loss', action_loss, s) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device) writer.close() import pickle pickle.dump(meanreward, open(stats_save_path + '.pkl', 'wb'))
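# --- Illustrative sketch of the TensorBoard logging pattern used above, where
# scalars are written against the cumulative environment-step count rather than
# the update index. The run name is arbitrary.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('./runs/example_run')
num_processes, num_steps = 8, 128
for j in range(3):
    total_num_steps = (j + 1) * num_processes * num_steps
    writer.add_scalar('mean reward', 10.0 * j, total_num_steps)
    writer.add_scalar('value loss', 1.0 / (j + 1), total_num_steps)
writer.close()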
def main(): # Read the command-line arguments args = get_args() # Set the random seeds torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # CUDA settings (if True, CUDA behaves deterministically, i.e. loss values no longer vary between runs) # However, arguments.py notes the following: # Sets flags for determinism when using CUDA (potentially slow!) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True # Decide where the log files are written log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) # Set the number of threads torch.set_num_threads(1) # Set the device device = torch.device("cuda:0" if args.cuda else "cpu") # Initialize the environments envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) # Set up the policy actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) # to(device) places the model on the chosen device: GPU or CPU actor_critic.to(device) # Select the algorithm if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) # Initialize the rollout storage (used to evaluate returns) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) # Reset the environments obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) # Define the reward buffer episode_rewards = deque(maxlen=10) # Record the start time and the number of parameter updates start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): # If using a linear schedule for the learning rate if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) # Action sampling loop for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Observe reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'obs_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): obs_rms = utils.get_vec_normalize(envs).obs_rms evaluate(actor_critic, obs_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
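# --- Illustrative sketch of the expert DataLoader setup above: drop_last is only
# enabled when the expert dataset holds more than one batch, so a small expert set
# is not silently discarded. The TensorDataset here is a toy stand-in for
# gail.ExpertDataset.
import torch
from torch.utils.data import DataLoader, TensorDataset

gail_batch_size = 128
expert_dataset = TensorDataset(torch.randn(64, 8), torch.randn(64, 2))
drop_last = len(expert_dataset) > gail_batch_size
loader = DataLoader(expert_dataset, batch_size=gail_batch_size,
                    shuffle=True, drop_last=drop_last)
print(len(expert_dataset), drop_last, len(loader))  # 64 False 1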
def main(base=IAMBase, num_frame_stack=None): seed = 1 env_name = "Warehouse-v0" num_processes = 32 log_dir = './logs/' eval_interval = None log_interval = 10 use_linear_lr_decay = False use_proper_time_limits = False save_dir = './trained_models/' use_cuda = True # PPO gamma = 0.99 # reward discount factor clip_param = 0.1 #0.2 ppo_epoch = 3 #4 num_mini_batch = 32 value_loss_coef = 1 #0.5 entropy_coef = 0.01 lr = 2.5e-4 #7e-4 eps = 1e-5 max_grad_norm = float('inf') use_gae = True gae_lambda = 0.95 num_steps = 8 #5 # Store num_env_steps = 4e6 save_interval = 100 # IAM dset = [ 0, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72 ] #gym.envs.register(env_name, entry_point="environments.warehouse.warehouse:Warehouse", # kwargs={'seed': seed, 'parameters': {"num_frames": 1}}) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) log_dir = os.path.expanduser(log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if use_cuda else "cpu") envs = make_vec_envs(env_name, seed, num_processes, gamma, log_dir, device, False, num_frame_stack=num_frame_stack) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base=base, base_kwargs=({ 'dset': dset } if base == IAMBase else {})) actor_critic.to(device) agent = algo.PPO(actor_critic, clip_param, ppo_epoch, num_mini_batch, value_loss_coef, entropy_coef, lr=lr, eps=eps, max_grad_norm=max_grad_norm) rollouts = RolloutStorage(num_steps, num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int(num_env_steps) // num_steps // num_processes for j in range(num_updates): if use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule(agent.optimizer, j, num_updates, lr) for step in range(num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, use_gae, gamma, gae_lambda, use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % save_interval == 0 or j == num_updates - 1) and save_dir != "": save_path = os.path.join(save_dir, 'PPO') try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'obs_rms', None) ], os.path.join(save_path, env_name + ".pt")) if j % log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * num_processes * num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (eval_interval is not None and len(episode_rewards) > 1 and j % eval_interval == 0): obs_rms = utils.get_vec_normalize(envs).obs_rms evaluate(actor_critic, obs_rms, env_name, seed, num_processes, eval_log_dir, device)
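# --- Illustrative sketch of the update-count arithmetic used above: each PPO
# update consumes num_steps * num_processes transitions, so the 4e6 environment
# steps configured here translate into a fixed number of updates.
num_env_steps, num_steps, num_processes = 4e6, 8, 32
num_updates = int(num_env_steps) // num_steps // num_processes
print(num_updates)  # 15625 updates of 8 * 32 = 256 transitions each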
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") print(device) print(save_folder) if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False, args.reward_type) actor_critic = Policy(envs.observation_space.shape, envs.action_space) actor_critic.to(device) curiosity = None if use_curiosity: curiosity = ICM(envs.observation_space.shape[0], envs.action_space.n) curiosity.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, curiosity, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, use_curiosity=use_curiosity) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() cum_rew = [0] * args.num_processes rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=args.num_processes * 2) start = time.time() for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly if args.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: update_linear_schedule(agent.optimizer, j, num_updates, args.lr) if args.algo == 'ppo' and args.use_linear_clip_decay: agent.clip_param = args.clip_param * (1 - j / float(num_updates)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = agent.actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) envs.render() cur_reward = reward to_write = reward.cpu().numpy() for i in range(args.num_processes): cum_rew[i] += to_write[i][0] if use_curiosity: action_one_hot = (torch.eye(14)[action]).view(-1, 14).cuda() _, pred_phi, actual_phi = curiosity( (rollouts.obs[step], obs, action_one_hot)) cur_reward += 0.2 * ((pred_phi - actual_phi).pow(2)).sum( -1, keepdim=True).cpu() / 2 for i, finished in enumerate(done): if finished: percentile = infos[i]['x_pos'] / norm_pos episode_rewards.append(percentile) print(cum_rew[i]) with open(train_file[:-4] + str(i) + train_file[-4:], 'a', newline='') as sfile: writer = csv.writer(sfile) writer.writerows([[cum_rew[i], percentile]]) cum_rew[i] = 0 # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, cur_reward.detach(), masks) with torch.no_grad(): next_value = agent.actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = agent.actor_critic if args.cuda: save_model = copy.deepcopy(agent.actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save(save_model, os.path.join(save_folder, '/' + args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len( episode_rewards) > args.num_processes: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, cumulative reward {:.3f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), np.mean(cum_rew))) #Evaluation time : if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): num_proc = 1 eval_envs = make_vec_envs(args.env_name, args.seed + num_proc, num_proc, args.gamma, args.log_dir, args.add_timestep, device, True, args.reward_type) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] test_rew = 0 finish_this = False obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( num_proc, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(num_proc, 1, device=device) positions = deque(maxlen=400) while not finish_this: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = agent.actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_envs.render() eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]).cuda() # for i, finished in enumerate(done): # if finished: # percentile = infos[i]['x_pos']/norm_pos # eval_episode_rewards.append(percentile) # with open(eval_file, 'a', newline='') as sfile: # writer = csv.writer(sfile) # writer.writerows([[percentile]]) test_rew += reward.cpu().numpy()[0, 0] for i, finished in enumerate(done): if finished: print('he died') percentile = infos[i]['x_pos'] / norm_pos eval_episode_rewards.append(percentile) with open(eval_file, 'a', newline='') as sfile: writer = csv.writer(sfile) writer.writerows([[test_rew, percentile]]) finish_this = True #to prevent the agent from getting stuck positions.append(infos[0]['x_pos']) pos_ar = np.array(positions) if (len(positions) >= 200) and (pos_ar < pos_ar[-1] + 20).all( ) and (pos_ar > pos_ar[-1] - 20).all(): print("he's stuck") percentile = infos[0]['x_pos'] / norm_pos eval_episode_rewards.append(percentile) with open(eval_file, 'a', newline='') as sfile: 
writer = csv.writer(sfile) writer.writerows([[test_rew, percentile]]) finish_this = True eval_envs.close() positions.clear() print( " Evaluation using {} episodes: reward {:.3f}, distance {:.3f}\n" .format(len(eval_episode_rewards), test_rew, np.mean(eval_episode_rewards))) test_rew = 0 finish_this = False if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_env_steps) except IOError: pass
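# --- Illustrative sketch of the intrinsic-reward term added above: the scaled
# squared prediction error of the ICM forward model,
# r_int = (eta / 2) * ||phi_hat(s') - phi(s')||^2, computed per parallel env.
# The feature tensors are random stand-ins for the curiosity module's outputs.
import torch

eta = 0.2
pred_phi = torch.randn(4, 256)    # forward-model prediction of next-state features
actual_phi = torch.randn(4, 256)  # encoder features of the actual next state
intrinsic = eta * (pred_phi - actual_phi).pow(2).sum(-1, keepdim=True) / 2
print(intrinsic.shape)  # torch.Size([4, 1]) -- one bonus per environment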
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:" + str(args.cuda_id) if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) ########## file related filename = args.env_name + "_" + args.algo + "_n" + str(args.max_episodes) if args.attack: filename += "_" + args.type + "_" + args.aim filename += "_s" + str(args.stepsize) + "_m" + str( args.maxiter) + "_r" + str(args.radius) + "_f" + str(args.frac) if args.run >= 0: filename += "_run" + str(args.run) logger = get_log(args.logdir + filename + "_" + current_time) logger.info(args) rew_file = open(args.resdir + filename + ".txt", "w") if args.compute: radius_file = open( args.resdir + filename + "_radius" + "_s" + str(args.stepsize) + "_m" + str(args.maxiter) + "_th" + str(args.dist_thres) + ".txt", "w") if args.type == "targ" or args.type == "fgsm": targ_file = open(args.resdir + filename + "_targ.txt", "w") num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes if args.type == "wb": attack_net = WbAttacker(agent, envs, int(args.frac * num_updates), num_updates, args, device=device) if args.type == "bb": attack_net = BbAttacker(agent, envs, int(args.frac * num_updates), num_updates, args, device=device) elif args.type == "rand": attack_net = RandAttacker(envs, radius=args.radius, frac=args.frac, maxat=int(args.frac * num_updates), device=device) elif args.type == "semirand": attack_net = WbAttacker(agent, envs, int(args.frac * num_updates), num_updates, args, device, rand_select=True) elif args.type == "targ": if isinstance(envs.action_space, Discrete): action_dim = envs.action_space.n target_policy = action_dim - 1 elif isinstance(envs.action_space, Box): action_dim = envs.action_space.shape[0] target_policy = torch.zeros(action_dim) # target_policy[-1] = 1 print("target policy is", target_policy) attack_net = TargAttacker(agent, envs, int(args.frac * num_updates), 
num_updates, target_policy, args, device=device) elif args.type == "fgsm": if isinstance(envs.action_space, Discrete): action_dim = envs.action_space.n target_policy = action_dim - 1 elif isinstance(envs.action_space, Box): action_dim = envs.action_space.shape[0] target_policy = torch.zeros(action_dim) def targ_policy(obs): return target_policy attack_net = FGSMAttacker(envs, agent, targ_policy, radius=args.radius, frac=args.frac, maxat=int(args.frac * num_updates), device=device) # if args.aim == "obs" or aim == "hybrid": # obs_space = gym.make(args.env_name).observation_space # attack_net.set_obs_range(obs_space.low, obs_space.high) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) episode = 0 start = time.time() for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions if args.type == "fgsm": # print("before", rollouts.obs[step]) rollouts.obs[step] = attack_net.attack( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]).clone() # print("after", rollouts.obs[step]) with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) if args.type == "targ" or args.type == "fgsm": if isinstance(envs.action_space, Discrete): num_target = ( action == target_policy).nonzero()[:, 0].size()[0] targ_file.write( str(num_target / args.num_processes) + "\n") print("percentage of target:", num_target / args.num_processes) elif isinstance(envs.action_space, Box): target_action = target_policy.repeat(action.size()[0], 1) targ_file.write( str( torch.norm(action - target_action).item() / args.num_processes) + "\n") print("percentage of target:", torch.sum(action).item() / args.num_processes) # Obser reward and next obs obs, reward, done, infos = envs.step(action.cpu()) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # rew_file.write("episode: {}, total reward: {}\n".format(episode, info['episode']['r'])) episode += 1 # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) if args.attack and args.type != "fgsm": if args.aim == "reward": logger.info(rollouts.rewards.flatten()) rollouts.rewards = attack_net.attack_r_general( rollouts, next_value).clone().detach() logger.info("after attack") logger.info(rollouts.rewards.flatten()) elif args.aim == "obs": origin = rollouts.obs.clone() rollouts.obs = attack_net.attack_s_general( rollouts, next_value).clone().detach() logger.info(origin) logger.info("after") logger.info(rollouts.obs) elif args.aim == "action": origin = torch.flatten(rollouts.actions).clone() rollouts.actions = attack_net.attack_a_general( rollouts, next_value).clone().detach() logger.info("attack value") logger.info(torch.flatten(rollouts.actions) - origin) elif args.aim == "hybrid": res_aim, attack = attack_net.attack_hybrid( rollouts, next_value, args.radius_s, args.radius_a, args.radius_r) print("attack ", res_aim) if res_aim == "obs": origin = rollouts.obs.clone() rollouts.obs = attack.clone().detach() logger.info(origin) logger.info("attack obs") logger.info(rollouts.obs) elif res_aim == "action": origin = torch.flatten(rollouts.actions).clone() rollouts.actions = attack.clone().detach() logger.info("attack action") logger.info(torch.flatten(rollouts.actions) - origin) elif res_aim == "reward": logger.info(rollouts.rewards.flatten()) rollouts.rewards = attack.clone().detach() logger.info("attack reward") logger.info(rollouts.rewards.flatten()) if args.compute: stable_radius = attack_net.compute_radius(rollouts, next_value) print("stable radius:", stable_radius) radius_file.write("update: {}, radius: {}\n".format( j, np.round(stable_radius, decimals=3))) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) if args.attack and args.type == "bb": attack_net.learning(rollouts) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0 and len(episode_rewards) >= 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), 
dist_entropy, value_loss, action_loss)) rew_file.write("updates: {}, mean reward: {}\n".format( j, np.mean(episode_rewards))) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device) # if episode > args.max_episodes: # print("reach episodes limit") # break if args.attack: logger.info("total attacks: {}\n".format(attack_net.attack_num)) print("total attacks: {}\n".format(attack_net.attack_num)) rew_file.close() if args.compute: radius_file.close() if args.type == "targ" or args.type == "fgsm": targ_file.close()
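# --- Illustrative sketch of how the fraction of actions steered to the attacker's
# target policy is measured above for a discrete action space; the sampled actions
# here are random placeholders.
import torch

num_processes = 8
target_action = 3                                  # attacker's preferred action index
actions = torch.randint(0, 4, (num_processes, 1))  # actions chosen by the agent
num_target = (actions == target_action).sum().item()
print(num_target / num_processes)                  # fraction matching the target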
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") ## Make environments envs = make_vec_envs(args, device) ## Setup Policy / network architecture if args.load_path != '': if os.path.isfile(os.path.join(args.load_path, "best_model.pt")): import_name = "best_model.pt" else: import_name = "model.pt" online_actor_critic = torch.load( os.path.join(args.load_path, import_name)) target_actor_critic = torch.load( os.path.join(args.load_path, import_name)) if args.cuda: target_actor_critic = target_actor_critic.cuda() online_actor_critic = online_actor_critic.cuda() else: online_actor_critic = Policy(occ_obs_shape, sign_obs_shape, args.state_rep, envs.action_space, args.recurrent_policy) online_actor_critic.to(device) target_actor_critic = Policy(occ_obs_shape, sign_obs_shape, args.state_rep, envs.action_space, args.recurrent_policy) target_actor_critic.to(device) target_actor_critic.load_state_dict(online_actor_critic.state_dict()) if args.penetration_type == "constant": target_actor_critic = online_actor_critic ## Choose algorithm to use if args.algo == 'a2c': agent = algo.A2C_ACKTR(online_actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(online_actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(online_actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) ## Initiate memory buffer rollouts = RolloutStorage(args.num_steps, args.num_processes, occ_obs_shape, sign_obs_shape, envs.action_space, target_actor_critic.recurrent_hidden_state_size) ## Start env with first observation occ_obs, sign_obs = envs.reset() if args.state_rep == 'full': rollouts.occ_obs[0].copy_(occ_obs) rollouts.sign_obs[0].copy_(sign_obs) rollouts.to(device) # Last 20 rewards - can set different queue length for different averaging episode_rewards = deque(maxlen=args.num_steps) reward_track = [] best_eval_rewards = 0 start = time.time() ## Loop over every policy update for j in range(num_updates): ## Setup parameter decays if args.use_linear_lr_decay: # decrease learning rate linearly if args.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: update_linear_schedule(agent.optimizer, j, num_updates, args.lr) if args.algo == 'ppo' and args.use_linear_clip_decay: agent.clip_param = args.clip_param * (1 - j / float(num_updates)) ## Loop over num_steps environment updates to form trajectory for step in range(args.num_steps): # Sample actions with torch.no_grad(): # Pass observation through network and get outputs value, action, action_log_prob, recurrent_hidden_states = target_actor_critic.act( rollouts.occ_obs[step], rollouts.sign_obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Do action in environment and save reward occ_obs, sign_obs, reward, done, _ = envs.step(action) episode_rewards.append(reward.numpy()) # Masks the processes which are done masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) # Insert 
step information in buffer rollouts.insert(occ_obs, sign_obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) ## Get state value of current env state with torch.no_grad(): next_value = target_actor_critic.get_value( rollouts.occ_obs[-1], rollouts.sign_obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() ## Computes the num_step return (next_value approximates reward after num_step) see Supp Material of https://arxiv.org/pdf/1804.02717.pdf ## Can use Generalized Advantage Estimation rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) # Update the policy with the rollouts value_loss, action_loss, dist_entropy = agent.update(rollouts) # Clean the rollout by cycling last elements to first ones rollouts.after_update() if (args.penetration_type == "linear") and (j % update_period == 0): target_actor_critic.load_state_dict( online_actor_critic.state_dict()) ## Save model if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": # A really ugly way to save a model to CPU save_model = target_actor_critic if args.cuda: save_model = copy.deepcopy(target_actor_critic).cpu() torch.save(save_model, os.path.join(save_path, "model.pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if args.vis: # Add the average reward of update to reward tracker reward_track.append(np.mean(episode_rewards)) ## Log progress if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.3f}/{:.3f}, min/max reward {:.3f}/{:.3f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy)) ## Evaluate model on new environments for 10 rewards percentage = 100 * total_num_steps // args.num_env_steps if (args.eval_interval is not None and percentage > 1 and (j % args.eval_interval == 0 or j == num_updates - 1)): print("###### EVALUATING #######") args_eval = copy.deepcopy(args) args_eval.num_processes = 1 eval_envs = make_vec_envs(args_eval, device, no_logging=True) eval_episode_rewards = [] occ_obs, sign_obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( args_eval.num_processes, target_actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args_eval.num_processes, 1, device=device) while len(eval_episode_rewards) < 3000: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = target_actor_critic.act( occ_obs, sign_obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Observe reward and next obs occ_obs, sign_obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) eval_episode_rewards.append(reward) eval_envs.close() if np.mean(eval_episode_rewards) > best_eval_rewards: best_eval_rewards = np.mean(eval_episode_rewards) save_model = target_actor_critic if args.cuda: save_model = copy.deepcopy(target_actor_critic).cpu() torch.save(save_model, os.path.join(save_path, 'best_model.pt')) ## Visualize tracked rewards (over num_steps) over time if args.vis: visualize(reward_track, args.algo, save_path)
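# --- Illustrative sketch (an assumption about what rollouts.compute_returns does
# when use_gae is enabled, not the RolloutStorage implementation): the Generalized
# Advantage Estimation recursion referenced in the comment above, with masks[t]
# set to 0.0 when the episode ended at step t.
import torch

def gae_returns(rewards, values, masks, next_value, gamma=0.99, lam=0.95):
    # rewards, values, masks: [T, N, 1]; next_value: [N, 1]
    T = rewards.size(0)
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)  # [T + 1, N, 1]
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * lam * masks[t] * gae
        returns[t] = gae + values[t]
    return returns

T, N = 5, 2
out = gae_returns(torch.randn(T, N, 1), torch.randn(T, N, 1),
                  torch.ones(T, N, 1), torch.randn(N, 1))
print(out.shape)  # torch.Size([5, 2, 1])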
def learn(env, max_timesteps, timesteps_per_batch, clip_param): ppo_epoch = 5 num_step = timesteps_per_batch save_interval = 100 seed = 1000 batch_size = 64 torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) log_dir = os.path.expanduser('/tmp/gym/') eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda") envs = make_vec_envs(env, seed, 8, 0.95, log_dir, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': False}) actor_critic.to(device) agent = algo.PPO(actor_critic, clip_param, ppo_epoch, batch_size, 0.5, 0.01, lr=0.00025, eps=1e-05, max_grad_norm=0.5) rollouts = RolloutStorage(num_step, 8, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(torch.tensor(obs)) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int(max_timesteps) // num_step // 8 for j in range(num_updates): # decrease learning rate linearly utils.update_linear_schedule(agent.optimizer, j, num_updates, 0.00025) for step in range(num_step): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, True, 0.99, 0.95, False) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % save_interval == 0 or j == num_updates - 1) and "./trained_models/" != "": save_path = os.path.join("./trained_models/", 'ppo') try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, 'UniversalPolicy' + ".pt")) if j % 1 == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * 8 * num_step end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device) '''
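# --- Illustrative sketch of the rolling episode-reward window used throughout
# these loops: only the last 10 completed episodes feed the mean/median/min/max
# console summary.
from collections import deque
import numpy as np

episode_rewards = deque(maxlen=10)
for r in [1.0, 5.0, 3.0, 8.0, 2.0, 9.0, 4.0, 7.0, 6.0, 0.0, 10.0]:
    episode_rewards.append(r)  # the oldest entry is evicted after 10 episodes

print(np.mean(episode_rewards), np.median(episode_rewards))  # 5.4 5.5
print(np.min(episode_rewards), np.max(episode_rewards))      # 0.0 10.0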
def main(): args = get_args() import random random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) np.random.seed(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True logdir = args.env_name + '_' + args.algo + '_num_arms_' + str( args.num_processes) + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") if args.use_privacy: logdir = logdir + '_privacy' elif args.use_noisygrad: logdir = logdir + '_noisygrad' elif args.use_pcgrad: logdir = logdir + '_pcgrad' elif args.use_testgrad: logdir = logdir + '_testgrad' elif args.use_median_grad: logdir = logdir + '_mediangrad' logdir = os.path.join('runs', logdir) logdir = os.path.join(os.path.expanduser(args.log_dir), logdir) utils.cleanup_log_dir(logdir) # Ugly but simple logging log_dict = { 'task_steps': args.task_steps, 'grad_noise_ratio': args.grad_noise_ratio, 'max_task_grad_norm': args.max_task_grad_norm, 'use_noisygrad': args.use_noisygrad, 'use_pcgrad': args.use_pcgrad, 'use_testgrad': args.use_testgrad, 'use_testgrad_median': args.use_testgrad_median, 'testgrad_quantile': args.testgrad_quantile, 'median_grad': args.use_median_grad, 'use_meanvargrad': args.use_meanvargrad, 'meanvar_beta': args.meanvar_beta, 'no_special_grad_for_critic': args.no_special_grad_for_critic, 'use_privacy': args.use_privacy, 'seed': args.seed, 'recurrent': args.recurrent_policy, 'obs_recurrent': args.obs_recurrent, 'cmd': ' '.join(sys.argv[1:]) } for eval_disp_name, eval_env_name in EVAL_ENVS.items(): log_dict[eval_disp_name] = [] summary_writer = SummaryWriter() summary_writer.add_hparams( { 'task_steps': args.task_steps, 'grad_noise_ratio': args.grad_noise_ratio, 'max_task_grad_norm': args.max_task_grad_norm, 'use_noisygrad': args.use_noisygrad, 'use_pcgrad': args.use_pcgrad, 'use_testgrad': args.use_testgrad, 'use_testgrad_median': args.use_testgrad_median, 'testgrad_quantile': args.testgrad_quantile, 'median_grad': args.use_median_grad, 'use_meanvargrad': args.use_meanvargrad, 'meanvar_beta': args.meanvar_beta, 'no_special_grad_for_critic': args.no_special_grad_for_critic, 'use_privacy': args.use_privacy, 'seed': args.seed, 'recurrent': args.recurrent_policy, 'obs_recurrent': args.obs_recurrent, 'cmd': ' '.join(sys.argv[1:]) }, {}) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") print('making envs...') envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, steps=args.task_steps, free_exploration=args.free_exploration, recurrent=args.recurrent_policy, obs_recurrent=args.obs_recurrent, multi_task=True) val_envs = make_vec_envs(args.val_env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, steps=args.task_steps, free_exploration=args.free_exploration, recurrent=args.recurrent_policy, obs_recurrent=args.obs_recurrent, multi_task=True) eval_envs_dic = {} for eval_disp_name, eval_env_name in EVAL_ENVS.items(): eval_envs_dic[eval_disp_name] = make_vec_envs( eval_env_name[0], args.seed, args.num_processes, None, logdir, device, True, steps=args.task_steps, recurrent=args.recurrent_policy, obs_recurrent=args.obs_recurrent, multi_task=True, free_exploration=args.free_exploration) prev_eval_r = {} print('done') if args.hard_attn: actor_critic = Policy(envs.observation_space.shape, envs.action_space, base=MLPHardAttnBase, base_kwargs={ 'recurrent': args.recurrent_policy or args.obs_recurrent }) else: actor_critic = 
Policy(envs.observation_space.shape, envs.action_space, base=MLPAttnBase, base_kwargs={ 'recurrent': args.recurrent_policy or args.obs_recurrent }) actor_critic.to(device) if (args.continue_from_epoch > 0) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) actor_critic_, loaded_obs_rms_ = torch.load( os.path.join( save_path, args.env_name + "-epoch-{}.pt".format(args.continue_from_epoch))) actor_critic.load_state_dict(actor_critic_.state_dict()) if args.algo != 'ppo': raise "only PPO is supported" agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, num_tasks=args.num_processes, attention_policy=False, max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay) val_agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.val_lr, eps=args.eps, num_tasks=args.num_processes, attention_policy=True, max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) val_rollouts = RolloutStorage(args.num_steps, args.num_processes, val_envs.observation_space.shape, val_envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) val_obs = val_envs.reset() val_rollouts.obs[0].copy_(val_obs) val_rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes save_copy = True for j in range(args.continue_from_epoch, args.continue_from_epoch + num_updates): # policy rollouts for step in range(args.num_steps): # Sample actions actor_critic.eval() with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) actor_critic.train() # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) for k, v in info['episode'].items(): summary_writer.add_scalar( f'training/{k}', v, j * args.num_processes * args.num_steps + args.num_processes * step) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) actor_critic.eval() with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() actor_critic.train() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) if save_copy: prev_weights = copy.deepcopy(actor_critic.state_dict()) prev_opt_state = copy.deepcopy(agent.optimizer.state_dict()) prev_val_opt_state = copy.deepcopy( val_agent.optimizer.state_dict()) save_copy = False value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # validation rollouts for val_iter in range(args.val_agent_steps): for step in range(args.num_steps): # Sample actions actor_critic.eval() with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( val_rollouts.obs[step], val_rollouts.recurrent_hidden_states[step], val_rollouts.masks[step]) actor_critic.train() # Obser reward and next obs obs, reward, done, infos = val_envs.step(action) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) val_rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) actor_critic.eval() with torch.no_grad(): next_value = actor_critic.get_value( val_rollouts.obs[-1], val_rollouts.recurrent_hidden_states[-1], val_rollouts.masks[-1]).detach() actor_critic.train() val_rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) val_value_loss, val_action_loss, val_dist_entropy = val_agent.update( val_rollouts) val_rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'obs_rms', None) ], os.path.join(save_path, args.env_name + "-epoch-{}.pt".format(j))) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) revert = False if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): actor_critic.eval() obs_rms = utils.get_vec_normalize(envs).obs_rms eval_r = {} printout = f'Seed {args.seed} Iter {j} ' for eval_disp_name, eval_env_name in EVAL_ENVS.items(): eval_r[eval_disp_name] = evaluate( actor_critic, obs_rms, eval_envs_dic, eval_disp_name, args.seed, args.num_processes, eval_env_name[1], logdir, device, steps=args.task_steps, recurrent=args.recurrent_policy, obs_recurrent=args.obs_recurrent, multi_task=True, 
free_exploration=args.free_exploration) if eval_disp_name in prev_eval_r: diff = np.array(eval_r[eval_disp_name]) - np.array( prev_eval_r[eval_disp_name]) if eval_disp_name == 'many_arms': if np.sum(diff > 0) - np.sum( diff < 0) < args.val_improvement_threshold: print('no update') revert = True summary_writer.add_scalar(f'eval/{eval_disp_name}', np.mean(eval_r[eval_disp_name]), (j + 1) * args.num_processes * args.num_steps) log_dict[eval_disp_name].append([ (j + 1) * args.num_processes * args.num_steps, eval_r[eval_disp_name] ]) printout += eval_disp_name + ' ' + str( np.mean(eval_r[eval_disp_name])) + ' ' # summary_writer.add_scalars('eval_combined', eval_r, (j+1) * args.num_processes * args.num_steps) if revert: actor_critic.load_state_dict(prev_weights) agent.optimizer.load_state_dict(prev_opt_state) val_agent.optimizer.load_state_dict(prev_val_opt_state) else: print(printout) prev_eval_r = eval_r.copy() save_copy = True actor_critic.train() save_obj(log_dict, os.path.join(logdir, 'log_dict.pkl')) envs.close() val_envs.close() for eval_disp_name, eval_env_name in EVAL_ENVS.items(): eval_envs_dic[eval_disp_name].close()
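# The training loops in this file call
# rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, ...)
# before each PPO update. A simplified sketch of the GAE(lambda) recursion that
# call is expected to perform follows (tensor shapes and the omission of the
# bad_masks / proper-time-limit correction are assumptions of this sketch).
import torch

def compute_gae_returns_sketch(rewards, values, next_value, masks, gamma, gae_lambda):
    """rewards, masks, values: [T, N, 1]; next_value: [N, 1].
    masks[t] is 0.0 where the episode terminated at step t, else 1.0."""
    T = rewards.size(0)
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)  # [T + 1, N, 1]
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for step in reversed(range(T)):
        # TD error; no bootstrapping across episode boundaries (mask = 0).
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * gae_lambda * masks[step] * gae
        returns[step] = gae + values[step]
    return returns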
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args_iko.cuda else "cpu") if args_iko.vis: from visdom import Visdom viz = Visdom(port=args_iko.port) win = None envs = make_vec_envs(args_iko.env_name, args_iko.seed, args_iko.num_processes, args_iko.gamma, args_iko.log_dir, args_iko.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args_iko.recurrent_policy}) actor_critic.to(device) action_shape = 3 reward_model = RewardModel(11 * 11 * 6, 1, 64, 64) reward_model.to(device) if args_iko.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args_iko.value_loss_coef, args_iko.entropy_coef, lr=args_iko.lr, eps=args_iko.eps, alpha=args_iko.alpha, max_grad_norm=args_iko.max_grad_norm) elif args_iko.algo == 'ppo': agent = algo.PPO(actor_critic, args_iko.clip_param, args_iko.ppo_epoch, args_iko.num_mini_batch, args_iko.value_loss_coef, args_iko.entropy_coef, args_iko.use_singh, reward_model, lr=args_iko.lr, eps=args_iko.eps, max_grad_norm=args_iko.max_grad_norm) elif args_iko.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args_iko.value_loss_coef, args_iko.entropy_coef, acktr=True) rollouts = RolloutStorage(args_iko.num_steps, args_iko.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() for j in range(num_updates): if args_iko.use_linear_lr_decay: # decrease learning rate linearly if args_iko.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: update_linear_schedule(agent.optimizer, j, num_updates, args_iko.lr) if args_iko.algo == 'ppo' and args_iko.use_linear_clip_decay: agent.clip_param = args_iko.clip_param * (1 - j / float(num_updates)) reward_train = [] reward_block_penalty = [] reward_bel_gt = [] reward_bel_gt_nonlog = [] reward_infogain = [] reward_bel_ent = [] reward_hit = [] reward_dist = [] reward_inv_dist = [] for step in range(args_iko.num_steps): # Sample actions # print(step, args_iko.num_steps) with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) reward_train.append(reward) # print("infos is ", infos) # reward_b.append(infos[0]['auxillary_reward']) # print("infos is ",infos[0]['auxillary_reward']) reward_block_penalty.append(infos[0]['reward_block_penalty']) reward_bel_gt.append(infos[0]['reward_bel_gt']) reward_bel_gt_nonlog.append(infos[0]['reward_bel_gt_nonlog']) reward_infogain.append(infos[0]['reward_infogain']) reward_bel_ent.append(infos[0]['reward_bel_ent']) reward_hit.append(infos[0]['reward_hit']) reward_dist.append(infos[0]['reward_dist']) reward_inv_dist.append(infos[0]['reward_inv_dist']) # print(reward) reward.to(device) reward_model.to(device) if args_iko.use_singh: # print("using learning IR") my_reward = reward_model(obs.clone().to(device), action.clone().float()).detach() my_reward.to(device) reward = reward + args_iko.singh_coef * my_reward.type( torch.FloatTensor) # for info in infos: # if 'episode' in info.keys(): # episode_rewards.append(info['episode']['r']) # print("infos is ",infos[0]['auxillary_reward']) # print("info is",info['episode']['r'] ) # If done then clean the history of 
observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) # print("mean reward_a", np.mean(reward_train)) # print("mean reward_block_penalty", np.mean(reward_block_penalty)) # print("mean reward_bel_gt", np.mean(reward_bel_gt)) # print("mean reward_bel_gt_nonlog", np.mean(reward_bel_gt_nonlog)) # print("mean reward_infogain", np.mean(reward_infogain)) # print("mean reward_bel_ent", np.mean(reward_bel_ent)) # print("mean reward_hit", np.mean(reward_hit)) # print("mean reward_dist", np.mean(reward_dist)) # print("mean reward_inv_dist", np.mean(reward_inv_dist)) total_num_steps = (j + 1) * args_iko.num_processes * args_iko.num_steps writer.add_scalar('mean_reward_train', np.mean(reward_train), total_num_steps) writer.add_scalar('mean_reward_block_penalty', np.mean(reward_block_penalty), total_num_steps) writer.add_scalar('mean_reward_bel_gt', np.mean(reward_bel_gt), total_num_steps) writer.add_scalar('mean_reward_bel_gt_nonlog', np.mean(reward_bel_gt_nonlog), total_num_steps) writer.add_scalar('mean_reward_infogain', np.mean(reward_infogain), total_num_steps) writer.add_scalar('mean_reward_bel_ent', np.mean(reward_bel_ent), total_num_steps) writer.add_scalar('mean_reward_hit', np.mean(reward_hit), total_num_steps) writer.add_scalar('mean_reward_dist', np.mean(reward_dist), total_num_steps) writer.add_scalar('mean_reward_inv_dist', np.mean(reward_inv_dist), total_num_steps) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args_iko.use_gae, args_iko.gamma, args_iko.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args_iko.save_interval == 0 or j == num_updates - 1) and args_iko.save_dir != "": save_path = os.path.join(args_iko.save_dir, args_iko.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args_iko.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save( save_model, os.path.join( save_path, 'ugl' + str(args_iko.use_gt_likelihood) + 'block-pen-' + str(args_iko.penalty_for_block) + '_' + 'explore-' + str(args_iko.rew_explore) + '_' + 'bel-new-' + str(args_iko.rew_bel_new) + '_' + 'bel-ent-' + str(args_iko.rew_bel_ent) + '_' + 'infogain-' + str(args_iko.rew_infogain) + '_' + 'bel-gt-nolog-' + str(args_iko.rew_bel_gt_nonlog) + '_' + 'bel-gt-' + str(args_iko.rew_bel_gt) + '_' + 'dist-' + str(args_iko.rew_dist) + '_' + 'hit-' + str(args_iko.rew_hit) + '_' + 'inv-dist-' + str(args_iko.rew_inv_dist) + args_iko.algo + ".pt")) total_num_steps = (j + 1) * args_iko.num_processes * args_iko.num_steps if j % args_iko.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print("mean reward_train", np.mean(reward_train)) # print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".
# format(j, total_num_steps, # int(total_num_steps / (end - start)), # len(episode_rewards), # np.mean(episode_rewards), # np.median(episode_rewards), # np.min(episode_rewards), # np.max(episode_rewards), dist_entropy, # value_loss, action_loss)) # writer.add_scalar('mean_reward', np.mean(episode_rewards), total_num_steps) # writer.add_scalar('min_reward', np.min(episode_rewards), total_num_steps) # writer.add_scalar('max_reward', np.max(episode_rewards), total_num_steps) # writer.add_scalar('success_rate', np.mean(episode_successes), total_num_steps) if (args_iko.eval_interval is not None and len(episode_rewards) > 1 and j % args_iko.eval_interval == 0): eval_envs = make_vec_envs(args_iko.env_name, args_iko.seed + args_iko.num_processes, args_iko.num_processes, args_iko.gamma, eval_log_dir, args_iko.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( args_iko.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args_iko.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) if args_iko.vis and j % args_iko.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args_iko.log_dir, args_iko.env_name, args_iko.algo, args_iko.num_env_steps) except IOError: pass writer.close()
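# The loop above augments the environment reward with a learned intrinsic
# reward when args_iko.use_singh is set:
#   reward = reward + args_iko.singh_coef * reward_model(obs, action)
# The real RewardModel's architecture is not shown in this file; the class
# below is only an illustrative stand-in (its name, layer sizes, and the
# flatten-and-concatenate input handling are assumptions) for a network that
# maps a flattened observation plus an action to a scalar intrinsic reward.
import torch
import torch.nn as nn

class IntrinsicRewardNetSketch(nn.Module):
    def __init__(self, obs_dim, action_dim, hidden_dim=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim + action_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, obs, action):
        # Flatten image-like observations and concatenate with the action.
        x = torch.cat([obs.flatten(start_dim=1), action], dim=1)
        return self.net(x)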
def main(env, scene_path): try: os.makedirs(args.log_dir) except OSError: files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv')) for f in files: os.remove(f) save_path = os.path.join(args.save_dir, args.algo) eval_x = [] eval_y = [] torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") initial_policies = torch.load(os.path.join(args.load_dir, args.algo, args.initial_policy + ".pt")) \ if args.initial_policy else None if args.reuse_residual: residual, ob_rms, initial_policies = initial_policies else: residual = None ob_rms = None pose_estimator = torch.load(os.path.join(args.load_dir, "pe", args.pose_estimator + ".pt")) \ if args.pose_estimator else None envs = make_vec_envs(env, scene_path, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, initial_policies, pose_estimator=pose_estimator, init_control=not args.dense_ip) if args.reuse_residual: vec_norm = get_vec_normalize(envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = ob_rms base_kwargs = {'recurrent': args.recurrent_policy} base = residual.base if args.reuse_residual else None dist = residual.dist if args.reuse_residual else None actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs=base_kwargs, zero_last_layer=True, base=base, dist=dist) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, burn_in=initial_policies is not None and not args.reuse_residual) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=64) num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes total_num_steps = 0 j = 0 max_succ = -1 max_mean_rew = -math.inf mean_ep_rew = -math.inf evals_without_improv = 0 start = time.time() start_update = start while (not use_metric and j < num_updates) or (use_metric and max_succ < args.trg_succ_rate): if args.eval_interval is not None and j % args.eval_interval == 0: print("Evaluating current policy...") i = 0 total_successes = 0 max_trials = 50 eval_recurrent_hidden_states = torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while i + args.num_processes <= max_trials: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) obs, _, dones, infos = envs.step(action) if np.all(dones): # Rigid - assumes episodes are fixed length rews = [] for info in infos: rews.append(info['rew_success']) i += args.num_processes rew = sum([int(rew > 0) for rew in rews]) total_successes += rew p_succ = (100 * total_successes / i) eval_x += [total_num_steps] eval_y += [p_succ] end = time.time() print( f"Evaluation: {total_successes} successful out of {i} episodes - " f"{p_succ:.2f}% successful. 
Eval length: {end - start_update}") torch.save([eval_x, eval_y], os.path.join(args.save_as + "_eval.pt")) start_update = end if p_succ > max_succ: max_succ = p_succ max_mean_rew = mean_ep_rew evals_without_improv = 0 elif mean_ep_rew > max_mean_rew: print("Unimproved success rate, higher reward") max_mean_rew = mean_ep_rew evals_without_improv = 0 else: evals_without_improv += 1 if evals_without_improv == 10 or max_succ >= args.trg_succ_rate: save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None), initial_policies ] extra = "_final" if evals_without_improv == 5 else "" torch.save( save_model, os.path.join(save_path, args.save_as + f"{extra}.pt")) break # save for every interval-th episode or for the last epoch if ((not use_metric and (j % args.save_interval == 0 or j == num_updates - 1)) or (use_metric and evals_without_improv == 0)) and args.save_dir != "": os.makedirs(save_path, exist_ok=True) save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() if pose_estimator is not None: save_model = [save_model, pose_estimator, initial_policies] else: save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None), initial_policies ] torch.save(save_model, os.path.join(save_path, args.save_as + ".pt")) # torch.save(save_model, os.path.join(save_path, args.save_as + f"{j * args.num_processes * args.num_steps}.pt")) if args.use_linear_lr_decay: # decrease learning rate linearly if args.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: update_linear_schedule(agent.optimizer, j, num_updates, args.lr) if args.algo == 'ppo' and args.use_linear_clip_decay: agent.clip_param = args.clip_param * (1 - j / float(num_updates)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: mean_ep_rew = np.mean(episode_rewards) if mean_ep_rew > max_mean_rew: print("Improved max mean reward") max_mean_rew = mean_ep_rew evals_without_improv = 0 end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), mean_ep_rew, np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) print("Update length: ", end - start_update) start_update = end if args.vis and (j % args.vis_interval == 0 or (not use_metric and j == num_updates - 1)): try: # Sometimes monitor doesn't properly flush the outputs visdom_plot(args.log_dir, args.save_as, args.algo, total_num_steps) except IOError: pass j += 1 if use_metric: if max_succ >= args.trg_succ_rate: print( f"Achieved greater than {args.trg_succ_rate}% success, advancing curriculum." ) else: print( f"Policy converged with max success rate < {args.trg_succ_rate}%" ) # Copy logs to permanent location so new graphs can be drawn. copy_tree(args.log_dir, os.path.join('logs', args.save_as)) envs.close() return total_num_steps
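# The evaluation block above runs the current policy deterministically and
# counts episodes whose 'rew_success' info entry is positive. The compact
# sketch below isolates that success-rate computation; it assumes, as the
# "Rigid" comment above does, that all vectorized episodes are fixed length
# and finish together, and that envs.reset()/step() return device tensors.
import numpy as np
import torch

def eval_success_rate_sketch(envs, actor_critic, num_processes, device, max_trials=50):
    obs = envs.reset()
    hxs = torch.zeros(num_processes, actor_critic.recurrent_hidden_state_size, device=device)
    masks = torch.zeros(num_processes, 1, device=device)
    trials, successes = 0, 0
    while trials + num_processes <= max_trials:
        with torch.no_grad():
            _, action, _, hxs = actor_critic.act(obs, hxs, masks, deterministic=True)
        obs, _, dones, infos = envs.step(action)
        if np.all(dones):
            successes += sum(int(info['rew_success'] > 0) for info in infos)
            trials += num_processes
    return 100.0 * successes / trials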
def main(): if os.path.isdir(args.load_policy): args.load_policy = find_policy(args.load_policy) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) if args.load_policy is not None: actor_critic, ob_rms = torch.load(args.load_policy) vec_norm = get_vec_normalize(envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = ob_rms else: actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque( maxlen=(args.num_processes if args.num_processes > 10 else 10)) start = time.time() snapshot_counter = 0 last_delete = -1 try: os.makedirs(os.path.join(args.save_dir, args.algo)) except OSError: pass log_out_file = open(os.path.join(args.save_dir, args.algo, 'log_info.txt'), 'w') for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly if args.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: update_linear_schedule(agent.optimizer, j, num_updates, args.lr) if args.algo == 'ppo' and args.use_linear_clip_decay: agent.clip_param = args.clip_param * (1 - j / float(num_updates)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save( save_model, os.path.join(save_path, args.env_name + "epoch_{:07d}.pt".format(j))) snapshot_counter += 1 last_delete += 1 if snapshot_counter > 100: os.system('rm ' + os.path.join( save_path, args.env_name + 'epoch_{:07d}.py'.format(last_delete))) snapshot_counter -= 1 total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() log_info = "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".\ format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss) print(log_info) sys.stdout.flush() log_out_file.write(log_info) log_out_file.flush() if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) log_out_file.write( " Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) log_out_file.flush() sys.stdout.flush() if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_env_steps) except IOError: pass
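# Evaluation above reuses the training envs' observation-normalization
# statistics: the eval VecNormalize wrapper is switched to eval mode and its
# running mean/std (ob_rms) is overwritten with the training one. A small
# helper capturing that pattern is sketched below; get_vec_normalize is the
# project's own utility, and the wrapper's behaviour is assumed from how it
# is used above rather than from its definition.
def sync_obs_normalization_sketch(train_envs, eval_envs):
    vec_norm = get_vec_normalize(eval_envs)
    if vec_norm is not None:
        vec_norm.eval()  # freeze running statistics during evaluation
        vec_norm.ob_rms = get_vec_normalize(train_envs).ob_rms
    return eval_envs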
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") receipts = StorageReceipt() make_env = lambda tasks: MiniWoBGraphEnvironment( base_url=os.environ.get("BASE_URL", f"file://{MINIWOB_HTML}/"), levels=tasks, level_tracker=LevelTracker(tasks), wait_ms=500, ) task = args.env_name if args.env_name == "PongNoFrameskip-v4": args.env_name = "clickbutton" task = "miniwob/click-button.html" if task == "levels": tasks = MINIWOB_CHALLENGES else: tasks = [[task]] print("Selected tasks:", tasks) NUM_ACTIONS = 1 envs = make_vec_envs( [make_env(tasks[i % len(tasks)]) for i in range(args.num_processes)], receipts) if os.path.exists("./datadir/autoencoder.pt"): dom_autoencoder = torch.load("./datadir/autoencoder.pt") dom_encoder = dom_autoencoder.encoder for param in dom_encoder.parameters(): param.requires_grad = False else: print("No dom encoder") dom_encoder = None actor_critic = Policy( envs.observation_space.shape, gym.spaces.Discrete(NUM_ACTIONS), # envs.action_space, base=GNNBase, base_kwargs={ "dom_encoder": dom_encoder, "recurrent": args.recurrent_policy }, ) actor_critic.dist = NodeObjective() actor_critic.to(device) if args.algo == "a2c": agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm, ) elif args.algo == "ppo": agent = algo.PPO( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, ) elif args.algo == "acktr": agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator(envs.observation_space.shape[0], 100, device) rr = ReplayRepository("/code/miniwob-plusplus-demos/*turk/*") ds = rr.get_dataset() print("GAIL Replay Dataset", ds) gail_train_loader = torch_geometric.data.DataLoader( ds, batch_size=args.gail_batch_size, shuffle=True, drop_last=True) from tensorboardX import SummaryWriter import datetime ts_str = datetime.datetime.fromtimestamp( time.time()).strftime("%Y-%m-%d_%H-%M-%S") tensorboard_writer = SummaryWriter( log_dir=os.path.join("/tmp/log", ts_str)) rollouts = ReceiptRolloutStorage( args.num_steps, args.num_processes, (1, ), # envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, receipts, ) # resume from last save if args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass model_path = os.path.join(save_path, args.env_name + ".pt") if False and os.path.exists(model_path): print("Loadng previous model:", model_path) actor_critic = torch.load(model_path) actor_critic.train() obs = envs.reset() rollouts.obs[0].copy_(torch.tensor(obs)) rollouts.to(device) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes print("Iterations:", num_updates, args.num_steps) for j in range(num_updates): episode_rewards = deque(maxlen=args.num_steps * args.num_processes) if j and last_action_time + 5 < 
time.time(): # task likely timed out print("Reseting tasks") obs = envs.reset() rollouts.obs[0].copy_(torch.tensor(obs)) rollouts.recurrent_hidden_states[0].copy_( torch.zeros_like(rollouts.recurrent_hidden_states[0])) rollouts.masks[0].copy_(torch.zeros_like(rollouts.masks[0])) if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr, ) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( receipts.redeem(rollouts.obs[step]), rollouts.recurrent_hidden_states[step], rollouts.masks[step], ) # Obser reward and next obs last_action_time = time.time() obs, reward, done, infos = envs.step(action) for e, i in enumerate(infos): if i.get("real_action") is not None: action[e] = i["real_action"] if i.get("bad_transition"): action[e] = torch.zeros_like(action[e]) for info in infos: if "episode" in info.keys(): episode_rewards.append(info["episode"]["r"]) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if "bad_transition" in info.keys() else [1.0] for info in infos]) rollouts.insert( torch.tensor(obs), recurrent_hidden_states, action, action_log_prob, value, torch.tensor(reward).unsqueeze(1), masks, bad_masks, ) with torch.no_grad(): next_value = actor_critic.get_value( receipts.redeem(rollouts.obs[-1]), rollouts.recurrent_hidden_states[-1], rollouts.masks[-1], ).detach() if args.gail: # if j >= 10: # envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): obsfilt = lambda x, update: x # utils.get_vec_normalize(envs)._obfilt gl = discr.update(gail_train_loader, rollouts, obsfilt) print("Gail loss:", gl) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( receipts.redeem(rollouts.obs[step]), rollouts.actions[step], args.gamma, rollouts.masks[step], ) rollouts.compute_returns( next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits, ) value_loss, action_loss, dist_entropy = agent.update(rollouts) obs_shape = rollouts.obs.size()[2:] obs = rollouts.obs[:-1].view(-1, *obs_shape) obs = obs[torch.randint(0, obs.size(0), (1, 32))] rollouts.after_update() receipts.prune(rollouts.obs) # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass model_path = os.path.join(save_path, args.env_name + ".pt") torch.save(actor_critic, model_path) print("Saved model:", model_path) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format( j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss, )) from pprint import pprint pprint(LevelTracker.global_scoreboard) # tensorboard_writer.add_histogram( # "task_ranks", torch.tensor(predictor._difficulty_rank), total_num_steps # ) 
tensorboard_writer.add_histogram("value", value, total_num_steps) tensorboard_writer.add_histogram("x", actor_critic.base.last_x, total_num_steps) tensorboard_writer.add_histogram("query", actor_critic.base.last_query, total_num_steps) tensorboard_writer.add_histogram("inputs_at", actor_critic.base.last_inputs_at, total_num_steps) tensorboard_writer.add_scalar("mean_reward", np.mean(episode_rewards), total_num_steps) tensorboard_writer.add_scalar("median_reward", np.median(episode_rewards), total_num_steps) tensorboard_writer.add_scalar("min_reward", np.min(episode_rewards), total_num_steps) tensorboard_writer.add_scalar("max_reward", np.max(episode_rewards), total_num_steps) tensorboard_writer.add_scalar("dist_entropy", dist_entropy, total_num_steps) tensorboard_writer.add_scalar("value_loss", value_loss, total_num_steps) tensorboard_writer.add_scalar("action_loss", action_loss, total_num_steps) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate( actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device, )
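# With --gail enabled, the update above first trains the discriminator (with
# a longer warm-up for the first ten policy updates) and then overwrites the
# environment rewards stored in the rollouts with discriminator-predicted
# rewards before computing returns. The condensed sketch below reuses the
# Discriminator methods exactly as they are called above; note that in the
# MiniWoB script the observations are additionally redeemed through the
# StorageReceipt store before being passed to the discriminator, which this
# generic sketch omits.
def gail_relabel_rewards_sketch(discr, gail_train_loader, rollouts, num_steps,
                                gamma, update_idx, gail_epoch=5, warmup_epoch=100):
    epochs = warmup_epoch if update_idx < 10 else gail_epoch
    obsfilt = lambda x, update: x  # identity filter; a VecNormalize _obfilt could be used instead
    for _ in range(epochs):
        discr.update(gail_train_loader, rollouts, obsfilt)
    for step in range(num_steps):
        rollouts.rewards[step] = discr.predict_reward(
            rollouts.obs[step], rollouts.actions[step], gamma, rollouts.masks[step])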
def main(): realEval = True #False gettrace = getattr(sys, 'gettrace', None) parser = argparse.ArgumentParser(description='RL') parser.add_argument('--action-type', type=int, default=-1, help='action type to play (default: -1)') parser.add_argument('--tasks-difficulty-from', type=int, default=0, help='tasks_difficulty_from') parser.add_argument('--tasks-difficulty-to', type=int, default=100000, help='tasks-difficulty-to') parser.add_argument('--verboseLevel', type=int, default=5, help='verboseLevel') parser.add_argument('--filesNamesSuffix', default="", help='filesNamesSuffix') parser.add_argument('--nobest-exit', type=int, default=10000, help='nobest_exit') args = get_args(parser) args.algo = 'ppo' args.env_name = 'QuadruppedWalk-v1' #'RoboschoolAnt-v1' #'QuadruppedWalk-v1' #'RoboschoolAnt-v1' #'QuadruppedWalk-v1' args.use_gae = True args.num_steps = 2048 #args.num_processes = 4 args.num_processes = 4 if gettrace(): args.num_processes = 1 args.lr = 0.0001 args.entropy_coef = 0.0 args.value_loss_coef = 0.5 args.ppo_epoch = 4 args.num_mini_batch = 256 args.gamma = 0.99 args.gae_lambda = 0.95 args.clip_param = 0.2 args.use_linear_lr_decay = True #True #True #True #True args.use_proper_time_limits = True args.save_dir = "./trained_models/" + args.env_name + "/" args.load_dir = "./trained_models/" + args.env_name + "/" args.log_dir = "./logs/robot" if gettrace(): args.save_dir = "./trained_models/" + args.env_name + "debug/" args.load_dir = "./trained_models/" + args.env_name + "debug/" args.log_dir = "./logs/robot_d" args.log_interval = 30 args.hidden_size = 64 args.last_hidden_size = args.hidden_size args.recurrent_policy = False #True args.save_interval = 20 #args.seed = 1 reward_shaping = 0.01 allowMutate = False if args.seed == -1: args.seed = time.clock_gettime_ns(time.CLOCK_REALTIME) quadruppedEnv.settings.tasks_difficulty_from = args.tasks_difficulty_from quadruppedEnv.settings.tasks_difficulty_to = args.tasks_difficulty_to # 0 is a walk # 1 is a balance # 2 multitasks # 3 multitask experiments trainType = 14 filesNamesSuffix = "" if args.action_type >= 0: trainType = args.action_type makeEnvFunction = makeEnv.make_env_with_best_settings if trainType == 1: filesNamesSuffix = "balance_" makeEnvFunction = makeEnv.make_env_for_balance if trainType == 2: filesNamesSuffix = "analytical_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_analytical if trainType == 3: filesNamesSuffix = "analytical2_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_analytical2 if trainType == 4: filesNamesSuffix = "frontback_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_front_back if trainType == 5: filesNamesSuffix = "leftright_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_left_right if trainType == 6: filesNamesSuffix = "all_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_all if trainType == 7: filesNamesSuffix = "rotate_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_rotate if trainType == 8: filesNamesSuffix = "compound_" makeEnvFunction = make_env_multinetwork if trainType == 9: import pickle realEval = False allowMutate = False args.use_linear_lr_decay = True #False args.num_env_steps = 5000000 filesNamesSuffix = "test_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_test if trainType == 10: import pickle realEval = False allowMutate = False args.use_linear_lr_decay = True #False args.num_env_steps = 5000000 filesNamesSuffix = "zoo_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_test_zoo if trainType == 11: 
args.hidden_size = 128 #64 #128 args.last_hidden_size = args.hidden_size import pickle if gettrace(): args.num_processes = 1 else: args.num_processes = 8 realEval = False allowMutate = False args.lr = 0.00001 args.use_linear_lr_decay = True #False args.num_env_steps = 10000000 filesNamesSuffix = "zigote2_updown_" print("Samples preload") global samplesEnvData samplesEnvData = pickle.load( open("./QuadruppedWalk-v1_MoveNoPhys.samples", "rb")) # samplesEnvData = pickle.load( open( "./QuadruppedWalk-v1.samples", "rb" ) ) makeEnvFunction = makeSamplesEnv if trainType == 12: import pickle args.lr = 0.00001 args.hidden_size = 64 args.last_hidden_size = args.hidden_size filesNamesSuffix = "zigote2_front_back_" args.clip_param = 0.9 args.value_loss_coef = 0.9 makeEnvFunction = makeEnv.make_env_with_best_settings_for_train #makeEnvFunction = makeEnv.make_env_with_best_settings_for_record #makeEnv.samplesEnvData = pickle.load( open( "./QuadruppedWalk-v1_MoveNoPhys.samples", "rb" ) ) if trainType == 13: filesNamesSuffix = "all_bytasks_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_all if trainType == 14: #args.lr = 0.00001 #args.num_env_steps = 000000 #args.clip_param = 0.5 #args.value_loss_coef =0.8 #random.seed(time.clock_gettime_ns(time.CLOCK_REALTIME)) #args.num_steps = random.choice([256,512,1024,2048,4096]) #args.num_mini_batch = random.choice([32,64,256,512]) #args.ppo_epoch = random.choice([2,4,8,10]) #args.clip_param = random.choice([0.2,0.4,0.6,0.8]) #args.value_loss_coef =random.choice([0.4,0.5,0.6,0.8]) #args.lr = random.choice([0.00001,0.0001,0.00005,0.0005]) args.num_steps = 2048 args.num_mini_batch = 64 args.ppo_epoch = 8 args.lr = 0.0001 args.hidden_size = 64 args.last_hidden_size = args.hidden_size # filesNamesSuffix = args.filesNamesSuffix makeEnvFunction = makeEnv.make_env_with_best_settings_for_all ''' num_steps: 1024 num_mini_batch 64 ppo_epoch 2 clip_param: 0.2 value_loss_coef 0.6 lr 0.0001 ''' if trainType == 15: args.num_env_steps = 5000000 filesNamesSuffix = "zigote_updown_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_train_analytic if trainType == 16: args.lr = 0.00001 filesNamesSuffix = "compound_tasks_" makeEnvFunction = make_env_multinetwork reward_shaper = DefaultRewardsShaper(scale_value=reward_shaping) print("ActionType ", trainType, " ", filesNamesSuffix, "seed", args.seed, "num env steps:", args.num_env_steps, " tasks_dif", args.tasks_difficulty_from, args.tasks_difficulty_to) print("Num processes:", args.num_processes) print("num_steps:", args.num_steps, "num_mini_batch", args.num_mini_batch, "ppo_epoch", args.ppo_epoch) print("clip_param:", args.clip_param, "value_loss_coef", args.value_loss_coef, "lr", args.lr) random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True args.log_dir = "/tmp/tensorboard/" #TesnorboardX writer = SummaryWriter(log_dir=args.log_dir + 'runs/{}_PPO_{}_{}'.format( datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name, "ppo")) writer.add_scalar('options/num_steps', args.num_steps, 0) writer.add_scalar('options/num_mini_batch', args.num_mini_batch, 0) writer.add_scalar('options/ppo_epoch', args.ppo_epoch, 0) writer.add_scalar('options/clip_param', args.clip_param, 0) writer.add_scalar('options/value_loss_coef', args.value_loss_coef, 0) writer.add_scalar('options/lr', args.lr, 0) device = torch.device("cuda:0" if 
args.cuda else "cpu") torch.set_num_threads(1) load_dir = os.path.join(args.load_dir, args.algo) multiNetworkName = ["frontback_", "all_", "leftright_", "rotate_"] if trainType == 8: for net in multiNetworkName: bestFilename = os.path.join( load_dir, "{}_{}{}_best.pt".format(args.env_name, net, args.hidden_size)) ac, _ = torch.load(bestFilename) policies.append(PPOPlayer(ac, device)) print("Policy multi loaded: ", bestFilename) multiNetworkName2 = [ "all_bytasks_0_", "all_bytasks_1_", "all_bytasks_2_", "all_bytasks_3_", "all_bytasks_4_", "all_bytasks_5_", "all_bytasks_6_", "all_bytasks_7_", "all_bytasks_8_", "all_bytasks_9_", "all_bytasks_10_", "all_bytasks_11_", "all_bytasks_12_", ] if trainType == 16: for net in multiNetworkName2: bestFilename = os.path.join( load_dir, "{}_{}{}_best.pt".format(args.env_name, net, args.hidden_size)) ac, _ = torch.load(bestFilename) policies.append(PPOPlayer(ac, device)) print("Policy multi loaded: ", bestFilename) envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, None, device, False, normalizeOb=False, normalizeReturns=False, max_episode_steps=args.num_steps, makeEnvFunc=makeEnvFunction, num_frame_stack=1, info_keywords=( 'episode_steps', 'episode_reward', 'progress', 'servo', 'distToTarget', )) #print(envs.observation_space.shape,envs.action_space) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={ 'recurrent': args.recurrent_policy, 'hidden_size': args.hidden_size, 'last_hidden_size': args.last_hidden_size, 'activation_layers_type': "Tanh" }) ''' # if args.load_dir not None: load_path = os.path.join(args.load_dir, args.algo) actor_critic, ob_rms = torch.load(os.path.join(load_path, args.env_name + ".pt")) ''' load_path = os.path.join( load_dir, "{}_{}{}_best.pt".format(args.env_name, filesNamesSuffix, args.hidden_size)) #load_path = os.path.join(load_path, "{}_{}{}.pt".format(args.env_name,filesNamesSuffix,args.hidden_size)) preptrained_path = "../Train/trained_models/QuadruppedWalk-v1/Train_QuadruppedWalk-v1_256.pth" loadPretrained = False if loadPretrained and os.path.isfile(preptrained_path): print("Load preptrained") abj = torch.load(preptrained_path) print(abj) print(actor_critic.base) actor_critic.base.load_state_dict() actor_critic.base.eval() if os.path.isfile(load_path) and not loadPretrained: actor_critic, ob_rms = torch.load(load_path) actor_critic.eval() print("----NN loaded: ", load_path, " -----") else: bestFilename = os.path.join( load_dir, "{}_{}{}_best_pretrain.pt".format(args.env_name, filesNamesSuffix, args.hidden_size)) if os.path.isfile(bestFilename): actor_critic, ob_rms = torch.load(bestFilename) actor_critic.eval() print("----NN loaded: ", bestFilename, " -----") maxReward = -10000.0 maxSteps = 0 minDistance = 50000.0 actor_critic.to(device) agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) gail_train_loader = torch.utils.data.DataLoader( gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20), batch_size=args.gail_batch_size, shuffle=True, drop_last=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, 
envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) deque_maxLen = 10 episode_rewards = deque(maxlen=deque_maxLen) episode_steps = deque(maxlen=deque_maxLen) episode_rewards_alive = deque(maxlen=deque_maxLen) episode_rewards_progress = deque(maxlen=deque_maxLen) episode_rewards_servo = deque(maxlen=deque_maxLen) episode_dist_to_target = deque(maxlen=deque_maxLen) ''' load_path = os.path.join(args.load_dir, args.algo) load_path = os.path.join(load_path, args.env_name + ".pt") actor_critic, ob_rms = torch.load(load_path) actor_critic.to(device) actor_critic.eval() #ob_rms.eval() ''' ''' args.use_gym_monitor = 1 args.monitor_dir = "./results/" monitor_path = os.path.join(args.monitor_dir, args.algo) monitor_path = os.path.join(monitor_path, args.env_name) args. if args.use_gym_monitor: env = wrappers.Monitor( env, monitor_path, video_callable=False, force=True) ''' i_episode = 0 save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass trainOnSamplesAndExit = False #False if trainOnSamplesAndExit: import pickle print("---------------------------------------") print("Samples preload") data = pickle.load(open("./QuadruppedWalk-v1_UpDown.samples", "rb")) #data = pickle.load( open( "../QuadruppedWalk-v1_NN.samples", "rb" ) ) learning_rate = 0.0001 max_episodes = 100 max_timesteps = 4000 betas = (0.9, 0.999) log_interval = 1 envSamples = SamplesEnv(data) envSamples.numSteps = max_timesteps # create a stochastic gradient descent optimizer optimizer = torch.optim.Adam(actor_critic.base.actor.parameters(), lr=learning_rate, betas=betas) #optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9) # create a loss function criterion = nn.MSELoss(reduction="sum") # run the main training loop for epoch in range(max_episodes): state = envSamples.reset() time_step = 0 testReward = 0 testSteps = 0 loss_sum = 0 loss_max = 0 for t in range(max_timesteps): time_step += 1 nn_state = torch.FloatTensor((state).reshape(1, -1)).to(device) optimizer.zero_grad() net_out = actor_critic.base.forwardActor(nn_state) net_out = actor_critic.dist.fc_mean(net_out) state, reward, done, info = envSamples.step( net_out.detach().numpy()) sim_action = envSamples.recordedActions sim_action_t = torch.FloatTensor([sim_action]).to(device) loss = criterion(net_out, sim_action_t) loss.backward() optimizer.step() loss_sum += loss.mean() loss_max = max(loss_max, loss.max()) testReward += reward testSteps += 1 if done: if epoch % log_interval == 0: #print(best_action_t*scaleActions-net_out*scaleActions) if args.verboseLevel > 0: print( 'Train Episode: {} t:{} Reward:{} Loss: mean:{:.6f} max: {:.6f}' .format(epoch, t, testReward, loss_sum / t, loss_max)) print(info) reward = 0 break bestFilename = os.path.join( save_path, "{}_{}{}_best_pretrain.pt".format(args.env_name, filesNamesSuffix, args.hidden_size)) torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], bestFilename) exit(0) skipWriteBest = True if args.verboseLevel > 0: printNetwork(actor_critic.base.actor) lock(actor_critic, first=False, last=False) #if trainType==9: #allowMutate = False #lock(actor_critic,first=True,last=False) #mutate(actor_critic,power=0.00,powerLast=0.3) if args.verboseLevel > 0: printNetwork(actor_critic.base.actor) #from torchsummary import summary #summary(actor_critic.base.actor, (1, 48, 64)) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes 
episodeBucketIndex = 0 maxReward = -10000000000 numEval = 10 if realEval: envEval = makeEnvFunction(args.env_name) if hasattr(envEval.env, "tasks") and len(envEval.env.tasks): numEval = max(numEval, len(envEval.env.tasks)) maxReward = evaluate_policy(envEval, actor_critic, numEval * 2, render=False, device=device, verbose=args.verboseLevel) print("MaxReward on start", maxReward) noMaxRewardCount = 0 updateIndex = 0 for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) episode_r = 0.0 stepsDone = 0 for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) #envs.venv.venv.venv.envs[0].render() if args.verboseLevel > 0: index = 0 for d in done: if d: print(infos[index], flush=True) index += 1 episodeDone = False ''' index = 0 for d in done: if d: print("") print(infos[index]) index+=1 ''' for info in infos: if 'reward' in info.keys(): episodeDone = True i_episode += 1 episode_rewards.append(info['reward']) writer.add_scalar('reward/episode', info['reward'], i_episode) #print("E:",i_episode," T:",info['episode_steps'], " R:", info['episode_reward'], " D:",info['distToTarget']) if 'steps' in info.keys(): episode_steps.append(info['steps']) writer.add_scalar('reward/steps', info['steps'], i_episode) if 'alive' in info.keys(): episode_rewards_alive.append(info['alive']) writer.add_scalar('reward/alive', info['alive'], i_episode) if 'prog' in info.keys(): episode_rewards_progress.append(info['prog']) writer.add_scalar('reward/progress', info['prog'], i_episode) if 'servo' in info.keys(): episode_rewards_servo.append(info['servo']) writer.add_scalar('reward/servo', info['servo'], i_episode) if 'd2T' in info.keys(): episode_dist_to_target.append(info['d2T']) writer.add_scalar('reward/distToTarget', info['d2T'], i_episode) for val in info.keys(): if val not in [ "reward", "steps", "alive", "prog", "servo", "d2T", 'epos', 't' ]: writer.add_scalar('reward/' + val, info[val], i_episode) #if episodeDone and i_episode%10==0: # print(i_episode,"({:.1f}/{}/{:.2f}) ".format(episode_rewards[-1],episode_steps[-1],episode_dist_to_target[-1]),end='',flush=True) if episodeDone: episodeBucketIndex += 1 if args.verboseLevel > 0: print("Mean:", Fore.WHITE, np.mean(episode_rewards), Style.RESET_ALL, " Median:", Fore.WHITE, np.median(episode_rewards), Style.RESET_ALL, " max reward:", maxReward) #'''len(episode_rewards) and np.mean(episode_rewards)>maxReward and''' if realEval: if episodeBucketIndex % args.log_interval == 0 and episodeBucketIndex > args.log_interval: print("Step:", (j + 1) * args.num_processes * args.num_steps) if skipWriteBest == False: evalReward = evaluate_policy( envEval, actor_critic, numEval, device=device, verbose=args.verboseLevel) writer.add_scalar('reward/eval', evalReward, i_episode) if evalReward > maxReward: maxReward = evalReward #maxReward = np.mean(episode_rewards) bestFilename = os.path.join( save_path, "{}_{}{}_best.pt".format( args.env_name, filesNamesSuffix, args.hidden_size)) print( "Writing best reward:", Fore.GREEN, "({:.1f}/{:.1f}/{:.1f}/{}/{:.2f}) ".format( maxReward, np.mean(episode_rewards), np.median(episode_rewards), np.mean(episode_steps), episode_dist_to_target[-1]), 
Style.RESET_ALL, bestFilename) torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], bestFilename) noMaxRewardCount = 0 else: noMaxRewardCount += 1 if allowMutate: if noMaxRewardCount == 5: print("Mutation low last layer") lock(actor_critic, first=False, last=False) mutate(actor_critic, power=0.00, powerLast=0.01) if noMaxRewardCount == 8: print("Mutation low non last") lock(actor_critic, first=False, last=False) mutate(actor_critic, power=0.01, powerLast=0.0) if noMaxRewardCount == 11: print("Mutation low all") lock(actor_critic, first=False, last=False) mutate(actor_critic, power=0.02, powerLast=0.2) if noMaxRewardCount == 14: print("Mutation hi all") lock(actor_critic, first=False, last=False) mutate(actor_critic, power=0.03, powerLast=0.03) noMaxRewardCount = 0 if noMaxRewardCount == args.nobest_exit: exit(0) else: skipWriteBest = False else: if len(episode_rewards) and np.mean( episode_rewards ) > maxReward and j > args.log_interval: if skipWriteBest == False: maxReward = np.mean(episode_rewards) writer.add_scalar('reward/maxReward', maxReward, i_episode) bestFilename = os.path.join( save_path, "{}_{}{}_best.pt".format( args.env_name, filesNamesSuffix, args.hidden_size)) if len(episode_dist_to_target): print( "Writing best reward:", Fore.GREEN, "({:.1f}/{:.1f}/{}/{:.2f}) ".format( np.mean(episode_rewards), np.median(episode_rewards), np.mean(episode_steps), episode_dist_to_target[-1]), Style.RESET_ALL, bestFilename) else: print( "Writing best reward:", Fore.GREEN, "({:.1f}/{:.1f}/{}) ".format( np.mean(episode_rewards), np.median(episode_rewards), np.mean(episode_steps)), Style.RESET_ALL, bestFilename) torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], bestFilename) else: skipWriteBest = False # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) shaped_reward = reward_shaper(reward) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, shaped_reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) writer.add_scalar('reward/value_loss', value_loss, updateIndex) writer.add_scalar('reward/action_loss', action_loss, updateIndex) writer.add_scalar('reward/dist_entropy', dist_entropy, updateIndex) updateIndex += 1 rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": ''' fileName = os.path.join(save_path, "{}_{}{}.pt".format(args.env_name,filesNamesSuffix,args.hidden_size)) torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], fileName) print("Saved:",fileName, " cur avg rewards:",np.mean(episode_rewards)) fileName = os.path.join(save_path, "{}_{}{}_actor.pt".format(args.env_name,filesNamesSuffix,args.hidden_size)) torch.save(actor_critic.state_dict, fileName) print("Saved:",fileName) ''' if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() if args.verboseLevel > 0: print("") print("Updates {}, num timesteps {}, FPS {}".format( j, total_num_steps, int(total_num_steps / (end - start)))) print(" Last {} training episodes:".format( len(episode_rewards))) print( " reward mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}". format(np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards))) print(" steps mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}". 
format(np.mean(episode_steps), np.median(episode_steps), np.min(episode_steps), np.max(episode_steps))) if len(episode_rewards_alive): print( " alive mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}" .format(np.mean(episode_rewards_alive), np.median(episode_rewards_alive), np.min(episode_rewards_alive), np.max(episode_rewards_alive))) if len(episode_rewards_progress): print( " progress mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}" .format(np.mean(episode_rewards_progress), np.median(episode_rewards_progress), np.min(episode_rewards_progress), np.max(episode_rewards_progress))) if len(episode_rewards_servo): print( " servo mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}" .format(np.mean(episode_rewards_servo), np.median(episode_rewards_servo), np.min(episode_rewards_servo), np.max(episode_rewards_servo))) if len(episode_dist_to_target): print( " dist to target mean/median {:.3f}/{:.3f} min/max {:.3f}/{:.3f}" .format(np.mean(episode_dist_to_target), np.median(episode_dist_to_target), np.min(episode_dist_to_target), np.max(episode_dist_to_target))) print( " Reward/Steps {:.3f} Progress/Steps: {:.3f} entropy {:.1f} value_loss {:.5f} action_loss {:.5f}\n" .format( np.mean(episode_rewards) / np.mean(episode_steps), (0 if len(episode_rewards_progress) == 0 else np.mean(episode_rewards_progress) / np.mean(episode_steps)), dist_entropy, value_loss, action_loss))
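# ---------------------------------------------------------------------------
# Note: the `lock` and `mutate` helpers called in the stagnation branch above
# (when noMaxRewardCount hits its thresholds) are defined elsewhere.  A minimal
# sketch of such Gaussian weight perturbation, assuming the intent is "jitter
# the policy when no new best reward appears", could look like the function
# below; the name, the last-layer split and the noise scales are assumptions,
# not the original implementation.
import torch


def mutate(actor_critic, power=0.01, powerLast=0.01):
    """Add zero-mean Gaussian noise to policy weights; the last layer gets its own scale."""
    params = list(actor_critic.parameters())
    with torch.no_grad():
        for i, p in enumerate(params):
            # treat the final weight/bias pair as the "last layer"
            scale = powerLast if i >= len(params) - 2 else power
            if scale > 0:
                p.add_(torch.randn_like(p) * scale)
# ---------------------------------------------------------------------------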
def run(self): args = self.args torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) print("CUDA is available: ", torch.cuda.is_available()) if args.cuda: print("CUDA enabled") torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True else: if args.cuda_deterministic: print("Warning CUDA is requested but is not available") else: print("CUDA disabled") log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) print("get_num_thread", torch.get_num_threads()) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, self.config_parameters, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) actor_critic = create_IAM_model(envs, args, self.config_parameters) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) # This algorithm should be used for the reproduction project. elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) # Always return the average of the last 100 steps. This means the average is sampled. episode_rewards = deque(maxlen=100) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'obs_rms', None) ], os.path.join(save_path, self.model_file_name)) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() elapsed_time = end - start data = [ j, # Updates total_num_steps, # timesteps int(total_num_steps / elapsed_time), # FPS len(episode_rewards), # Only useful for print statement np.mean(episode_rewards), # mean of rewards np.median(episode_rewards), # median of rewards np.min(episode_rewards), # min rewards np.max(episode_rewards), # max rewards dist_entropy, value_loss, action_loss, elapsed_time ] output = ''.join([str(x) + ',' for x in data]) self.data_saver.append(output) print( f"Updates {data[0]}, num timesteps {data[1]}, FPS {data[2]}, elapsed time {int(data[11])} sec. Last {data[3]} training episodes: mean/median reward {data[4]:.2f}/{data[5]:.2f}, min/max reward {data[6]:.1f}/{data[7]:.1f}", end="\r") if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): obs_rms = utils.get_vec_normalize(envs).obs_rms evaluate(actor_critic, obs_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
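# ---------------------------------------------------------------------------
# For reference: the linear learning-rate decay invoked through
# utils.update_linear_schedule above is, in a2c-ppo-acktr-style code bases,
# usually the simple anneal sketched below.  Treat this as an assumption about
# the helper, not a quote of this repository's utils module.
def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    """Decrease the optimizer's learning rate linearly from initial_lr to 0 over training."""
    lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
# ---------------------------------------------------------------------------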
def main(): args = get_args() toke = tokenizer() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() tobs = torch.zeros((args.num_processes, trace_size), dtype=torch.long) #print (tobs.dtype) rollouts.obs[0].copy_(obs) rollouts.tobs[0].copy_(tobs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.tobs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) tobs = [] envs.render() for info in infos: if 'episode' in info.keys(): #print ("episode ", info['episode']) episode_rewards.append(info['episode']['r']) trace = [x.inst for x in info['trace']] trace = trace[0:trace_size] word_to_ix = toke.tokenize(trace) seq = prepare_sequence(trace, word_to_ix) if len(seq) < trace_size: seq = torch.zeros((trace_size), dtype=torch.long) seq = seq[:trace_size] #print (seq.dtype) tobs.append(seq) tobs = torch.stack(tobs) #print (tobs) #print (tobs.size()) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, tobs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.tobs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
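# ---------------------------------------------------------------------------
# The trace handling in the loop above relies on an external tokenizer and on
# prepare_sequence.  A minimal sketch of encoding an instruction trace into the
# fixed-length LongTensor that the rollout storage expects is given below; the
# helper names and the zero-padding of short traces are assumptions (the code
# above simply replaces short traces with an all-zero tensor).
import torch


def prepare_sequence(trace, word_to_ix):
    """Map a list of instruction tokens to a LongTensor of vocabulary indices."""
    return torch.tensor([word_to_ix.get(tok, 0) for tok in trace], dtype=torch.long)


def encode_trace(trace, word_to_ix, trace_size):
    """Truncate or zero-pad a tokenized trace to exactly trace_size indices."""
    seq = prepare_sequence(trace[:trace_size], word_to_ix)
    if len(seq) < trace_size:
        padded = torch.zeros(trace_size, dtype=torch.long)
        padded[:len(seq)] = seq
        seq = padded
    return seq
# ---------------------------------------------------------------------------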
def onpolicy_main(): print("onpolicy main") torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") summary_name = args.log_dir + '{0}_{1}' writer = SummaryWriter(summary_name.format(args.env_name, args.save_name)) # Make vector env envs = make_vec_envs( args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, env_kwargs=env_kwargs, ) # agly ways to access to the environment attirubutes if args.env_name.find('doorenv') > -1: if args.num_processes > 1: visionnet_input = envs.venv.venv.visionnet_input nn = envs.venv.venv.nn env_name = envs.venv.venv.xml_path else: visionnet_input = envs.venv.venv.envs[ 0].env.env.env.visionnet_input nn = envs.venv.venv.envs[0].env.env.env.nn env_name = envs.venv.venv.envs[0].env.env.env.xml_path dummy_obs = np.zeros(nn * 2 + 3) else: dummy_obs = envs.observation_space visionnet_input = None nn = None if pretrained_policy_load: print("loading", pretrained_policy_load) actor_critic, ob_rms = torch.load(pretrained_policy_load) else: actor_critic = Policy(dummy_obs.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) if visionnet_input: visionmodel = load_visionmodel(env_name, args.visionmodel_path, VisionModelXYZ()) actor_critic.visionmodel = visionmodel.eval() actor_critic.nn = nn actor_critic.to(device) #disable normalizer vec_norm = get_vec_normalize(envs) vec_norm.eval() if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(args.num_steps, args.num_processes, dummy_obs.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) full_obs = envs.reset() initial_state = full_obs[:, :envs.action_space.shape[0]] if args.env_name.find('doorenv') > -1 and visionnet_input: obs = actor_critic.obs2inputs(full_obs, 0) else: if knob_noisy: obs = add_noise(full_obs, 0) else: obs = full_obs rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule(agent.optimizer, j, num_updates, args.lr) pos_control = False total_switches = 0 prev_selection = "" for step in range(args.num_steps): with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) next_action = action if pos_control: frame_skip = 2 if step % (512 / frame_skip - 1) == 0: current_state = initial_state next_action = current_state + next_action for kk in range(frame_skip): full_obs, reward, done, infos = envs.step(next_action) current_state = full_obs[:, :envs.action_space.shape[0]] else: full_obs, reward, done, infos = envs.step(next_action) # convert img to obs if door_env and using visionnet if args.env_name.find('doorenv') > -1 and visionnet_input: obs = actor_critic.obs2inputs(full_obs, j) else: if 
knob_noisy: obs = add_noise(full_obs, j) else: obs = full_obs for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() writer.add_scalar("Value loss", value_loss, j) writer.add_scalar("action loss", action_loss, j) writer.add_scalar("dist entropy loss", dist_entropy, j) writer.add_scalar("Episode rewards", np.mean(episode_rewards), j) # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join( save_path, args.env_name + "_{}.{}.pt".format(args.save_name, j))) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device) DR = False # True #Domain Randomization ################## for multiprocess world change ###################### if DR: print("changing world") envs.close_extras() envs.close() del envs envs = make_vec_envs( args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, env_kwargs=env_kwargs, ) full_obs = envs.reset() if args.env_name.find('doorenv') > -1 and visionnet_input: obs = actor_critic.obs2inputs(full_obs, j) else: obs = full_obs
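# ---------------------------------------------------------------------------
# add_noise (used when knob_noisy is set) is defined elsewhere.  One plausible
# form, sketched here purely for illustration, is zero-mean Gaussian noise on
# the observation to mimic imperfect knob/joint sensing; the fixed scale and
# the unused update index are assumptions, not the original helper.
import torch


def add_noise(obs, update_idx, std=0.02):
    """Return obs with Gaussian observation noise added; update_idx could drive a schedule."""
    return obs + torch.randn_like(obs) * std
# ---------------------------------------------------------------------------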
def train_maml_like_ppo_( init_model, args, learning_rate, num_episodes=20, num_updates=1, vis=False, run_idx=0, use_linear_lr_decay=False, ): num_steps = num_episodes * 100 torch.set_num_threads(1) device = torch.device("cpu") envs = make_vec_envs(ENV_NAME, seeding.create_seed(None), NUM_PROC, args.gamma, None, device, allow_early_resets=True, normalize=args.norm_vectors) raw_env = navigation_2d.unpeele_navigation_env(envs, 0) # raw_env.set_arguments(args.rm_nogo, args.reduce_goals, True, args.large_nogos) new_task = raw_env.sample_tasks(run_idx) raw_env.reset_task(new_task[0]) # actor_critic = Policy( # envs.observation_space.shape, # envs.action_space, # base_kwargs={'recurrent': args.recurrent_policy}) actor_critic = copy.deepcopy(init_model) actor_critic.to(device) agent = algo.PPO( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=learning_rate, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(num_steps, NUM_PROC, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) fitnesses = [] for j in range(num_updates): # if args.use_linear_lr_decay: # # decrease learning rate linearly # utils.update_linear_schedule( # agent.optimizer, j, num_updates, # agent.optimizer.lr if args.algo == "acktr" else args.lr) min_c_rew = float("inf") vis = [] offending = [] for step in range(num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) if done[0]: c_rew = infos[0]["cummulative_reward"] vis.append((infos[0]['path'], infos[0]['goal'])) offending.extend(infos[0]['offending']) if c_rew < min_c_rew: min_c_rew = c_rew # If done then clean the history of observations. masks = torch.FloatTensor( [[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() ob_rms = utils.get_vec_normalize(envs) if ob_rms is not None: ob_rms = ob_rms.ob_rms fits, info = evaluate(actor_critic, ob_rms, envs, NUM_PROC, device) print(f"fitness {fits} update {j+1}") if (j+1) % 1 == 0: vis_path(vis, eval_path_rec=info['path'], offending=offending) fitnesses.append(fits) return fitnesses[-1], info[0]['reached'], None
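# ---------------------------------------------------------------------------
# A sketch of how train_maml_like_ppo_ might be driven from an outer loop:
# fine-tune copies of the same initial policy on several sampled tasks and
# average the resulting post-adaptation fitness.  The outer loop below is an
# illustrative assumption, not part of this file.
def evaluate_init_model(init_model, args, learning_rate=3e-4, n_tasks=5):
    """Average post-adaptation fitness of init_model over n_tasks sampled navigation tasks."""
    fitnesses = []
    for run_idx in range(n_tasks):
        fit, reached, _ = train_maml_like_ppo_(init_model, args, learning_rate,
                                               num_episodes=20, num_updates=1,
                                               run_idx=run_idx)
        fitnesses.append(fit)
    return sum(fitnesses) / len(fitnesses)
# ---------------------------------------------------------------------------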
def inner_loop_ppo( weights, args, learning_rate, num_steps, num_updates, run_idx, input_envs, ): torch.set_num_threads(1) device = torch.device("cpu") #print(input_envs.venv.spec._kwargs['config']['goal_locations']) #env_name = register_set_goal(run_idx) #envs = make_vec_envs(env_name, np.random.randint(2**32), NUM_PROC, # args.gamma, None, device, allow_early_resets=True, normalize=args.norm_vectors) actor_critic = init_ppo(input_envs, log(args.init_sigma)) actor_critic.to(device) # apply the weights to the model apply_from_list(weights, actor_critic) agent = algo.PPO( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=learning_rate, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(num_steps, NUM_PROC, input_envs.observation_space.shape, input_envs.action_space, actor_critic.recurrent_hidden_state_size) obs = input_envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) fitnesses = [] violation_cost = 0 for j in range(num_updates): episode_step_counter = 0 for step in range(num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states, (final_action, _) = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = input_envs.step(final_action) episode_step_counter += 1 # Count the cost total_reward = reward for info in infos: violation_cost += info['cost'] total_reward -= info['cost'] # If done then clean the history of observations. masks = torch.FloatTensor( [[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, total_reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() ob_rms = utils.get_vec_normalize(input_envs) if ob_rms is not None: ob_rms = ob_rms.ob_rms fits, info = evaluate(actor_critic, ob_rms, input_envs, NUM_PROC, device) fitnesses.append(fits) return (fitnesses[-1]), 0, 0
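# ---------------------------------------------------------------------------
# apply_from_list, used above to load an externally supplied weight vector into
# the policy before the PPO inner loop, is defined elsewhere.  A minimal sketch
# of the assumed semantics (copy a list of tensors into the model parameters in
# order; shapes must already match) is:
import torch


def apply_from_list(weights, actor_critic):
    """Overwrite the policy parameters, in order, with the given list of weight tensors."""
    with torch.no_grad():
        for param, new_value in zip(actor_critic.parameters(), weights):
            param.copy_(torch.as_tensor(new_value, dtype=param.dtype))
# ---------------------------------------------------------------------------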