def main(): print("config:\n") print("activation:", args.activation) print("evaluation:", args.evaluation) print("evaluation mode:", args.evaluation_mode) print("evaluation layer:", args.evaluation_layer) writer = SummaryWriter() torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}, activation = args.activation, modulation = args.evaluation) # load trained model if args.load_model_path != None: state_dicts = torch.load(args.load_model_path) actor_critic.load_nets(state_dicts) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) # elif args.algo == 'ppo': # agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, # args.value_loss_coef, args.entropy_coef, lr=args.lr, # eps=args.eps, # max_grad_norm=args.max_grad_norm) # elif args.algo == 'acktr': # agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, # args.entropy_coef, acktr=True) tonic_g = 1 phasic_g = 1 if args.evaluation and args.evaluation_layer == 1: # f1 modulation tonic_g = args.f1_tonic_g phasic_g = args.f1_phasic_g if args.evaluation and args.evaluation_layer == 0: # input activation tonic_g = args.input_tonic_g phasic_g = args.input_phasic_g g = torch.ones(args.num_processes,1)*tonic_g g_device = (torch.ones(args.num_processes,1)*tonic_g).to(device) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, tonic_g) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() pre_value = [None for i in range(args.num_processes)] evaluations = [0 for i in range(args.num_processes)] ## to calculate next_value and update g next_recurrent_hidden_states = torch.zeros(args.num_processes, actor_critic.recurrent_hidden_state_size).to(device) next_g = torch.zeros(args.num_processes,1).to(device) next_masks = torch.zeros(args.num_processes,1).to(device) next_obs = torch.zeros(args.num_processes, *envs.observation_space.shape).to(device) for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.g[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) # calculate next value with old g and decide new g if args.evaluation: if args.evaluation_layer == 0: next_obs.copy_(neural_activity(obs,g_device)) else: next_obs.copy_(obs/255) next_recurrent_hidden_states.copy_(recurrent_hidden_states) next_g.copy_(g) next_masks.copy_(masks) with torch.no_grad(): next_value = actor_critic.get_value(next_obs, next_g, next_recurrent_hidden_states, next_masks).detach() evaluations, g, pre_value = calc_modes(reward, next_value, pre_value, evaluations, args.evaluation_mode, tonic_g, phasic_g, masks) g_device.copy_(g) # observation processing with new g if args.evaluation and args.evaluation_layer == 0: obs = neural_activity(obs, g_device) else: obs = obs/255.0 for idx in range(len(infos)): info = infos[idx] if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) steps_done = j*args.num_steps*args.num_processes + step*args.num_processes + idx writer.add_scalar('data/reward', info['episode']['r'], steps_done ) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, g) # record evaluation value to help decide parameters to switch modes if args.evaluation_log: writer.add_scalar('data/evaluations', evaluations[0], j*args.num_steps*args.num_processes + step*args.num_processes) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.g[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass state_dicts = actor_critic.save_nets() torch.save(state_dicts, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps writer.export_scalars_to_json("./all_scalars.json") writer.close()
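# NOTE: `neural_activity` and `calc_modes` are imported from elsewhere and are not defined
# in this snippet. The sketch below is only a hypothetical placeholder that matches the call
# signature and tensor shapes used above (per-process gain g of shape (num_processes, 1)
# applied to normalized frames); the actual gain function used in the experiments may differ.
def neural_activity(obs, g):
    # obs: raw frames of shape (num_processes, C, H, W); g: gain of shape (num_processes, 1)
    return (obs.float() / 255.0) * g.view(-1, 1, 1, 1)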
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma,
                         args.log_dir, args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef,
                               lr=args.lr, eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob,
                            value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [save_model, getattr(get_vec_normalize(envs), 'ob_rms', None)]
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: "
                "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps, int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss, action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes,
                                      args.num_processes, args.gamma, eval_log_dir,
                                      args.add_timestep, device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes, actor_critic.recurrent_hidden_state_size, device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo,
                                  args.num_frames)
            except IOError:
                pass
def main():
    print('Preparing parameters')
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    print('Creating envs: {}'.format(args.env_name))
    envs = test_mp_envs(args.env_name, args.num_processes)

    print('Creating network')
    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    print('Initializing PPO')
    agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                     args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    print('Memory')
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = []
    num_episodes = [0 for _ in range(args.num_processes)]
    last_index = 0

    print('Starting!')
    start = time.time()
    for j in tqdm(range(num_updates)):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step], rollouts.masks[step])

            obs, reward, done, infos = envs.step(action)

            for info_num, info in enumerate(infos):
                if info_num == 0:
                    if 'episode' in info.keys():
                        episode_rewards.append(info['episode']['r'])
                        # end_episode_to_viz(writer, info, info_num, num_episodes[info_num])
                        num_episodes[info_num] += 1
                        plot_rewards(episode_rewards, args)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            rollouts.insert(obs, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        losses = agent.update(rollouts)
        rollouts.after_update()
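# NOTE: `test_mp_envs` and `plot_rewards` are project-specific helpers that are not shown
# here. A minimal stand-in for `plot_rewards` that dumps the reward curve to a PNG could
# look like the following (the file name and the use of args.env_name are assumptions,
# not the project's actual implementation).
import matplotlib
matplotlib.use('Agg')  # headless backend so training machines don't need a display
import matplotlib.pyplot as plt

def plot_rewards(episode_rewards, args):
    plt.figure()
    plt.plot(episode_rewards)
    plt.xlabel('episode')
    plt.ylabel('episode reward')
    plt.title(args.env_name)
    plt.savefig('rewards_{}.png'.format(args.env_name))
    plt.close()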
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma,
                         args.log_dir, args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    average_actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                                  base_kwargs={'recurrent': args.recurrent_policy})
    average_actor_critic.load_state_dict(actor_critic.state_dict())
    actor_critic.to(device)
    average_actor_critic.to(device)

    agent = algo.ACER_AGENT(actor_critic, average_actor_critic, args.value_loss_coef,
                            args.entropy_coef, args.gamma, args.clip, args.no_trust_region,
                            args.alpha, args.delta, lr=args.lr, eps=args.eps,
                            rms_alpha=args.rms_alpha, max_grad_norm=args.max_grad_norm)

    buffer = Buffer(args.num_steps, args.num_processes, envs.observation_space.shape,
                    envs.action_space, actor_critic.recurrent_hidden_state_size,
                    args.buffer_size)
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    off_rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  envs.observation_space.shape, envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    off_rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    acer = algo.ACER(actor_critic, rollouts, off_rollouts, buffer, episode_rewards,
                     agent, envs)

    start = time.time()
    for j in range(num_updates):
        # On-policy ACER
        value_loss, action_loss, dist_entropy = acer.call(on_policy=True)

        if args.replay_ratio > 0 and buffer.has_atleast(args.replay_start):
            # Off-policy ACER
            n = np.random.poisson(args.replay_ratio)
            for _ in range(n):
                acer.call(on_policy=False)

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [save_model, getattr(get_vec_normalize(envs), 'ob_rms', None)]
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \nLast {} training episodes: "
                "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                "dist_entropy {:.1f}, value/action loss {:.1f}/{:.1f}\n"
                .format(j, total_num_steps, int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss, action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes,
                                      args.num_processes, args.gamma, eval_log_dir,
                                      args.add_timestep, device, True)

            eval_episode_rewards = []

            obs = eval_envs.reset().to(device)
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes, actor_critic.recurrent_hidden_state_size, device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, _, _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)

                # Observe reward and next obs
                obs, _, done, infos = eval_envs.step(action)
                obs = obs.to(device)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done]).to(device)
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))
def main():
    args = get_args()
    args.num_processes = 16
    args.env_name = 'BreakoutNoFrameskip-v4'

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma,
                         args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef,
                          lr=args.lr, eps=args.eps, alpha=args.alpha,
                          max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()

    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor([[0.0] if 'bad_transition' in info.keys() else [1.0]
                                           for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob,
                            value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda,
                                 args.use_proper_time_limits)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: "
                "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps, int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss, action_loss))
def main():
    print('Preparing parameters')
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # print('Initializing visdom')
    # if args.vis:
    #     from visdom import Visdom
    #     viz = Visdom(port=args.port)
    #     win = None

    print('Creating envs')
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma,
                         args.log_dir, args.add_timestep, device, False)

    print('Creating network')
    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    print('Initializing PPO')
    agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                     args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    print('Memory')
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    # ===================== TB visualisation =================
    writer = SummaryWriter()
    last_index = 0

    print('Starting!')
    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step], rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            rollouts.insert(obs, action, action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        writer.add_scalar('Agents metrics/Policy loss', action_loss, j)
        writer.add_scalar('Agents metrics/Value loss', value_loss, j)
        writer.add_scalar('Agents metrics/Entropy loss', dist_entropy, j)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [save_model,
                          hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None]
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: "
                "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps, int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss, action_loss))

        if j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                # win, tx, ty = visdom_plot(viz, win, args.log_dir, args.env_name,
                #                           args.algo, args.num_frames)
                tx, ty = get_reward_log(args.log_dir)
                if tx is not None and ty is not None:
                    max_index = len(tx)
                    for ind_iter in range(last_index, max_index):
                        writer.add_scalar('Reward', ty[ind_iter], tx[ind_iter])
                    last_index = max_index
                # tx, ty = get_reward_log(viz, win, args.log_dir, args.env_name,
                #                         args.algo, args.num_frames)
                # if tx != None and ty != None:
                #     plt.cla()
                #     plt.plot(tx, ty)
                #     plt.pause(0.1)
                #     plt.show()
                # if (ty != None and tx != None):
                #     input(ty)
                #     writer.add_scalar('Reward', ty[-1], tx[-1])
                # if (tx != None and ty != None):
                #     plt.cla()
                #     plt.plot(tx, ty)
                #     plt.pause(0.1)
            except IOError:
                pass
def main(args):
    env = GymEnvironment(args, gamma)
    env.env = env.env.unwrapped

    actor_critic = Policy(obs_shape, env.action_size, base_kwargs={'recurrent': False})
    actor_critic.load_state_dict(torch.load('log/model.pt'))
    actor_critic.to(device)

    agent = PPO(actor_critic, clip_param, ppo_epoch, num_mini_batch, value_loss_coef,
                entropy_coef, lr, eps, max_grad_norm)

    rollouts = RolloutStorage(num_steps, num_processes, obs_shape, env.action_space,
                              actor_critic.recurrent_hidden_state_size)

    current_obs = torch.zeros(num_processes, *obs_shape)
    obs, _, _, _ = env.new_expt()
    obs = obs[np.newaxis, ...]
    current_obs[:, -1] = torch.from_numpy(obs)
    rollouts.obs[0].copy_(current_obs)

    current_obs = current_obs.to(device)
    rollouts.to(device)

    num_updates = math.ceil(args.max_timesteps / (num_processes * num_steps))
    n_goal_reached = 0
    n_episodes = 0

    for j in range(num_updates):
        for step in range(num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            (obs, reward, done), goal_reached = env.act(action)
            reward = torch.from_numpy(np.expand_dims(np.stack([reward]), 1)).float()

            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in [done]])
            masks = masks.to(device)

            current_obs[:, :-1] = current_obs[:, 1:]
            if done:
                current_obs[:] = 0
            current_obs[:, -1] = torch.from_numpy(obs)

            rollouts.insert(current_obs, recurrent_hidden_states, action, action_log_prob,
                            value, reward, masks)

            if done:
                n_episodes += 1
                env.new_expt()
                if goal_reached:
                    n_goal_reached += 1

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                rollouts.masks[step]).detach()

        rollouts.compute_returns(next_value, use_gae, gamma, tau, step)
        value_loss, action_loss, dist_entropy = agent.update(rollouts, step)
        rollouts.after_update()

        if j % log_interval == 0:
            total_num_steps = (j + 1) * num_processes * num_steps
            try:
                success = float(n_goal_reached) / n_episodes
            except ZeroDivisionError:
                success = 0.
            print("Timesteps: {}, Goal reached : {} / {}, Success %: {}".format(
                total_num_steps, n_goal_reached, n_episodes, success))

            if args.lang_coeff > 0:
                av_list = np.array(env.action_vectors_list)
                for k in range(len(spearman_corr_coeff_actions)):
                    sr, _ = spearmanr(env.rewards_list, av_list[:, k])
                    print(k, sr)
def setup(model_setting, algorithm, device, _run, _log, log, seed, cuda):
    """
    All args are automatically provided by sacred.

    Some of the important objects created in this function are:
    - parallel environments (using SubprocVecEnv from OpenAI baselines)
    - instance of model (BMIL)
    - experience replay
    - RolloutStorage: a helper class to save rewards and compute the advantage loss
    """
    # Create working dir
    id_tmp_dir = "{}/{}/".format(log['tmp_dir'], _run._id)
    helpers.safe_make_dirs(id_tmp_dir)

    np.set_printoptions(precision=2)
    torch.manual_seed(seed)
    np.random.seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)

    logger = logging.getLogger()
    if _run.debug or _run.pdb:
        logger.setLevel(logging.DEBUG)

    envs = register_and_create_envs(id_tmp_dir)
    model = create_model(envs)

    # Experience replay buffer to store off-policy data.
    replay = ExpReplay(batch_size=algorithm['num_processes_offPol'], max_trajs=1000,
                       fwd_jump=algorithm['forward_jump'],
                       bwd_jump=algorithm['backward_jump'])

    rollouts = RolloutStorage(algorithm['num_steps'], algorithm['num_processes'])
    rollouts.to(device)

    # Reset all environments
    obs = envs.reset()
    curr_ob = torch.from_numpy(obs).float()

    init_state = torch.zeros(algorithm['num_processes'],
                             model_setting['belief_dim']).to(device)
    init_state_offPol = torch.zeros(algorithm['num_processes_offPol'],
                                    model_setting['belief_dim']).to(device)
    init_episode_reward_info = torch.zeros([algorithm['num_processes'], 1])
    init_ac = torch.zeros(algorithm['num_processes'],
                          envs.action_space.shape[0]).to(device)

    # Buffer to hold information along the current "on-policy" path.
    curr_memory = {
        'curr_ob': curr_ob,            # o_t
        'prev_belief': init_state,     # b_{t-1}
        'prev_ac': init_ac,            # a_{t-1}
        'prev_ob': curr_ob.clone(),    # o_{t-1}
        'expert_ac': init_ac.clone(),
        'episode_reward_info': init_episode_reward_info,
    }

    # Buffer to hold information along the current "off-policy" path.
    curr_memory_offPol = {
        'curr_ob': None,
        'prev_ob': None,
        'prev_belief': init_state_offPol,
        'prev_ac': None,
        'ob_tpk': None,          # o_{t+k}
        'ob_tmkm1': None,        # o_{t-k-1}
        'future_k_acs': None,    # a_t:a_{t+k-1}
        'past_k_acs': None,      # a_{t-k-1}:a_{t-2}
        'future_mask': None,     # mask for o_{t+k}
        'past_mask': None,       # mask for o_{t-k-1}
    }

    return envs, model, rollouts, curr_memory, curr_memory_offPol, replay
def main(): print("#######") print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards") print("#######") torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = [make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep) for i in range(args.num_processes)] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs, gamma=args.gamma) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) actor_critic = Policy(obs_shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.recurrent_hidden_state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) obs = envs.reset() update_current_obs(obs, current_obs, obs_shape, args.num_stack) rollouts.obs[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) current_obs = current_obs.to(device) rollouts.to(device) start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) cpu_actions = action.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks masks = masks.to(device) if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs, current_obs, obs_shape, args.num_stack) rollouts.insert(current_obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy, value_loss, action_loss)) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
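# NOTE: `update_current_obs` is not defined in this snippet. Below is a plausible sketch of
# the frame-stacking helper it appears to be: shift the stacked frames left by one frame's
# worth of channels and write the newest observation into the last slot. The exact
# implementation in the original project may differ.
def update_current_obs(obs, current_obs, obs_shape, num_stack):
    # obs_shape[0] is the stacked channel count, so one frame has obs_shape[0] // num_stack channels.
    shape_dim0 = obs_shape[0] // num_stack
    obs = torch.from_numpy(obs).float()
    if num_stack > 1:
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
    current_obs[:, -shape_dim0:] = obs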
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    utils.cleanup_log_dir(log_dir)

    with open(log_dir + 'extras.csv', "w") as file:
        file.write("n, value_loss\n")

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma,
                         args.log_dir, device, False)

    model = Policy(envs.observation_space.shape, envs.action_space.n,
                   extra_kwargs={'use_backpack': args.algo == 'tdprop'})
    model.to(device)

    if args.algo == 'tdprop':
        from algo.sarsa_tdprop import SARSA
        agent = SARSA(model, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm,
                      beta_1=args.beta_1, beta_2=args.beta_2, n=args.num_steps,
                      num_processes=args.num_processes, gamma=args.gamma)
    else:
        from algo.sarsa import SARSA
        agent = SARSA(model, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm,
                      beta_1=args.beta_1, beta_2=args.beta_2, algo=args.algo)

    explore_policy = utils.eps_greedy

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              model.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()

    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                qs = model(rollouts.obs[step])
                _, dist = explore_policy(qs, args.exploration)
                actions = dist.sample().unsqueeze(-1)
                value = qs.gather(-1, actions)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(actions)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor([[0.0] if 'bad_transition' in info.keys() else [1.0]
                                           for info in infos])
            rollouts.insert(obs, torch.FloatTensor([0.0]), actions, value, value,
                            reward, masks, bad_masks)

        with torch.no_grad():
            next_qs = model(rollouts.obs[-1])
            next_probs, _ = explore_policy(next_qs, args.exploration)
            next_value = (next_probs * next_qs).sum(-1).unsqueeze(-1)

        rollouts.compute_returns(next_value, args.gamma)
        value_loss = agent.update(rollouts, explore_policy, args.exploration)
        rollouts.after_update()

        # Save for every interval-th episode or for the last epoch.
        if (j % args.save_interval == 0 or j == num_updates - 1):
            save_path = os.path.join(args.log_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            torch.save([list(model.parameters()),
                        getattr(utils.get_vec_normalize(envs), 'ob_rms', None)],
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                ("Updates {}, num timesteps {}, FPS {}\n"
                 "Last {} training episodes: mean/median reward {:.1f}/{:.1f}"
                 ", min/max reward {:.1f}/{:.1f}\n"
                 "entropy {:.2f}, value loss {:.4f}")
                .format(j, total_num_steps, int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist.entropy().mean().item(), value_loss))
            with open(log_dir + 'extras.csv', "a") as file:
                file.write(str(total_num_steps) + ", " + str(value_loss) + "\n")
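# NOTE: `utils.eps_greedy` is not shown in this snippet. The sketch below is consistent with
# how it is called above (it returns per-action probabilities and a torch Categorical over
# discrete actions), but the project's actual exploration policy may differ.
from torch.distributions import Categorical

def eps_greedy(qs, epsilon):
    # qs: Q-values of shape (num_processes, num_actions)
    num_actions = qs.shape[-1]
    greedy = torch.zeros_like(qs)
    greedy.scatter_(-1, qs.argmax(dim=-1, keepdim=True), 1.0)
    probs = (1.0 - epsilon) * greedy + epsilon / num_actions
    return probs, Categorical(probs=probs)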
def main(): import matplotlib.pyplot as plt # You probably won't need this if you're embedding things in a tkinter plot... plt.ion() x = np.linspace(0, 6 * np.pi, 100) y = np.sin(x) fig = plt.figure() ax = fig.add_subplot(111) import time line1, = ax.plot([0, 1, 2], [0, 1, 1], 'r-') # Returns a tuple of line objects, thus the comma time.sleep(0.01) torch.set_num_threads(1) args.num_processes = 1 # device = torch.device("cuda:0" if args.cuda else "cpu") device = torch.device("cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = TorchRunner(acc=0.005) ob_shape = envs.reset().shape # envs = make_vec_envs(args.env_name, args.seed, args.num_processes, # args.gamma, args.log_dir, args.add_timestep, device, False) # actor_critic = Policy(ob_shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) # # try to load the previous policy # data = torch.load( # r"C:\Users\clvco\URA_F18\pytorch-a2c-ppo-acktr\trained_models\ppo\weight_positiverev_test.pt") # # # print(data) # actor_critic.load_state_dict(data[0].state_dict()) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) obs = envs.reset() ob_shape = obs.shape rollouts = RolloutStorage(args.num_steps, args.num_processes, ob_shape, envs.action_space, (agent.actor_critic.base.output_size), (1), actor_critic.recurrent_hidden_state_size) print(args.num_processes) print(envs.observation_space.shape) print(obs.shape) print(rollouts.obs[0].shape) rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = list() ep_reward = 0 import tqdm start = time.time() print(args) print(int(args.num_frames) // args.num_steps // args.num_processes) print('NUM', num_updates) timestep = 0 ep_ends = [] for j in range(num_updates): if j == 0: print("UPDATING SYNERGY") actor_critic.adjust_synergy(0.0) for step in tqdm.tqdm(range(args.num_steps)): # Sample actions timestep += 1 with torch.no_grad(): value, action, synergy, q, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) ep_reward += reward[0] for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) if done[0]: obs = envs.reset() episode_rewards.append(ep_reward) ep_ends.append(timestep) ep_reward = 0 # print(action) rollouts.insert(obs, recurrent_hidden_states, action, synergy, q, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model] print("Saving model") torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) print("Saved model to: ", os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps print("update time", print(len(episode_rewards))) if True: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.5f}/{:.5f}, min/max reward {:.5f}/{:.5f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards[-10:]), np.median(episode_rewards[-10:]), np.min(episode_rewards[-10:]), np.max(episode_rewards[-10:]), dist_entropy, value_loss, action_loss)) import time ydata = np.convolve(episode_rewards, np.ones(10) / 10, mode='valid') line1.set_xdata(np.arange(0, len(ydata))) line1.set_ydata(ydata) ax.set_xlim(0, len(ydata)) ax.set_ylim(min(ydata), max(ydata)) fig.canvas.draw() fig.canvas.flush_events() time.sleep(0.01) # save the returns xdata = np.array(ep_ends) ret_dir = 'returns_weight_experiments' os.makedirs(ret_dir, exist_ok=True) ret_path = ret_dir + '/' + args.env_name + '_' + str( args.seed) + '.npy' ep_path = ret_dir + '/' + "x_data-" + args.env_name + '_' + str( args.seed) + '.npy' np.save(ret_path, np.array(np.array(episode_rewards))) np.save(ep_path, ep_ends)
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, 1, args.gammas[-1], None,
                         args.add_timestep, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy,
                                       'num_values': args.num_values,
                                       'sum_values': args.sum_values})
    state_dict = torch.load(args.log_dir + '/ppo/' + args.env_name + '.pt')
    actor_critic.load_state_dict(state_dict[0].state_dict())
    actor_critic.to(device)

    rollouts = RolloutStorage(1, 1, envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size, tau=args.tau,
                              gammas=args.gammas, use_delta_gamma=args.use_delta_gamma,
                              use_capped_bias=args.use_capped_bias)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = []
    values = []
    rewards = []

    for num_no_ops in range(30):
        really_done = False
        cur_step = 0
        while not really_done:
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    obs, rollouts.recurrent_hidden_states[0], rollouts.masks[0])

            if cur_step <= num_no_ops:
                obs, reward, done, infos = envs.step(torch.zeros((1, 1)))
            else:
                # Sample actions
                # Observe reward and next obs
                obs, reward, done, infos = envs.step(action)

            if num_no_ops == 0:
                if device.type == 'cpu':
                    rewards.append(reward.numpy())
                    values.append(value.numpy())
                else:
                    rewards.append(reward.cpu().numpy())
                    values.append(value.cpu().numpy())

            if 'episode' in infos[0].keys():
                really_done = True
                episode_rewards.append(infos[0]['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob,
                            value, reward, masks)
            cur_step += 1

    with open(args.log_dir + '/random_rewards.pkl', 'wb') as handle:
        pickle.dump(episode_rewards, handle)
    with open(args.log_dir + '/values_timestep.pkl', 'wb') as handle:
        pickle.dump(values, handle)
    with open(args.log_dir + '/rewards_timestep.pkl', 'wb') as handle:
        pickle.dump(rewards, handle)
def main(args):
    env = GymEnvironment(args, gamma)
    env.env = env.env.unwrapped

    actor_critic = Policy(obs_shape, env.action_size, base_kwargs={'recurrent': False})
    actor_critic.to(device)

    agent = PPO(actor_critic, clip_param, ppo_epoch, num_mini_batch, value_loss_coef,
                entropy_coef, lr, eps, max_grad_norm)

    rollouts = RolloutStorage(num_steps, num_processes, obs_shape, env.action_space,
                              actor_critic.recurrent_hidden_state_size)

    current_obs = torch.zeros(num_processes, *obs_shape)
    obs, _, _, _ = env.new_expt()
    obs = obs[np.newaxis, ...]
    current_obs[:, -1] = torch.from_numpy(obs)
    rollouts.obs[0].copy_(current_obs)

    current_obs = current_obs.to(device)
    rollouts.to(device)

    num_updates = math.ceil(args.max_timesteps / (num_processes * num_steps))
    n_goal_reached = 0
    n_episodes = 0

    for j in tqdm(range(num_updates), ascii=True):
        for step in range(num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            (obs, reward, done), goal_reached = env.act(action)
            reward = torch.from_numpy(np.expand_dims(np.stack([reward]), 1)).float()

            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in [done]])
            masks = masks.to(device)

            current_obs[:, :-1] = current_obs[:, 1:]
            if done:
                current_obs[:] = 0
            current_obs[:, -1] = torch.from_numpy(obs)

            rollouts.insert(current_obs, recurrent_hidden_states, action, action_log_prob,
                            value, reward, masks)

            if done:
                n_episodes += 1
                env.new_expt()
                if goal_reached:
                    n_goal_reached += 1

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                rollouts.masks[step]).detach()

        rollouts.compute_returns(next_value, use_gae, gamma, tau, step)
        value_loss, action_loss, dist_entropy = agent.update(rollouts, step)
        rollouts.after_update()

    torch.save(agent.actor_critic.state_dict(), 'log/model.pt')
def main(): ''' Train PPO policies on each of the training environments. ''' args = get_args() try: os.makedirs(args.log_dir) except OSError: pass torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args, device) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': False}) actor_critic.to(device) agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) ep_reward = np.zeros(args.num_processes) episode_rewards = deque(maxlen=100) num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): # decrease learning rate linearly utils.update_linear_schedule(agent.optimizer, j, num_updates, args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obs reward and next obs obs, reward, done, infos = envs.step(action) if 'spaceship' in args.env_name: # spaceship, swimmer for i in range(len(done)): if done[i]: episode_rewards.append(reward[i].item()) # elif 'swimmer' in args.env_name: else: for i in range(len(done)): ep_reward[i] += reward[i].numpy().item() if done[i]: episode_rewards.append(ep_reward[i]) ep_reward[i] = 0 # if 'ant' in args.env_name: # for info in infos: # if 'episode' in info.keys(): # episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, True, args.gamma, args.gae_lambda, True) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": try: os.makedirs(args.save_dir) except OSError: pass torch.save( actor_critic.state_dict(), os.path.join(args.save_dir, "ppo.{}.env{}.seed{}.pt"\ .format(args.env_name, args.default_ind, args.seed)) ) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps print("\nUpdates {}, num timesteps {}, Last {} training episodes: \ \n mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}" .format(j, total_num_steps, len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards))) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, device) envs.close()
                           max_grad_norm=parameters['max_grad_norm'],
                           use_adam=parameters['use_adam'])
else:
    agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                     args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

rollouts = RolloutStorage(parameters['num_steps'], parameters['num_processes'],
                          envs.observation_space.shape, envs.action_space,
                          actor_critic.recurrent_hidden_state_size)

obs = envs.reset()
rollouts.obs[0].copy_(obs)
rollouts.to(device)

recent_count = 50
episode_rewards = deque(maxlen=recent_count)
episode_lengths = deque(maxlen=recent_count)

if args.continue_training:
    progress = json.load(open(progress_save))
    num_updates_init = progress["last_saved_num_updates"]
    actor_critic.load_state_dict(torch.load(MODEL_SAVE_PATH))
else:
    num_updates_init = 0
    progress = {"last_saved_num_updates": 0}
def main(): device = 'cpu' acc_steps = [] acc_scores = [] torch.set_num_threads(1) print('here') if args.env_name == 'Reacher-v2': rbf1 = build_features_reacher2(.2, 5, 2) len_rbf = rbf1._K len_features = len_rbf + 1 if args.env_name == 'Hopper-v2': len_features = 3 envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space) actor_critic.to(device) agent = PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, len_features) print('here2') obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = collections.deque(maxlen=10) num_updates = 20 for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly update_linear_schedule(agent.optimizer, j, num_updates, args.lr) agent.clip_param = args.clip_param * (1 - j / float(num_updates)) # Prepare demos demo_actions = np.zeros( (1, args.num_processes, envs.action_space.shape[0])) demo_states = np.zeros( (1, args.num_processes, envs.observation_space.shape[0])) demo_features = np.zeros((1, args.num_processes, len_features)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob = actor_critic.act( rollouts.obs[step], rollouts.masks[step]) # obs, reward and next obs demo_actions = np.concatenate( [demo_actions, action.reshape(1, args.num_processes, -1)], 0) demo_states = np.concatenate([ demo_states, rollouts.obs[step].reshape( 1, args.num_processes, -1) ], 0) feat_rewards = np.zeros((args.num_processes, len_features)) if args.env_name == 'Hopper-v2': if args.num_processes > 1: pos_before = envs.get_sim_data() obs, reward, done, infos = envs.step(action) if args.env_name == 'Hopper-v2': if args.num_processes > 1: pos_after = envs.get_sim_data() for num_p in range(args.num_processes): feat_1 = pos_after[num_p] - pos_before[num_p] feat_2 = 0 if not done[num_p]: feat_2 = 1 # feat_2 = np.array([1 for _ in range(args.num_processes)]) feat_3 = np.array( [np.linalg.norm(action[num_p], ord=2)**2]).flatten() feat_rewards[num_p] = np.array( [feat_1, feat_2, feat_3]) if args.env_name == 'Reacher-v2': if args.num_processes > 1: body_data = envs.get_body_data() for num_p in range(args.num_processes): rbf1_ = rbf1(body_data[num_p][:-1]) rbf4_ = np.array( [np.linalg.norm(action[num_p], ord=2)**2]) feat_rewards[num_p] = np.concatenate( (rbf1_.reshape(-1), rbf4_)) else: rbf1_ = rbf1( (envs.envs[0].env.env.get_body_com("fingertip") - envs.envs[0].env.env.get_body_com("target"))[:-1]) rbf4_ = np.array([-np.square(action[0]).sum()]) feat_rewards[0] = np.concatenate( (rbf1_.reshape(-1), rbf4_)) demo_features = np.concatenate([ demo_features, feat_rewards.reshape(1, args.num_processes, -1) ], 0) if step > 1 and step % 1000 == 0: done = [True for _ in range(args.num_processes)] for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, action, action_log_prob, \ value, reward, masks, feat_rewards) # Save demos: action_file_name = demos_expe_dir + '/actions_step_' + str(j) + '.npy' state_file_name = demos_expe_dir + '/states_step_' + str(j) + '.npy' rew_feat_file_name = demos_expe_dir + '/rew_feat_step_' + str( j) + '.npy' policy_file_name = demos_expe_dir + '/policy_step_' + str(j) + '.pth' np.save(action_file_name, demo_actions) np.save(state_file_name, demo_states) np.save(rew_feat_file_name, demo_features) torch.save(actor_critic.state_dict(), policy_file_name) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir: save_path = os.path.join(args.save_dir, 'ppo') try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save(save_model, os.path.join(save_path, args.env_name + '.pt')) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: print('Updates', j, 'num timesteps', len(episode_rewards), '\n Last training episodes: mean/median reward', '{:.1f}'.format(np.mean(episode_rewards)), '/{:.1f}'.format(np.median(episode_rewards)), 'min/max reward', '{:.1f}'.format(np.min(episode_rewards)), '/{:.1f}'.format(np.max(episode_rewards)), 'dist entropy', dist_entropy, 'value loss', value_loss, 'action loss', action_loss) if len(episode_rewards) > 1: acc_steps.append(total_num_steps) acc_scores.append(np.mean(episode_rewards)) #print(acc_scores) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _ = actor_critic.act(obs, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print('Evaluation using', len(eval_episode_rewards), 'episodes: mean reward', '{:.5f}\n'.format(np.mean(eval_episode_rewards))) scores_file_name = args.scores_dir + '/learner_scores_' + args.env_name + '_' + args.expe + '.npy' steps_file_name = args.scores_dir + '/learner_steps_' + args.env_name + '_' + args.expe + '.npy' np.save(scores_file_name, np.array(acc_scores)) np.save(steps_file_name, np.array(acc_steps))
def main(): device = 'cpu' acc_steps = [] acc_scores = [] torch.set_num_threads(1) envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) # get cloned policy and recovered reward function policy_reward_dir = args.rewards_dir policy_dir = args.policies_dir policy_reward = Policy(envs.observation_space.shape, envs.action_space) policy_reward_file_name = policy_reward_dir + '/reward_' + args.expe + '.pth' policy_reward_sd = torch.load(policy_reward_file_name) policy_reward.load_state_dict(policy_reward_sd) actor_critic = Policy(envs.observation_space.shape, envs.action_space) policy_file_name = policy_dir + '/last_policy_' + args.expe + '.pth' policy_sd = torch.load(policy_file_name) actor_critic.load_state_dict(policy_sd) actor_critic.to(device) agent = PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = collections.deque(maxlen=10) for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly update_linear_schedule(agent.optimizer, j, num_updates, args.lr) agent.clip_param = args.clip_param * (1 - j / float(num_updates)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob = actor_critic.act( rollouts.obs[step], rollouts.masks[step]) obs, _, done, infos = envs.step(action) if step > 1 and step % 1000 == 0: done = True # use infered reward: with torch.no_grad(): # _, reward = shapes(rollouts.obs[step], 0) _, action_log_probs, _, _ = policy_reward.evaluate_actions( rollouts.obs[step], None, None, action) reward = action_log_probs for info in infos: # if 'episode' in info.keys(): # episode_rewards.append(info['episode']['r']) r = 0 for key, val in info.items(): if 'reward' in key: r += val episode_rewards.append(r) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir: save_path = os.path.join(args.save_dir, 'ppo') try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic save_model = [save_model, getattr(get_vec_normalize(envs), 'ob_rms', None)] torch.save(save_model, os.path.join(save_path, args.env_name + '.pt')) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: print('Updates', j, 'num timesteps', len(episode_rewards), '\n Last training episodes: mean/median reward', '{:.1f}'.format(np.mean(episode_rewards)), '/{:.1f}'.format(np.median(episode_rewards)), 'min/max reward', '{:.1f}'.format(np.min(episode_rewards)), '/{:.1f}'.format(np.max(episode_rewards)), 'dist entropy', dist_entropy, 'value loss', value_loss, 'action loss', action_loss) if len(episode_rewards) > 1: acc_steps.append(total_num_steps) acc_scores.append(np.mean(episode_rewards)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _ = actor_critic.act( obs, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print('Evaluation using', len(eval_episode_rewards), 'episodes: mean reward', '{:.5f}\n'.format(np.mean(eval_episode_rewards))) scores_file_name = args.scores_dir + '/learner_scores_' + args.expe + '.npy' steps_file_name = args.scores_dir + '/learner_steps_' + args.expe + '.npy' np.save(scores_file_name, np.array(acc_scores)) np.save(steps_file_name, np.array(acc_steps))
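In this learner, the environment reward is discarded and replaced by the log-probability that the recovered reward network (`policy_reward`) assigns to the learner's action. A minimal sketch of that substitution, using a simplified stand-in with a reduced `evaluate_actions` signature (the real interface takes more arguments; all names here are hypothetical):

import torch
import torch.nn as nn

class TinyRewardPolicy(nn.Module):
    """Stand-in for the recovered reward network: scores state-action pairs."""
    def __init__(self, obs_dim, n_actions):
        super().__init__()
        self.logits = nn.Linear(obs_dim, n_actions)

    def evaluate_actions(self, obs, action):
        dist = torch.distributions.Categorical(logits=self.logits(obs))
        return dist.log_prob(action.squeeze(-1)).unsqueeze(-1)

obs = torch.randn(4, 8)              # 4 parallel envs, 8-dim observations
action = torch.randint(0, 3, (4, 1))  # actions taken by the learner
reward_net = TinyRewardPolicy(8, 3)

with torch.no_grad():
    # Log-prob of the taken action under the recovered policy acts as the reward.
    reward = reward_net.evaluate_actions(obs, action)
print(reward.shape)  # torch.Size([4, 1])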
def main(): torch.set_num_threads(1) device = torch.device("cuda:1" if args.cuda else "cpu") ## UID = 'exp_{}'.format( datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) step_log = [] reward_log = [] ## To be used to selec environment mode = 'normal' # encoder type encoder = 'sym_VAE' if encoder == 'symbolic': embedding_size = (18, ) elif encoder == 'AE': embedding_size = (200, ) elif encoder == 'VAE': embedding_size = (100, ) elif encoder == 'sym_VAE': embedding_size = (118, ) else: raise NotImplementedError('fff') # load pre-trained AE #AE = VAEU([128,128]) #model_path = '/hdd_c/data/miniWorld/trained_models/VAE/dataset_4/VAEU.pth' #AE = torch.load(model_path) #AE.eval() # load pre-trained VAE VAE = VAER([128, 128]) model_path = '/hdd_c/data/miniWorld/trained_models/VAE/dataset_5/VAER.pth' VAE = torch.load(model_path).to(device) VAE.eval() # load pre-trained detector Detector_model = Detector model_path = '/hdd_c/data/miniWorld/trained_models/Detector/dataset_5/Detector_resnet18_e14.pth' Detector_model = torch.load(model_path).to(device) # load pre-trained RNN RNN_model = RNN(200, 128) model_path = '/hdd_c/data/miniWorld/trained_models/RNN/RNN1.pth' RNN_model = torch.load(model_path).to(device) RNN_model.eval() """ if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None """ envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) print(envs.observation_space.shape) #actor_critic = Policy(envs.observation_space.shape, envs.action_space, # base_kwargs={'recurrent': args.recurrent_policy}) actor_critic = Policy(embedding_size, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) #rollouts = RolloutStorage(args.num_steps, args.num_processes, # envs.observation_space.shape, envs.action_space, # actor_critic.recurrent_hidden_state_size) rollouts = RolloutStorage(args.num_steps, args.num_processes, embedding_size, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() #print(obs.size()) #obs = make_var(obs) print(obs.size()) with torch.no_grad(): if encoder == 'symbolic': z = Detector_model(obs) print(z.size()) z = Detector_to_symbolic(z) rollouts.obs[0].copy_(z) elif encoder == 'AE': z = AE.encode(obs) rollouts.obs[0].copy_(z) elif encoder == 'VAE': z = VAE.encode(obs)[0] rollouts.obs[0].copy_(z) elif encoder == 'sym_VAE': z_vae = VAE.encode(obs)[0] z_sym = Detector_model(obs) z_sym = Detector_to_symbolic(z_sym) z = torch.cat((z_vae, z_sym), dim=1) rollouts.obs[0].copy_(z) else: raise NotImplementedError('fff') #rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=100) start = time.time() for j in range(num_updates): #print(j) for step in range(args.num_steps): # Sample actions #print(step) with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs #print(action) 
with torch.no_grad(): obs, reward, done, infos = envs.step(action) if encoder == 'symbolic': #print(obs.size()) np.save( '/hdd_c/data/miniWorld/training_obs_{}.npy'.format( step), obs.detach().cpu().numpy()) z = Detector_model(obs / 255.0) z = Detector_to_symbolic(z) #print(z) np.save( '/hdd_c/data/miniWorld/training_z_{}.npy'.format(step), z.detach().cpu().numpy()) elif encoder == 'AE': z = AE.encode(obs) elif encoder == 'VAE': z = VAE.encode(obs)[0] elif encoder == 'sym_VAE': z_vae = VAE.encode(obs)[0] z_sym = Detector_model(obs) z_sym = Detector_to_symbolic(z_sym) z = torch.cat((z_vae, z_sym), dim=1) else: raise NotImplementedError('fff') #obs = make_var(obs) """ for info in infos: if 'episode' in info.keys(): print(reward) episode_rewards.append(info['episode']['r']) """ # # FIXME: works only for environments with sparse rewards # for idx, eps_done in enumerate(done): # if eps_done: # episode_rewards.append(reward[idx]) # FIXME: works only for environments with sparse rewards for idx, eps_done in enumerate(done): if eps_done: #print('done') episode_rewards.append(infos[idx]['accumulated_reward']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) rollouts.insert(z, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": print('Saving model') print() save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps #print(len(episode_rewards)) step_log.append(total_num_steps) reward_log.append(np.mean(episode_rewards)) step_log_np = np.asarray(step_log) reward_log_np = np.asarray(reward_log) np.savez_compressed('/hdd_c/data/miniWorld/log/{}.npz'.format(UID), step=step_log_np, reward=reward_log_np) if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n" .format( j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), np.count_nonzero(np.greater(episode_rewards, 0)) / len(episode_rewards))) if args.eval_interval is not None and len( episode_rewards) > 1 and j % args.eval_interval == 0: eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) if eval_envs.venv.__class__.__name__ == "VecNormalize": eval_envs.venv.ob_rms = envs.venv.ob_rms # An ugly hack to remove updates def _obfilt(self, obs): if self.ob_rms: obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + 
self.epsilon), -self.clipob, self.clipob) return obs else: return obs eval_envs.venv._obfilt = types.MethodType(_obfilt, envs.venv) eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) """ if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass """ envs.close()
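The `sym_VAE` branch above feeds the policy a concatenation of a VAE latent code and a symbolic detection vector instead of raw pixels, which is where the 118-dimensional embedding size (100 + 18) comes from. A minimal sketch of that concatenation with stand-in encoders (the real `VAER` and `Detector_to_symbolic` are project-specific; these stubs only reproduce the shapes):

import torch
import torch.nn as nn

class StubVAE(nn.Module):
    """Stand-in for VAE.encode: maps an image batch to a 100-d latent mean."""
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(3 * 128 * 128, 100)

    def encode(self, x):
        mu = self.fc(x.flatten(1))
        return mu, torch.zeros_like(mu)  # (mean, logvar); the mean is used downstream

class StubDetector(nn.Module):
    """Stand-in for the detector pipeline that yields an 18-d symbolic vector."""
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(3 * 128 * 128, 18)

    def forward(self, x):
        return self.fc(x.flatten(1))

obs = torch.rand(4, 3, 128, 128)   # 4 parallel envs, 128x128 RGB frames
vae, detector = StubVAE(), StubDetector()
with torch.no_grad():
    z_vae = vae.encode(obs)[0]
    z_sym = detector(obs)
    z = torch.cat((z_vae, z_sym), dim=1)  # 100 + 18 = 118, matching embedding_size
print(z.shape)  # torch.Size([4, 118])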
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, args.custom_gym) base = SEVN actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'ppo': agent = PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) episode_length = deque(maxlen=10) episode_success_rate = deque(maxlen=100) episode_total = 0 start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule(agent.optimizer, j, num_updates, args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) episode_length.append(info['episode']['l']) episode_success_rate.append( info['was_successful_trajectory']) episode_total += 1 # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() writer.add_scalars('Train/Episode Reward', { "Reward Mean": np.mean(episode_rewards), "Reward Min": np.min(episode_rewards), "Reward Max": np.max(episode_rewards) }, global_step=total_num_steps) writer.add_scalars('Train/Episode Length', { "Episode Length Mean": np.mean(episode_length), "Episode Length Min": np.min(episode_length), "Episode Length Max": np.max(episode_length) }, global_step=total_num_steps) writer.add_scalar("Train/Episode Reward Mean", np.mean(episode_rewards), global_step=total_num_steps) writer.add_scalar("Train/Episode Length Mean", np.mean(episode_length), global_step=total_num_steps) writer.add_scalar("Train/Episode Success Rate", np.mean(episode_success_rate), global_step=total_num_steps) print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
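Here `bad_masks` distinguishes true terminal states from time-limit truncations flagged by the wrapper via `bad_transition`: with `use_proper_time_limits`, the return at a truncated step falls back to the critic's own estimate rather than treating the cut-off as a real termination. A minimal sketch of that rule, assuming the time-limit bootstrapping formula used by common PPO return computations (values are made up):

import torch

gamma = 0.99
reward = torch.tensor([[1.0], [1.0]])
value_pred = torch.tensor([[5.0], [5.0]])     # critic's estimate at this step
next_return = torch.tensor([[10.0], [10.0]])  # return already computed for t+1

masks = torch.tensor([[0.0], [0.0]])       # both envs reported done
bad_masks = torch.tensor([[1.0], [0.0]])   # env 0: true terminal, env 1: time limit

# Proper-time-limit rule: where bad_masks == 0 the episode was only truncated,
# so the target collapses to the critic's estimate instead of the cut-off return.
ret = reward + gamma * next_return * masks
ret = ret * bad_masks + (1 - bad_masks) * value_pred
print(ret)  # env 0 keeps the terminal return, env 1 falls back to value_pred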
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) print('args.lr') print(args.lr) # print('args.stat_decay') # print(args.stat_decay) # sys.exit() if args.algo == 'a2c': # print('args.eps') # print(args.eps) # sys.exit() agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo in ['acktr']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, acktr=True, stat_decay=args.stat_decay) elif args.algo in ['acktr-h**o']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, acktr=True, if_homo=True, stat_decay=args.stat_decay) elif args.algo in ['acktr-h**o-noEigen']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, acktr=True, if_homo=True, stat_decay=args.stat_decay, if_eigen=False) elif args.algo in ['kbfgs']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, kbfgs=True, stat_decay=args.stat_decay) elif args.algo in ['kbfgs-h**o']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, kbfgs=True, if_homo=True, stat_decay=args.stat_decay) elif args.algo in ['kbfgs-h**o-invertA']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, kbfgs=True, if_homo=True, stat_decay=args.stat_decay, if_invert_A=True) elif args.algo in ['kbfgs-h**o-invertA-decoupledDecay']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, kbfgs=True, if_homo=True, stat_decay_A=args.stat_decay_A, stat_decay_G=args.stat_decay_G, if_invert_A=True, if_decoupled_decay=True) elif args.algo in ['kbfgs-h**o-momentumGrad']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, kbfgs=True, if_homo=True, if_momentumGrad=True, stat_decay=args.stat_decay) elif args.algo in ['kbfgs-h**o-noClip']: agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, kbfgs=True, if_homo=True, if_clip=False, stat_decay=args.stat_decay) else: print('unknown args.algo for ' + args.algo) sys.exit() rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) record_rewards = [] record_num_steps = [] print('num_updates') print(num_updates) total_num_steps = 0 start = time.time() for j in range(num_updates): print('j') print(j) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], 
rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: # print('info.keys()') # print(info.keys()) if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) print('info[episode][r]') print(info['episode']['r']) record_rewards.append(info['episode']['r']) # print('total_num_steps') # print(total_num_steps) # print('total_num_steps + (step + 1) * args.num_processes') # print(total_num_steps + (step + 1) * args.num_processes) record_num_steps.append(total_num_steps + (step + 1) * args.num_processes) # sys.exit() # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy, update_signal = agent.update( rollouts) if update_signal == -1: # sys.exit() break rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) 
except IOError: pass print('record_rewards') print(record_rewards) dir_with_params = args.env_name + '/' +\ args.algo + '/' +\ 'eps_' + str(args.eps) + '/' +\ 'lr_' + str(args.lr) + '/' +\ 'stat_decay_' + str(args.stat_decay) + '/' # saving_dir = './result/' + args.env_name + '/' + args.algo + '/' saving_dir = './result/' + dir_with_params if not os.path.isdir(saving_dir): os.makedirs(saving_dir) import pickle with open(saving_dir + 'result.pkl', 'wb') as handle: pickle.dump( { 'record_rewards': record_rewards, 'record_num_steps': record_num_steps }, handle) print('args.log_dir') print(args.log_dir) print('os.listdir(args.log_dir)') print(os.listdir(args.log_dir)) # saving_dir_monitor = './result_monitor/' + args.env_name + '/' + args.algo + '/' saving_dir_monitor = './result_monitor/' + dir_with_params if os.path.isdir(saving_dir_monitor): import shutil shutil.rmtree(saving_dir_monitor) if not os.path.isdir(saving_dir_monitor): os.makedirs(saving_dir_monitor) print('saving_dir_monitor') print(saving_dir_monitor) import shutil for file_name in os.listdir(args.log_dir): full_file_name = os.path.join(args.log_dir, file_name) print('full_file_name') print(full_file_name) print('os.path.isfile(full_file_name)') print(os.path.isfile(full_file_name)) if os.path.isfile(full_file_name): shutil.copy(full_file_name, saving_dir_monitor) # print('os.listdir(saving_dir_monitor)') # print(os.listdir(saving_dir_monitor)) # print('len(os.listdir(saving_dir_monitor))') # print(len(os.listdir(saving_dir_monitor))) # print('args.num_processes') # print(args.num_processes) assert len(os.listdir(saving_dir_monitor)) == args.num_processes
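The results above are pickled per hyper-parameter combination under `./result/<env>/<algo>/eps_*/lr_*/stat_decay_*/result.pkl`. A small hedged sketch of reading one of these files back for plotting; the concrete path components below are placeholders, not the exact run directories:

import pickle

# Example path following the dir_with_params layout above; environment,
# algorithm, and hyper-parameter values are illustrative placeholders.
result_path = './result/BreakoutNoFrameskip-v4/a2c/eps_1e-05/lr_0.0007/stat_decay_0.99/result.pkl'

with open(result_path, 'rb') as handle:
    data = pickle.load(handle)

rewards = data['record_rewards']      # one entry per finished episode
num_steps = data['record_num_steps']  # env steps at which each episode ended
print(len(rewards), len(num_steps))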
def run(self, time, S_time_interval, S_send_data_size, S_chunk_len, S_rebuf, S_buffer_size, S_play_time_len, S_end_delay, S_decision_flag, S_buffer_flag, S_cdn_flag, end_of_video, cdn_newest_id, download_id, cdn_has_frame, IntialVars): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None # The online env in AItrans, it should have the observation space, action space and so on # We should step into the depth of envs.py in the github doc, and extract the format of observation # and action space envs = actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) # choose the algorithm, now we only have a2c if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) # the initial observation obs = rollouts.obs[0].copy_(obs) rollouts.to(device) episode_reward = deque(maxlen=10) start = time.time() for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly if args.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: update_linear_schedule(agent.optimizer, j, num_updates, args.lr) if args.algo == 'ppo' and args.use_linear_lr_decay: agent.clip_param = args.clip_param * (1 - j / float(num_updates)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step])
def main(): print('Preparing parameters') torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") print('Creating envs: {}'.format(args.env_name)) envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) # input(envs) print('Creating network') actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) print('Initializing PPO') agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) print('Memory') rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) num_episodes = [0 for _ in range(args.num_processes)] if args.run_id == "debug": try: shutil.rmtree('./runs/debug') except: pass writer = SummaryWriter("./runs/{}".format(args.run_id)) with open('./runs/{}/recap.txt'.format(args.run_id), 'w') as file: file.write(str(actor_critic)) last_index = 0 print('Starting ! ') start = time.time() for j in tqdm(range(num_updates)): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob = actor_critic.act( rollouts.obs[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info_num, info in enumerate(infos): if (info_num == 0): if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) end_episode_to_viz(writer, info, info_num, num_episodes[info_num]) num_episodes[info_num] += 1 # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) losses = agent.update(rollouts) rollouts.after_update() losses_to_viz(writer, losses, j) create_checkpoint(actor_critic, envs, args) last_index = global_rew_to_viz(writer, last_index)
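The `end_episode_to_viz`, `losses_to_viz`, and related helpers used in the run loop above are project-specific and not shown here. A minimal sketch of the kind of TensorBoard logging they presumably perform, assuming the tensorboardX `SummaryWriter` used elsewhere in these scripts (tag names are illustrative):

from tensorboardX import SummaryWriter

writer = SummaryWriter('./runs/debug')

def log_episode(writer, info, episode_idx):
    # Monitor wrappers expose the episode return/length under info['episode'].
    writer.add_scalar('episode/reward', info['episode']['r'], episode_idx)
    writer.add_scalar('episode/length', info['episode']['l'], episode_idx)

def log_losses(writer, value_loss, action_loss, dist_entropy, update_idx):
    writer.add_scalar('loss/value', value_loss, update_idx)
    writer.add_scalar('loss/action', action_loss, update_idx)
    writer.add_scalar('loss/entropy', dist_entropy, update_idx)

log_episode(writer, {'episode': {'r': 1.5, 'l': 200}}, 0)
log_losses(writer, 0.4, -0.02, 1.1, 0)
writer.close()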
def main(): config = None args = get_args() config, checkpoint = get_config_and_checkpoint(args) set_random_seeds(args, config) eval_log_dir = args.save_dir + "_eval" try: os.makedirs(args.save_dir) os.makedirs(eval_log_dir) except OSError: pass now = datetime.datetime.now() experiment_name = args.experiment_name + '_' + now.strftime( "%Y-%m-%d_%H-%M-%S") # Create checkpoint file save_dir_model = os.path.join(args.save_dir, 'model', experiment_name) save_dir_config = os.path.join(args.save_dir, 'config', experiment_name) try: os.makedirs(save_dir_model) os.makedirs(save_dir_config) except OSError as e: logger.error(e) exit() if args.config: shutil.copy2(args.config, save_dir_config) # Tensorboard Logging writer = SummaryWriter( os.path.join(args.save_dir, 'tensorboard', experiment_name)) # Logger that writes to STDOUT and a file in the save_dir logger = setup_carla_logger(args.save_dir, experiment_name) device = torch.device("cuda:0" if args.cuda else "cpu") norm_reward = not config.no_reward_norm norm_obs = not config.no_obs_norm assert not (config.num_virtual_goals > 0) or ( config.reward_class == 'SparseReward'), 'Cant use HER with dense reward' obs_converter = CarlaObservationConverter( h=84, w=84, rel_coord_system=config.rel_coord_system) action_converter = CarlaActionsConverter(config.action_type) envs = make_vec_envs(obs_converter, action_converter, args.starting_port, config.seed, config.num_processes, config.gamma, device, config.reward_class, num_frame_stack=1, subset=config.experiments_subset, norm_reward=norm_reward, norm_obs=norm_obs, apply_her=config.num_virtual_goals > 0, video_every=args.video_interval, video_dir=os.path.join(args.save_dir, 'video', experiment_name)) if config.agent == 'forward': agent = agents.ForwardCarla() if config.agent == 'a2c': agent = agents.A2CCarla(obs_converter, action_converter, config.value_loss_coef, config.entropy_coef, lr=config.lr, eps=config.eps, alpha=config.alpha, max_grad_norm=config.max_grad_norm) elif config.agent == 'acktr': agent = agents.A2CCarla(obs_converter, action_converter, config.value_loss_coef, config.entropy_coef, lr=config.lr, eps=config.eps, alpha=config.alpha, max_grad_norm=config.max_grad_norm, acktr=True) elif config.agent == 'ppo': agent = agents.PPOCarla(obs_converter, action_converter, config.clip_param, config.ppo_epoch, config.num_mini_batch, config.value_loss_coef, config.entropy_coef, lr=config.lr, eps=config.eps, max_grad_norm=config.max_grad_norm) if checkpoint is not None: load_modules(agent.optimizer, agent.model, checkpoint) rollouts = RolloutStorage(config.num_steps, config.num_processes, envs.observation_space, envs.action_space, 20, config.num_virtual_goals, config.rel_coord_system, obs_converter) obs = envs.reset() # Save the first observation obs = obs_to_dict(obs) rollouts.obs = obs_to_dict(rollouts.obs) for k in rollouts.obs: rollouts.obs[k][rollouts.step + 1].copy_(obs[k]) rollouts.obs = dict_to_obs(rollouts.obs) rollouts.to(device) start = time.time() total_steps = 0 total_episodes = 0 total_reward = 0 episode_reward = torch.zeros(config.num_processes) for j in range(config.num_updates): for step in range(config.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = agent.act( rollouts.get_obs(step), rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Observe reward and next obs obs, reward, done, info = envs.step(action) # For logging purposes carla_rewards = torch.tensor([i['carla-reward'] for i in info], dtype=torch.float) 
episode_reward += carla_rewards total_reward += carla_rewards.sum().item() total_steps += config.num_processes if done.any(): total_episodes += done.sum() torch_done = torch.tensor(done.astype(int)).byte() mean_episode_reward = episode_reward[torch_done].mean().item() logger.info('{} episode(s) finished with reward {}'.format( done.sum(), mean_episode_reward)) writer.add_scalar('train/mean_ep_reward_vs_steps', mean_episode_reward, total_steps) writer.add_scalar('train/mean_ep_reward_vs_episodes', mean_episode_reward, total_episodes) episode_reward[torch_done] = 0 # If done then clean the history of observations. masks = torch.FloatTensor(1 - done) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks.unsqueeze(-1)) if config.num_virtual_goals > 0: rollouts.apply_her(config.num_virtual_goals, device, beta=config.beta) with torch.no_grad(): next_value = agent.get_value( rollouts.get_obs(-1), # Get last observation rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, config.use_gae, config.gamma, config.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "" and config.agent != 'forward': save_path = os.path.join(save_dir_model, str(j) + '.pth.tar') save_modules(agent.optimizer, agent.model, args, config, save_path) total_num_steps = (j + 1) * config.num_processes * config.num_steps if j % args.log_interval == 0: # Logging to the stdout/our logs end = time.time() logger.info('------------------------------------') logger.info('Episodes {}, Updates {}, num timesteps {}, FPS {}'\ .format(total_episodes, j + 1, total_num_steps, total_num_steps / (end - start))) logger.info('------------------------------------') # Logging to tensorboard writer.add_scalar('train/cum_reward_vs_steps', total_reward, total_steps) writer.add_scalar('train/cum_reward_vs_updates', total_reward, j + 1) if config.agent in ['a2c', 'acktr', 'ppo']: writer.add_scalar('debug/value_loss_vs_steps', value_loss, total_steps) writer.add_scalar('debug/value_loss_vs_updates', value_loss, j + 1) writer.add_scalar('debug/action_loss_vs_steps', action_loss, total_steps) writer.add_scalar('debug/action_loss_vs_updates', action_loss, j + 1) writer.add_scalar('debug/dist_entropy_vs_steps', dist_entropy, total_steps) writer.add_scalar('debug/dist_entropy_vs_updates', dist_entropy, j + 1) # Sample the last reward writer.add_scalar('debug/sampled_normalized_reward_vs_steps', reward.mean(), total_steps) writer.add_scalar('debug/sampled_normalized_reward_vs_updates', reward.mean(), j + 1) writer.add_scalar('debug/sampled_carla_reward_vs_steps', carla_rewards.mean(), total_steps) writer.add_scalar('debug/sampled_carla_reward_vs_updates', carla_rewards.mean(), j + 1) if (args.eval_interval is not None and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.starting_port, obs_converter, args.x + config.num_processes, config.num_processes, config.gamma, eval_log_dir, config.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros(config.num_processes, 20, device=device) eval_masks = torch.zeros(config.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = agent.act( obs, 
eval_recurrent_hidden_states, eval_masks, deterministic=True) # Observe reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() logger.info( " Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards)))
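The CARLA training loop above also keeps a per-process running `episode_reward`, reports the mean return of whichever episodes just finished, and resets only those slots. A minimal sketch of that bookkeeping with made-up rewards and done flags:

import numpy as np
import torch

num_processes = 4
episode_reward = torch.zeros(num_processes)

# One environment step's rewards and done flags (illustrative values).
step_rewards = torch.tensor([0.1, 0.5, -0.2, 1.0])
done = np.array([False, True, False, True])

episode_reward += step_rewards
finished = torch.tensor(done.astype(np.uint8)).bool()
if finished.any():
    # Report the mean return of the episodes that just ended...
    print('finished episodes, mean reward:', episode_reward[finished].mean().item())
    # ...then reset only those slots; ongoing episodes keep accumulating.
    episode_reward[finished] = 0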
def main(): writer = SummaryWriter() torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") best_score = 0 envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False, 4, args.carl_wrapper) actor_critic = Policy(envs.observation_space.shape, envs.action_space, args.activation, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) assert (args.algo == 'a2c') if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) beta_device = (torch.ones(args.num_processes, 1)).to(device) masks_device = torch.ones(args.num_processes, 1).to(device) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() obs = obs / 255 rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) g_step = 0 for j in range(num_updates): for step in range(args.num_steps): # sample actions g_step += 1 eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp( -1. * g_step / EPS_DECAY) with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states, ori_dist_entropy = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step], deterministic=True) ori_dist_entropy = ori_dist_entropy.cpu().unsqueeze(1) # select action based on epsilon greedy rand_val = torch.rand(action.shape).to(device) eps_mask = (rand_val >= eps_threshold).type(torch.int64) rand_action = torch.LongTensor([ envs.action_space.sample() for i in range(args.num_processes) ]).unsqueeze(1).to(device) action = eps_mask * action + (1 - eps_mask) * rand_action obs, reward, done, infos = envs.step(action) obs = obs / 255 masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) if args.log_evaluation: writer.add_scalar('analysis/reward', reward[0], g_step) writer.add_scalar('analysis/entropy', ori_dist_entropy[0].item(), g_step) writer.add_scalar('analysis/eps', eps_threshold, g_step) if done[0]: writer.add_scalar('analysis/done', 1, g_step) # save model for idx in range(len(infos)): info = infos[idx] if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) steps_done = g_step * args.num_processes + idx writer.add_scalar('data/reward', info['episode']['r'], steps_done) mean_rewards = np.mean(episode_rewards) writer.add_scalar('data/avg_reward', mean_rewards, steps_done) if mean_rewards > best_score: best_score = mean_rewards save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() torch.save( save_model, os.path.join(save_path, args.env_name + ".pt")) # update storage rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, beta_device) with torch.no_grad(): masks_device.copy_(masks) next_value = actor_critic.get_value(obs, recurrent_hidden_states, masks_device) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() 
writer.export_scalars_to_json("./all_scalars.json") writer.close()
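Action selection in the loop above mixes the policy's greedy action with a random action according to an exponentially decaying epsilon. A minimal sketch of the schedule and the mixing step; the EPS_* constants below are illustrative, not the values used in the actual runs:

import math
import torch

EPS_START, EPS_END, EPS_DECAY = 1.0, 0.05, 10000  # illustrative constants

def eps_threshold(g_step):
    # Exponential decay from EPS_START toward EPS_END over roughly EPS_DECAY steps.
    return EPS_END + (EPS_START - EPS_END) * math.exp(-1.0 * g_step / EPS_DECAY)

num_processes, n_actions = 4, 6
policy_action = torch.randint(0, n_actions, (num_processes, 1))
rand_action = torch.randint(0, n_actions, (num_processes, 1))

eps = eps_threshold(g_step=2500)
# 1 where the policy action is kept, 0 where a random action is taken instead.
keep = (torch.rand(policy_action.shape) >= eps).long()
action = keep * policy_action + (1 - keep) * rand_action
print(eps, action.squeeze(1).tolist())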
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") """ if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None """ envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=100) start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) """ for info in infos: if 'episode' in info.keys(): print(reward) episode_rewards.append(info['episode']['r']) """ # FIXME: works only for environments with sparse rewards for idx, eps_done in enumerate(done): if eps_done: episode_rewards.append(reward[idx]) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": print('Saving model') print() save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n" .format( j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), np.count_nonzero(np.greater(episode_rewards, 0)) / len(episode_rewards))) if args.eval_interval is not None and len( episode_rewards) > 1 and j % args.eval_interval == 0: eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) if eval_envs.venv.__class__.__name__ == "VecNormalize": eval_envs.venv.ob_rms = envs.venv.ob_rms # An ugly hack to remove updates def _obfilt(self, obs): if self.ob_rms: obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) return obs else: return obs eval_envs.venv._obfilt = types.MethodType(_obfilt, envs.venv) eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) """ if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass """ envs.close()
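Several of these evaluation branches copy the training wrapper's running observation statistics (`ob_rms`) into the eval envs and freeze them, so evaluation sees the same normalization without updating it. A minimal sketch of the filtering rule itself, mirroring the `_obfilt` hack above but with no statistics update (mean/variance values are made up):

import numpy as np

class RunningStats:
    """Tiny stand-in for the ob_rms object carried by VecNormalize."""
    def __init__(self, mean, var):
        self.mean = np.asarray(mean, dtype=np.float64)
        self.var = np.asarray(var, dtype=np.float64)

def normalize_obs(obs, ob_rms, clipob=10.0, epsilon=1e-8):
    # Standardize with the training statistics, then clip; nothing is updated.
    return np.clip((obs - ob_rms.mean) / np.sqrt(ob_rms.var + epsilon),
                   -clipob, clipob)

train_ob_rms = RunningStats(mean=[0.5, -1.0], var=[4.0, 0.25])
obs = np.array([[2.0, -0.5], [-3.0, 1.0]])
print(normalize_obs(obs, train_ob_rms))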
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") run_id = "alpha{}".format(args.gcn_alpha) if args.use_logger: from utils import Logger folder = "{}/{}".format(args.folder, run_id) logger = Logger(algo_name=args.algo, environment_name=args.env_name, folder=folder, seed=args.seed) logger.save_args(args) print("---------------------------------------") print('Saving to', logger.save_folder) print("---------------------------------------") else: print("---------------------------------------") print('NOTE : NOT SAVING RESULTS') print("---------------------------------------") all_rewards = [] envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, args.env_name, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, actor_critic.base.output_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) ############################ # GCN Model and optimizer from pygcn.train import update_graph from pygcn.models import GCN, GAT, SAGE assert args.gnn in ['gcn', 'gat', 'sage'] if args.gnn == 'gat': gcn_model = GAT(nfeat=actor_critic.base.output_size, nhid=args.gcn_hidden) elif args.gnn == 'sage': gcn_model = SAGE(nfeat=actor_critic.base.output_size, nhid=args.gcn_hidden) elif args.gnn == 'gcn': gcn_model = GCN(nfeat=actor_critic.base.output_size, nhid=args.gcn_hidden) gcn_model.to(device) gcn_optimizer = optim.Adam(gcn_model.parameters(), lr=args.gcn_lr, weight_decay=args.gcn_weight_decay) gcn_loss = nn.NLLLoss() gcn_states = [[] for _ in range(args.num_processes)] Gs = [nx.Graph() for _ in range(args.num_processes)] node_ptrs = [0 for _ in range(args.num_processes)] rew_states = [[] for _ in range(args.num_processes)] ############################ episode_rewards = deque(maxlen=100) avg_fwdloss = deque(maxlen=100) rew_rms = RunningMeanStd(shape=()) delay_rew = torch.zeros([args.num_processes, 1]) delay_step = torch.zeros([args.num_processes]) start = time.time() for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob,\ recurrent_hidden_states, hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) delay_rew += reward delay_step += 1 for idx, (info, hid, eps_done) in enumerate(zip(infos, hidden_states, done)): if eps_done or delay_step[idx] == args.reward_freq: reward[idx] = delay_rew[idx] delay_rew[idx] = delay_step[idx] = 0 else: reward[idx] = 0 if 
'episode' in info.keys(): episode_rewards.append(info['episode']['r']) if args.gcn_alpha < 1.0: gcn_states[idx].append(hid) node_ptrs[idx] += 1 if not eps_done: Gs[idx].add_edge(node_ptrs[idx] - 1, node_ptrs[idx]) if reward[idx] != 0. or eps_done: rew_states[idx].append( [node_ptrs[idx] - 1, reward[idx]]) if eps_done: adj = nx.adjacency_matrix(Gs[idx]) if len(Gs[idx].nodes)\ else sp.csr_matrix(np.eye(1,dtype='int64')) update_graph(gcn_model, gcn_optimizer, torch.stack(gcn_states[idx]), adj, rew_states[idx], gcn_loss, args, envs) gcn_states[idx] = [] Gs[idx] = nx.Graph() node_ptrs[idx] = 0 rew_states[idx] = [] # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, hidden_states) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau, gcn_model, args.gcn_alpha) agent.update(rollouts) rollouts.after_update() ####################### Saving and book-keeping ####################### if (j % int(num_updates / 5.) == 0 or j == num_updates - 1) and args.save_dir != "": print('Saving model') print() save_dir = "{}/{}/{}".format(args.save_dir, args.folder, run_id) save_path = os.path.join(save_dir, args.algo, 'seed' + str(args.seed)) + '_iter' + str(j) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic save_gcn = gcn_model if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_gcn = copy.deepcopy(gcn_model).cpu() save_model = [ save_gcn, save_model, hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + "ac.pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print("Updates {}, num timesteps {}, FPS {} \n Last {}\ training episodes: mean/median reward {:.2f}/{:.2f},\ min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n".format( j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), np.count_nonzero(np.greater(episode_rewards, 0)) / len(episode_rewards), )) all_rewards.append(np.mean(episode_rewards)) if args.use_logger: logger.save_task_results(all_rewards) ####################### Saving and book-keeping ####################### envs.close()
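The loop above withholds the environment reward and releases the accumulated sum only every `args.reward_freq` steps or at episode end, turning a dense signal into a delayed one before the GCN-based credit assignment sees it. A minimal sketch of that accumulation; the reward_freq value and rewards below are illustrative:

import torch

num_processes, reward_freq = 2, 3
delay_rew = torch.zeros(num_processes, 1)
delay_step = torch.zeros(num_processes)

def delay_rewards(reward, done, delay_rew, delay_step, reward_freq):
    """Release the summed reward every reward_freq steps or when an episode ends."""
    delay_rew += reward
    delay_step += 1
    out = torch.zeros_like(reward)
    for idx, eps_done in enumerate(done):
        if eps_done or delay_step[idx] == reward_freq:
            out[idx] = delay_rew[idx]
            delay_rew[idx] = 0
            delay_step[idx] = 0
    return out

for t in range(4):
    reward = torch.ones(num_processes, 1)  # dense +1 reward each step
    done = [False, t == 1]                 # env 1 finishes at t == 1
    released = delay_rewards(reward, done, delay_rew, delay_step, reward_freq)
    print(t, released.squeeze(1).tolist())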
def main(): torch.manual_seed(args_seed) torch.cuda.manual_seed_all(args_seed) device = torch.device("cuda:0" if args_cuda else "cpu") train_log = Log(log_name+'_train_log') evl_log = Log(log_name+'_evaluation_log') torch.set_num_threads(1) envs = make_vec_envs( args_env_name, args_seed, args_num_processes, device, gamma=args_gamma) # norm_envs = get_vec_normalize(envs) # norm_envs = envs # norm_envs.eval() # norm_envs.ob_rms = 1 # print(envs.ob_rms) # ss('hi') if is_limit_action: envs.action_space.n = 3 print('Number of Actions:', envs.action_space.n) actor_critic = Policy( envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args_recurrent_policy}) actor_critic.to(device) # print(actor_critic.is_recurrent) # print(actor_critic.gru) # ss('hi') agent = PPO( actor_critic, args_clip_param, args_ppo_epoch, args_num_mini_batch, args_value_loss_coef, args_entropy_coef, lr=args_lr, eps=args_eps, max_grad_norm=args_max_grad_norm, use_clipped_value_loss=args_use_clipped_value_loss) rollouts = RolloutStorage( args_num_steps, args_num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) # print(obs) # ss('i am over it') num_updates = int( args_num_env_steps) // args_num_steps // args_num_processes episode_rewards = deque(maxlen=10) start = time.time() sum_re = torch.zeros(args_num_processes, 1) for j in range(num_updates): if args_use_linear_lr_decay: # decrease learning rate linearly update_linear_schedule( agent.optimizer, j, num_updates, args_lr) for step in range(args_num_steps): with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # ss('dissecting actor critic. 
act') # print(action) # print() # action = action + 1 # print(action) # ss('hoiohasdfhioas') if is_limit_action: obs, reward, done, infos = envs.step(action+1) else: obs, reward, done, infos = envs.step(action) sum_re += reward if any(done): for i in range(len(done)): if done[i]: episode_rewards.append(sum_re[i].item()) # print(done) # print(sum_re[i]) sum_re[i] *= 0 masks = torch.FloatTensor( [[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args_gamma, args_use_gae, args_gae_lambda) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args_log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args_num_processes * args_num_steps end = time.time() logstring = "E {}, N_steps {}, FPS {} mean/median" \ " {:.1f}/{:.1f}, min/max {:.1f}/{:.1f}" \ " Entropy {:.5f},V {:.5f},Action {:.5f}".format( j, total_num_steps, int(total_num_steps / (end - start)), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss) # print(logstring) train_log.log(logstring) # if True: if (args_eval_interval is not None and len(episode_rewards) > 1 and j % args_eval_interval == 0): total_num_steps = (j + 1) * args_num_processes * args_num_steps ob_rms = get_vec_normalize(envs).ob_rms ev_result = evaluate(actor_critic, ob_rms, args_env_name, args_seed, args_num_processes, device, is_limit_action=is_limit_action) ev_log_string = 'steps:'+str(total_num_steps)+'. '+ev_result evl_log.log(ev_log_string)
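With `is_limit_action` set, the policy samples from a reduced 3-action space and the chosen index is shifted by +1 before being sent to the environment, so policy actions 0..2 map onto environment actions 1..3 (presumably skipping a no-op; that interpretation is an assumption). A minimal sketch of the remapping with a stand-in env:

import torch

class StubEnv:
    """Stand-in env with 4 discrete actions; action 0 is assumed to be a no-op."""
    def step(self, action):
        return 'executed env action {}'.format(int(action))

is_limit_action = True
env = StubEnv()

policy_action = torch.tensor([2])  # sampled from the reduced {0, 1, 2} space
if is_limit_action:
    print(env.step(policy_action + 1))  # remap to env actions {1, 2, 3}
else:
    print(env.step(policy_action))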
def main():
    writer = SummaryWriter()
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    best_score = 0

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    if args.reward_mode == 0:
        clip_rewards = True
    else:
        clip_rewards = False

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False, 4, args.carl_wrapper, clip_rewards,
                         args.track_primitive_reward)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          args.activation, args.complex_model,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    # initialize env and rollout storage
    obs = envs.reset()
    obs = obs / 255
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # bookkeeping variables
    episode_rewards = deque(maxlen=10)          # last 10 episode rewards
    g_step = 0                                  # global step
    reward_history = set()                      # reward values seen after rescaling
    primitive_reward_history = set()            # reward values seen before rescaling
    min_abs_reward = float('inf')               # base value for reward rescaling mode 2
    masks_device = torch.ones(args.num_processes, 1).to(device)  # masks on GPU
    reward_count = 0                            # for reward density calculation
    reward_start_step = 0                       # for reward density calculation
    insert_entropy = torch.ones(args.num_processes, 1)  # entropies inserted into rollouts
    avg_entropy = 0
    have_done = 0.0
    num_feature_neurons = args.num_processes * 512

    for j in range(num_updates):
        if j == int((num_updates - 1) * have_done):
            if args.save_intermediate_model:
                save_model = actor_critic
                if args.cuda:
                    save_model = copy.deepcopy(actor_critic).cpu()
                torch.save(save_model,
                           os.path.join(save_path,
                                        args.env_name + str(have_done) + ".pt"))
            print("have done: ", have_done)
            have_done += 0.1

        for step in range(args.num_steps):
            # Sample actions
            g_step += 1
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states, entropy, f_a = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            if args.track_hidden_stats:
                # analyze the stats of f_a (feature-layer activations)
                mean_fa = torch.mean(f_a)
                num_nonzero = f_a.nonzero().size(0)
                mean_pos = mean_fa * num_feature_neurons / num_nonzero
                activation_ratio = f_a / mean_pos
                num_bigger_mean_fa = torch.sum(activation_ratio > 1).item()
                num_bigger_half_fa = torch.sum(activation_ratio > 0.5).item()
                writer.add_scalar('analysis/fa_mean_ratio',
                                  (num_nonzero - num_bigger_mean_fa) / num_nonzero,
                                  g_step)
                writer.add_scalar('analysis/fa_0.5_ratio',
                                  (num_nonzero - num_bigger_half_fa) / num_nonzero,
                                  g_step)
                writer.add_scalar('analysis/fa_active',
                                  num_nonzero / num_feature_neurons, g_step)

                # analyze the stats of the policy entropy
                avg_entropy = 0.999 * avg_entropy + 0.001 * torch.mean(entropy).item()
                num_all = len(entropy.view(-1))
                entropy_ratio = entropy / avg_entropy
                num_larger_mean = sum(entropy_ratio > 1).item()
                num_larger_onehalf = sum(entropy_ratio > 1.5).item()
                num_larger_double = sum(entropy_ratio > 2).item()
                writer.add_scalar('analysis/entropy_mean_ratio',
                                  num_larger_mean / num_all, g_step)
                writer.add_scalar('analysis/entropy_1.5_ratio',
                                  num_larger_onehalf / num_all, g_step)
                writer.add_scalar('analysis/entropy_2_ratio',
                                  num_larger_double / num_all, g_step)

            # update entropy inserted into rollout when appropriate
            if args.modulation and j > args.start_modulate * num_updates:
                insert_entropy = entropy.unsqueeze(1)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            obs = obs / 255

            # reward rescaling
            if args.reward_mode == 1:
                reward = reward * args.reward_scale
            elif args.reward_mode == 2:
                if j < args.change_base_reward * num_updates:
                    non_zeros = abs(reward[reward != 0])
                    if len(non_zeros) > 0:
                        min_abs_reward_step = torch.min(non_zeros).item()
                        if min_abs_reward > min_abs_reward_step:
                            min_abs_reward = min_abs_reward_step
                            print('new min abs reward: ', min_abs_reward,
                                  ' time: ', g_step)
                if min_abs_reward != float('inf'):
                    reward = reward / min_abs_reward

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            if args.log_evaluation:
                writer.add_scalar('analysis/entropy', entropy.mean().item(), g_step)

            if args.track_reward_density:
                # track reward density, based on the 0th process
                reward_count += (reward[0] != 0)
                if 'episode' in infos[0].keys():
                    writer.add_scalar('analysis/reward_density',
                                      reward_count / (g_step - reward_start_step),
                                      g_step)
                    reward_count = 0
                    reward_start_step = g_step

            if args.track_primitive_reward:
                # track primitive rewards (before rescaling)
                for info in infos:
                    if 'new_reward' in info:
                        new_rewards = info['new_reward'] - primitive_reward_history
                        if len(new_rewards) > 0:
                            print('new primitive rewards: ', new_rewards,
                                  ' time: ', g_step)
                            primitive_reward_history = primitive_reward_history.union(
                                info['new_reward'])

            if args.track_scaled_reward:
                # track rewards after rescaling
                for r in reward:
                    r = r.item()
                    if r not in reward_history:
                        print('new step rewards: ', r, g_step)
                        reward_history.add(r)

            for idx in range(len(infos)):
                info = infos[idx]
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    steps_done = g_step * args.num_processes + idx
                    writer.add_scalar('data/reward', info['episode']['r'], steps_done)
                    mean_rewards = np.mean(episode_rewards)
                    writer.add_scalar('data/avg_reward', mean_rewards, steps_done)
                    if mean_rewards > best_score:
                        best_score = mean_rewards
                        save_model = actor_critic
                        if args.cuda:
                            save_model = copy.deepcopy(actor_critic).cpu()
                        torch.save(save_model,
                                   os.path.join(save_path, args.env_name + ".pt"))

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, insert_entropy)

        with torch.no_grad():
            masks_device.copy_(masks)
            next_value = actor_critic.get_value(obs, recurrent_hidden_states,
                                                masks_device)

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        value_loss, action_loss, dist_entropy, value = agent.update(rollouts,
                                                                    args.modulation)

        if args.track_value_loss:
            writer.add_scalar('analysis/value_loss', value_loss, j)
            writer.add_scalar('analysis/value', value, j)
            writer.add_scalar('analysis/loss_ratio', value_loss / value, j)

        if args.modulation and args.track_lr and args.log_evaluation:
            writer.add_scalar('analysis/min_lr', torch.min(rollouts.lr).item(), j)
            writer.add_scalar('analysis/max_lr', torch.max(rollouts.lr).item(), j)
            writer.add_scalar('analysis/std_lr', torch.std(rollouts.lr).item(), j)
            writer.add_scalar('analysis/avg_lr', torch.mean(rollouts.lr).item(), j)

        rollouts.after_update()

    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
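The mode-2 rescaling in the loop above divides every reward by the smallest non-zero reward magnitude observed so far, so the most common reward magnitude maps to roughly ±1. A standalone sketch of the same idea follows; the class name and usage are illustrative, and the cutoff controlled by args.change_base_reward is omitted.

# Illustrative sketch of mode-2 reward rescaling: keep a running minimum of the
# non-zero reward magnitudes and divide rewards by it once one has been seen.
import torch

class MinAbsRewardScaler:
    def __init__(self):
        self.min_abs = float('inf')

    def __call__(self, reward: torch.Tensor) -> torch.Tensor:
        non_zeros = reward[reward != 0].abs()
        if non_zeros.numel() > 0:
            self.min_abs = min(self.min_abs, non_zeros.min().item())
        if self.min_abs != float('inf'):
            return reward / self.min_abs
        return reward

scaler = MinAbsRewardScaler()
print(scaler(torch.tensor([[0.0], [4.0], [-8.0]])))  # divides by 4 -> [0, 1, -2]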
rollouts = RolloutStorage(num_steps=OUTER_BATCHSIZE,
                          num_processes=NUM_PROCESS,
                          obs_shape=envs.observation_space.shape,
                          action_space=envs.action_space,
                          recurrent_hidden_state_size=1)
inner_rollouts = RolloutStorage(num_steps=INNER_BATCHSIZE,
                                num_processes=NUM_PROCESS,
                                obs_shape=envs.observation_space.shape,
                                action_space=envs.action_space,
                                recurrent_hidden_state_size=1)

obs = envs.reset()
rollouts.obs[0].copy_(obs)
rollouts.to(device)
inner_rollouts.obs[0].copy_(obs)
inner_rollouts.to(device)

episode_rewards = deque(maxlen=10)
total_num_steps = 0


def select_action(obs):
    with torch.no_grad():
        action_mean, log_std = actor(obs)
        action = torch.normal(action_mean, torch.exp(log_std))
        # log-probability of the sampled action under a diagonal Gaussian
        var = torch.exp(log_std) ** 2
        action_log_probs = -((action - action_mean) ** 2) / (2 * var) \
            - log_std - math.log(math.sqrt(2 * math.pi))
        action_log_probs = action_log_probs.sum(1, keepdim=True)
    return action, action_log_probs  # assumed return; the original listing is cut off here
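The hand-written log-probability in select_action is the diagonal-Gaussian log density; torch.distributions.Normal computes the same quantity and can serve as a numerical cross-check. The snippet below is a sketch and is not part of the original listing.

# Cross-check: the manual log-prob in select_action matches
# torch.distributions.Normal for a diagonal Gaussian policy.
import math
import torch

action_mean = torch.zeros(2, 3)
log_std = torch.full((2, 3), -0.5)
action = torch.normal(action_mean, torch.exp(log_std))

var = torch.exp(log_std) ** 2
manual = (-((action - action_mean) ** 2) / (2 * var)
          - log_std - math.log(math.sqrt(2 * math.pi))).sum(1, keepdim=True)

reference = torch.distributions.Normal(
    action_mean, torch.exp(log_std)).log_prob(action).sum(1, keepdim=True)

print(torch.allclose(manual, reference))  # True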
class Runner():
    def __init__(self, **args):
        cuda = not args['no_cuda'] and torch.cuda.is_available()
        self.device = torch.device("cuda:0" if cuda else "cpu")
        print("Model running on device: {}".format(self.device))
        torch.set_num_threads(1)

        self.env_name = args['env_name']
        self.epochs = args['epochs']
        self.num_processes = args['num_processes']
        self.num_steps = args['num_steps']
        self.num_test_episodes = args['num_test_episodes']
        self.test_every_n_epochs = args['test_every_n_epochs']
        self.use_deterministic_policy_while_testing = args['use_deterministic_policy_while_testing']
        self.grayscale = args['grayscale']
        self.skip_frame = args['skip_frame']
        self.num_frame_stack = args['num_frame_stack']
        self.num_updates_per_epoch = args['num_updates_per_epoch']
        self.use_gae = args['use_gae']
        self.gamma = args['gamma']
        self.tau = args['tau']
        self.reward_scaling = args['reward_scaling']
        self.seed = args['seed']
        self.log_dir = args['log_dir']
        self.save_dir = args['save_dir']

        try:
            os.makedirs(args['log_dir'])
            files = glob.glob(os.path.join(args['log_dir'], '*.manifest.json'))
            for f in files:
                os.remove(f)
        except OSError:
            files = glob.glob(os.path.join(args['log_dir'], '*.monitor.csv'))
            for f in files:
                os.remove(f)

        self.eval_log_dir = args['log_dir'] + "_eval"
        try:
            os.makedirs(self.eval_log_dir)
        except OSError:
            files = glob.glob(os.path.join(self.eval_log_dir, '*.monitor.csv'))
            for f in files:
                os.remove(f)

        self.envs = make_vec_envs(self.env_name, self.seed, self.num_processes,
                                  self.gamma, self.log_dir, self.device, False,
                                  self.grayscale, self.skip_frame,
                                  self.reward_scaling,
                                  num_frame_stack=self.num_frame_stack)

        self.algorithm = args['algorithm']
        # Decreasing LR scheduler (only used by PPO with decreasing_lr)
        self.scheduler = None
        if self.algorithm == 'A2C':
            actor_critic = ActorCriticNetwork(self.envs.observation_space.shape,
                                              self.envs.action_space,
                                              base_kwargs=args['policy_parameters'])
            actor_critic.to(self.device)
            self.policy = actor_critic
            self.agent = A2C(actor_critic, **args['algorithm_parameters'])
        elif self.algorithm == 'PPO':
            if args['decreasing_lr']:
                def lambdalr(epoch):
                    return ((float(self.epochs - epoch)) / float(self.epochs)
                            * args['algorithm_parameters']['lr'])

                actor_critic = ActorCriticNetwork(self.envs.observation_space.shape,
                                                  self.envs.action_space,
                                                  base_kwargs=args['policy_parameters'])
                actor_critic.to(self.device)
                self.policy = actor_critic
                self.agent = PPO(actor_critic, lambdalr,
                                 **args['algorithm_parameters'])
                self.scheduler = self.agent.scheduler
            else:
                actor_critic = ActorCriticNetwork(self.envs.observation_space.shape,
                                                  self.envs.action_space,
                                                  base_kwargs=args['policy_parameters'])
                actor_critic.to(self.device)
                self.policy = actor_critic
                self.agent = PPO(actor_critic, None,
                                 **args['algorithm_parameters'])

        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.envs.observation_space.shape,
                                       self.envs.action_space,
                                       actor_critic.recurrent_hidden_state_size)
        obs = self.envs.reset()
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)

        self.episode_rewards = deque(maxlen=50)
        self.writer = SummaryWriter(
            comment="{}-{}".format(self.env_name, self.algorithm))

    def run(self):
        start = time.time()
        for epoch in range(self.epochs):
            value_losses, action_losses, dist_entropies = [], [], []
            print("\nEpoch %d\n-------" % (epoch + 1))
            for j in trange(self.num_updates_per_epoch, leave=False):
                for step in range(self.num_steps):
                    # Sample actions
                    with torch.no_grad():
                        value, action, action_log_prob, recurrent_hidden_states = self.policy.act(
                            self.rollouts.obs[step],
                            self.rollouts.recurrent_hidden_states[step],
                            self.rollouts.masks[step])

                    # Observe reward and next obs
                    obs, reward, done, infos = self.envs.step(action)
                    for info in infos:
                        if 'episode' in info.keys():
                            print("New episode")
                            self.episode_rewards.append(info['episode']['r'])

                    # If done then clean the history of observations.
                    masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                               for done_ in done])
                    self.rollouts.insert(obs, recurrent_hidden_states, action,
                                         action_log_prob, value, reward, masks)

                with torch.no_grad():
                    next_value = self.policy.get_value(
                        self.rollouts.obs[-1],
                        self.rollouts.recurrent_hidden_states[-1],
                        self.rollouts.masks[-1]).detach()

                self.rollouts.compute_returns(next_value, self.use_gae,
                                              self.gamma, self.tau)
                value_loss, action_loss, dist_entropy = self.agent.update(
                    self.rollouts)
                value_losses.append(value_loss)
                action_losses.append(action_loss)
                dist_entropies.append(dist_entropy)
                self.rollouts.after_update()

            total_num_steps = (epoch + 1) * (j + 1) * \
                self.num_processes * self.num_steps
            end = time.time()
            print("Total timesteps: {}, FPS: {}".format(
                total_num_steps, int(total_num_steps / (end - start))))
            print("Statistics of the last %d episodes played" %
                  len(self.episode_rewards))
            if len(self.episode_rewards) < 1:
                self.episode_rewards.append(0)
            episode_rewards_np = np.array(self.episode_rewards)
            value_losses = np.array(value_losses)
            action_losses = np.array(action_losses)
            dist_entropies = np.array(dist_entropies)
            print("Mean value loss: {}, Mean action loss: {}, Mean entropy: {}".format(
                value_losses.mean(), action_losses.mean(), dist_entropies.mean()))
            print(episode_rewards_np)
            print("Results: mean: {} +/- {}".format(
                np.mean(episode_rewards_np), np.std(episode_rewards_np)))
            print("Min: {}, Max: {}, Median: {}".format(
                np.min(episode_rewards_np), np.max(episode_rewards_np),
                np.median(episode_rewards_np)))

            self.writer.add_scalar('value_loss/mean', value_losses.mean(), epoch)
            self.writer.add_scalar('action_loss/mean', action_losses.mean(), epoch)
            self.writer.add_scalar('dist_entropy/mean', dist_entropies.mean(), epoch)
            self.writer.add_scalar('reward/mean', episode_rewards_np.mean(), epoch)
            self.writer.add_scalar('reward/max', episode_rewards_np.max(), epoch)
            self.writer.add_scalar('reward/min', episode_rewards_np.min(), epoch)

            if (epoch + 1) % self.test_every_n_epochs == 0:
                print("\nTesting...")
                bar = tqdm(total=self.num_test_episodes, leave=False)
                eval_envs = make_vec_envs(self.env_name,
                                          self.seed + self.num_processes,
                                          self.num_processes, self.gamma,
                                          self.eval_log_dir, self.device, True,
                                          self.grayscale, self.skip_frame,
                                          self.reward_scaling,
                                          num_frame_stack=self.num_frame_stack)

                vec_norm = get_vec_normalize(eval_envs)
                if vec_norm is not None:
                    vec_norm.eval()
                    vec_norm.ob_rms = get_vec_normalize(self.envs).ob_rms

                eval_episode_rewards = []
                obs = eval_envs.reset()
                eval_recurrent_hidden_states = torch.zeros(
                    self.num_processes, self.policy.recurrent_hidden_state_size,
                    device=self.device)
                eval_masks = torch.zeros(self.num_processes, 1, device=self.device)

                while len(eval_episode_rewards) < self.num_test_episodes:
                    with torch.no_grad():
                        _, action, _, eval_recurrent_hidden_states = self.policy.act(
                            obs, eval_recurrent_hidden_states, eval_masks,
                            deterministic=self.use_deterministic_policy_while_testing)

                    # Observe reward and next obs
                    obs, reward, done, infos = eval_envs.step(action)
                    eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                    for done_ in done]).to(self.device)
                    for info in infos:
                        if 'episode' in info.keys():
                            bar.update(1)
                            eval_episode_rewards.append(info['episode']['r'])

                eval_envs.close()
                bar.close()
                print(eval_episode_rewards)
                print(" Evaluation using {} episodes: mean reward {:.5f}, min/max {}/{}\n".format(
                    len(eval_episode_rewards), np.mean(eval_episode_rewards),
                    np.min(eval_episode_rewards), np.max(eval_episode_rewards)))

            print("Total elapsed time: %.2f minutes" %
                  ((time.time() - start) / 60.0))

            if self.scheduler is not None:
                print("Decreasing the learning rate...")
                self.scheduler.step()

            print("Saving the model...")
            save_path = os.path.join(self.save_dir, self.algorithm)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            save_model = self.policy
            # self.device is a torch.device, so compare its type rather than a string
            if self.device.type == "cuda":
                save_model = copy.deepcopy(self.policy).cpu()
            save_model = [save_model,
                          getattr(get_vec_normalize(self.envs), 'ob_rms', None)]
            torch.save(save_model, os.path.join(save_path, self.env_name + ".pt"))
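The checkpoint written above is a two-element list holding the policy and the observation-normalization statistics. The snippet below is a hedged sketch of reloading it for evaluation; the path is hypothetical, and eval_envs and get_vec_normalize are assumed to come from the same helpers used in this listing.

# Sketch: reload the [policy, ob_rms] pair saved by Runner and push the
# observation statistics into an already-constructed evaluation env.
import torch

policy, ob_rms = torch.load("trained_models/PPO/PongNoFrameskip-v4.pt",
                            map_location="cpu")  # hypothetical path
policy.eval()

vec_norm = get_vec_normalize(eval_envs)  # eval_envs assumed to exist already
if vec_norm is not None and ob_rms is not None:
    vec_norm.eval()
    vec_norm.ob_rms = ob_rms

Note that torch.save was given the full policy object, so the ActorCriticNetwork class definition must be importable when the checkpoint is loaded.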