def __init__(self, num_envs=1, log_dir="", suffix=""):
    self.resized_dim = 42
    env = make_envs(num_envs=1, resized_dim=self.resized_dim)
    self.obs_shape = env.observation_space.shape
    self.agent = PPOTrainer(env, ppo_config)
    if log_dir:  # log_dir is empty only in testing
        self.agent.load_w(log_dir, suffix)
    self.num_envs = num_envs
    self.frame_stack = FrameStackTensor(
        self.num_envs, self.obs_shape, 4, self.agent.device)
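# NOTE: This wrapper is invoked as `policy(obs)` by the test harnesses below,
# but its __call__ is not part of this snippet. The following is a
# hypothetical sketch (not the original implementation), assuming the PPO
# agent exposes compute_action() and FrameStackTensor exposes update()/get()
# as used elsewhere in this file.
def __call__(self, obs):
    self.frame_stack.update(obs)  # push the newest observation into the stack
    with torch.no_grad():
        _, actions, _ = self.agent.compute_action(
            self.frame_stack.get(), deterministic=True)
    return actions.view(self.num_envs, -1).cpu().numpy()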
def test():
    # Run this function to make sure your API is runnable.
    # Note: globals() is required here; locals() inside a function would not
    # see the module-level my_policy_* definitions.
    policy_names = [
        function_name for function_name in globals()
        if function_name.startswith("my_policy")
    ]
    assert len(policy_names) == 1, \
        "Expected exactly one policy, found candidates: {}".format(
            policy_names)
    policy_name = policy_names[0]
    policy_creator = globals()[policy_name]

    num_envs = 1
    policy = policy_creator(num_envs)
    env = make_envs("cCarRacing-v0", num_envs=num_envs, asynchronous=False)
    o = env.reset()
    for i in range(1000):
        a = policy(o)
        assert np.asarray(a).shape == (num_envs, 2)
        assert env.action_space.contains(a[0])
        o, _, d, _ = env.step(a)
        if np.asarray(d).any():  # d is an array in vectorized envs
            o = env.reset()
    env.close()

    num_envs = 3
    policy = policy_creator(num_envs)
    env = make_envs("cCarRacing-v0", num_envs=num_envs, asynchronous=False)
    o = env.reset()
    for i in range(1000):
        a = policy(o)
        assert np.asarray(a).shape == (num_envs, 2)
        o, _, d, _ = env.step(a)
        if np.asarray(d).any():
            o = env.reset()
    env.close()
    print("Test passed!")
from os.path import exists, join

import numpy as np


def generate_data(rollouts, data_dir, noise_type):  # pylint: disable=R0914
    """Generates data."""
    assert exists(data_dir), "The data directory does not exist..."
    from competitive_rl import make_envs
    env = make_envs(env_id='cCarRacing-v0',
                    seed=100,
                    log_dir='data/dataset/',
                    num_envs=1,
                    asynchronous=False,
                    resized_dim=96,
                    action_repeat=1)
    seq_len = 1000
    for i in range(rollouts):
        env.reset()
        a_rollout = sample_continuous_policy(env.action_space, seq_len,
                                             1. / 50)
        s_rollout = []
        r_rollout = []
        d_rollout = []
        t = 0
        while True:
            action = a_rollout[t]
            t += 1
            obs, r, done, _ = env.step(action.reshape(1, -1))
            s_rollout += [obs[0]]
            r_rollout += [r]
            d_rollout += [done]
            if done:
                print("> End of rollout {}, {} frames...".format(
                    i, len(s_rollout)))
                np.savez(join(data_dir, 'rollout_{}'.format(i)),
                         observations=np.array(s_rollout),
                         rewards=np.array(r_rollout),
                         actions=np.array(a_rollout),
                         terminals=np.array(d_rollout))
                break
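# sample_continuous_policy is referenced above but not defined in this
# snippet. Below is a minimal sketch, assuming the standard world-models-style
# sampler that draws actions as a discretized Brownian motion with time step
# `dt`; the actual helper may differ.
import math

import numpy as np


def sample_continuous_policy(action_space, seq_len, dt):
    """Sample a sequence of seq_len actions as a Brownian motion."""
    actions = [action_space.sample()]
    for _ in range(seq_len):
        daction_dt = np.random.randn(*actions[-1].shape)
        actions.append(np.clip(actions[-1] + math.sqrt(dt) * daction_dt,
                               action_space.low, action_space.high))
    return actions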
def __init__(self, env_id, num_envs=1, log_dir=None, suffix=None,
             _test=False):
    # self.resized_dim = 42
    env = make_envs(env_id=env_id, num_envs=1)
    self.obs_shape = env.observation_space.shape
    self.agent = PPOTrainer(env, ppo_config)
    if log_dir is not None:  # log_dir is None only in testing
        success = self.agent.load_w(log_dir, suffix)
        if not success and not _test:
            raise ValueError("Failed to load agent!")
    self.num_envs = num_envs
# test()  # Run this function to make sure your API is runnable
policy_names = [
    function_name for function_name in locals()
    if function_name.startswith("my_policy")
]
assert len(policy_names) == 1, \
    "Expected exactly one policy, found candidates: {}".format(policy_names)
policy_name = policy_names[0]
policy_creator = locals()[policy_name]

num_envs = 1
policy = policy_creator(num_envs)
env = make_envs("cCarRacing-v0", num_envs=num_envs, asynchronous=False)
o = env.reset()
for i in range(1000):
    a = [policy(o)]
    o, _, d, _ = env.step(a)
    if np.asarray(d).any():
        o = env.reset()
env.close()

num_envs = 3
policy = policy_creator(num_envs)
env = make_envs("cCarRacing-v0", num_envs=num_envs, asynchronous=False)
o = env.reset()
for i in range(1000):
    a = policy(o)
    o, _, d, _ = env.step(a)
    if np.asarray(d).any():
        o = env.reset()
env.close()
from competitive_rl import make_envs

if __name__ == '__main__':
    envs = make_envs(
        env_id="cCarRacing-v0",
        seed=0,
        log_dir="demo",  # this will create a "demo" directory
        num_envs=5,
        asynchronous=True,
        resized_dim=42)
    obs = envs.reset()
    print(obs.shape)
    envs.close()
    type=int,
    help="The number of episodes to run. Default: 100")
args = parser.parse_args()
num_episodes = args.num_episodes
num_envs = args.num_envs

agents = {
    l: get_compute_action_function(l, num_envs)
    for l in get_builtin_agent_names()
}
agents["MY_AGENT"] = student_compute_action_function(num_envs)
print("All agents ready: ", agents.keys())

envs = make_envs("cPongDouble-v0", num_envs=num_envs, asynchronous=True)
print("Environment ready")

result = launch("MY_AGENT", student_compute_action_function(num_envs),
                agents, envs, num_episodes)
winning_rate_matrix, reward_matrix = build_matrix(result, single_line=True)
print("\n===== Winning Rate Matrix (row vs column) =====")
print(winning_rate_matrix)
print("\n===== Reward Matrix (row vs column) =====")
print(reward_matrix)

with open("data/evaluate_result.md", "w") as f:
    f.write("winning rate matrix:\n\n")
    f.write(tabulate.tabulate(winning_rate_matrix,
    default=3,
    type=int,
    help="Number of episodes to run.")
args = parser.parse_args()

# Collect builtin agents
agent_names = get_builtin_agent_names() + ["MY_AGENT"]
print("Agent names: ", agent_names)
print("Your chosen agents: left - {}, right - {}".format(
    args.left, args.right))
assert args.left in agent_names, agent_names
assert args.right in agent_names, agent_names

# Create env and setup policies
env = make_envs("cPongDouble-v0", num_envs=1, asynchronous=False,
                log_dir="tmp_vis").envs[0]
left = get_compute_action_function(args.left)
right = get_compute_action_function(args.right)

# Evaluate
result = evaluate_two_policies(
    left, right, env=env, render=True,
    num_episode=args.num_episodes,
    render_interval=0.05  # 20 FPS rendering
)
print(result)
def __init__(self, crop=True, grass_penalty=0, action_repeat=1):
    comp_envs = make_envs("cCarRacing-v0", num_envs=1, action_repeat=1)
    self.env = comp_envs.envs[0]
    super().__init__(crop, grass_penalty, action_repeat)
def train(args):
    # Verify algorithm and config
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    elif algo == "A2C":
        config = a2c_config
    else:
        raise ValueError("args.algo must be in [PPO, A2C]")
    config.num_envs = args.num_envs
    assert args.env_id in ["cPong-v0", "CartPole-v0", "cPongTournament-v0"]

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Clean log directory
    log_dir = verify_log_dir(args.log_dir, algo)

    # Create vectorized environments
    num_envs = args.num_envs
    env_id = args.env_id
    envs = make_envs(
        env_id=env_id,
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=True,
        resized_dim=config.resized_dim
    )
    eval_envs = make_envs(
        env_id=env_id,
        seed=seed,
        log_dir=log_dir,
        num_envs=num_envs,
        asynchronous=False,
        resized_dim=config.resized_dim
    )
    test = env_id == "CartPole-v0"
    tournament = env_id == "cPongTournament-v0"
    frame_stack = 4 if not test else 1
    if tournament:
        assert algo == "PPO", "Using PPO in tournament is a good idea, " \
                              "because of its efficiency compared to A2C."

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainer(envs, config, frame_stack, _test=test)
    else:
        trainer = A2CTrainer(envs, config, frame_stack, _test=test)

    # Create a placeholder tensor to help stack frames in the 2nd dimension,
    # that is, turn the observation from shape [num_envs, 1, 84, 84] to
    # [num_envs, 4, 84, 84].
    frame_stack_tensor = FrameStackTensor(
        num_envs, envs.observation_space.shape, frame_stack, config.device)

    # Setup some stats helpers
    episode_rewards = np.zeros([num_envs, 1], dtype=float)
    total_episodes = total_steps = iteration = 0
    reward_recorder = deque(maxlen=100)
    episode_length_recorder = deque(maxlen=100)
    sample_timer = Timer()
    process_timer = Timer()
    update_timer = Timer()
    total_timer = Timer()
    progress = []
    evaluate_stat = {}

    # Start training
    print("Start training!")
    obs = envs.reset()
    frame_stack_tensor.update(obs)
    trainer.rollouts.observations[0].copy_(frame_stack_tensor.get())
    while True:  # Break when total_steps exceeds maximum value
        # ===== Sample Data =====
        with sample_timer:
            for index in range(config.num_steps):
                # [TODO] Get the action
                # Hint:
                #   1. Remember to disable gradient computing
                #   2. trainer.rollouts is a storage containing all data
                #   3. What observation is needed for trainer.compute_action?
                with torch.no_grad():
                    values, actions, action_log_prob = trainer.compute_action(
                        trainer.rollouts.observations[index])
                cpu_actions = actions.view(-1).cpu().numpy()

                # Step the environment
                # (Check step_envs function, you need to implement it)
                obs, reward, done, info, masks, total_episodes, \
                    total_steps, episode_rewards = step_envs(
                        cpu_actions, envs, episode_rewards,
                        frame_stack_tensor, reward_recorder,
                        episode_length_recorder, total_steps,
                        total_episodes, config.device, test)

                rewards = torch.from_numpy(
                    reward.astype(np.float32)).view(-1, 1).to(config.device)

                # Store samples
                trainer.rollouts.insert(
                    frame_stack_tensor.get(), actions.view(-1, 1),
                    action_log_prob, values, rewards, masks)

        # ===== Process Samples =====
        with process_timer:
            with torch.no_grad():
                next_value = trainer.compute_values(
                    trainer.rollouts.observations[-1])
            trainer.rollouts.compute_returns(next_value, config.GAMMA)

        # ===== Update Policy =====
        with update_timer:
            policy_loss, value_loss, dist_entropy, total_loss = \
                trainer.update(trainer.rollouts)
            trainer.rollouts.after_update()

        # ===== Reset opponent if in tournament mode =====
        if tournament and iteration % config.num_steps == 0:
            # Randomly choose one agent in each iteration
            envs.reset_opponent()

        # ===== Evaluate Current Policy =====
        if iteration % config.eval_freq == 0:
            eval_timer = Timer()
            evaluate_rewards, evaluate_lengths = evaluate(
                trainer, eval_envs, frame_stack, 20)
            evaluate_stat = summary(evaluate_rewards, "episode_reward")
            if evaluate_lengths:
                evaluate_stat.update(
                    summary(evaluate_lengths, "episode_length"))
            evaluate_stat.update(dict(
                win_rate=float(
                    sum(np.array(evaluate_rewards) >= 0) /
                    len(evaluate_rewards)),
                evaluate_time=eval_timer.now,
                evaluate_iteration=iteration
            ))

        # ===== Log information =====
        if iteration % config.log_freq == 0:
            stats = dict(
                log_dir=log_dir,
                frame_per_second=int(total_steps / total_timer.now),
                training_episode_reward=summary(
                    reward_recorder, "episode_reward"),
                training_episode_length=summary(
                    episode_length_recorder, "episode_length"),
                evaluate_stats=evaluate_stat,
                learning_stats=dict(
                    policy_loss=policy_loss,
                    entropy=dist_entropy,
                    value_loss=value_loss,
                    total_loss=total_loss
                ),
                total_steps=total_steps,
                total_episodes=total_episodes,
                time_stats=dict(
                    sample_time=sample_timer.avg,
                    process_time=process_timer.avg,
                    update_time=update_timer.avg,
                    total_time=total_timer.now,
                    episode_time=sample_timer.avg + process_timer.avg +
                                 update_timer.avg
                ),
                iteration=iteration
            )
            if tournament:
                stats["opponent"] = envs.current_agent_name
            progress.append(stats)
            pretty_print({
                "===== {} Training Iteration {} =====".format(
                    algo, iteration): stats
            })

        if iteration % config.save_freq == 0:
            trainer_path = trainer.save_w(log_dir, "iter{}".format(iteration))
            progress_path = save_progress(log_dir, progress)
            print("Saved trainer state at <{}>. Saved progress at <{}>.".format(
                trainer_path, progress_path))

        # [TODO] Stop training when total_steps is greater than args.max_steps
        if total_steps > args.max_steps:
            break

        iteration += 1

    trainer.save_w(log_dir, "final")
    envs.close()
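# step_envs is called in the sampling loop above but is left to be
# implemented ("you need to implement it"). Below is a minimal sketch under
# stated assumptions: the vectorized env returns batched numpy arrays, a
# Monitor-style wrapper reports finished episodes via info[i]["episode"], and
# FrameStackTensor.update(obs) is the same call used after envs.reset().
import numpy as np
import torch


def step_envs(cpu_actions, envs, episode_rewards, frame_stack_tensor,
              reward_recorder, episode_length_recorder, total_steps,
              total_episodes, device, test):
    """Step the vectorized envs once and update the bookkeeping (sketch)."""
    obs, reward, done, info = envs.step(cpu_actions)
    episode_rewards += reward.reshape(episode_rewards.shape)
    total_steps += len(done)
    for env_id, d in enumerate(done):
        if d:
            total_episodes += 1
            reward_recorder.append(float(episode_rewards[env_id]))
            # Assumption: the monitor wrapper exposes the episode length here.
            if "episode" in info[env_id]:
                episode_length_recorder.append(info[env_id]["episode"]["l"])
            episode_rewards[env_id] = 0.0
    # Masks are 0 where an episode just ended and 1 elsewhere.
    masks = torch.from_numpy(
        1.0 - done.astype(np.float32)).view(-1, 1).to(device)
    frame_stack_tensor.update(obs)
    return (obs, reward, done, info, masks, total_episodes, total_steps,
            episode_rewards)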
def train(args):
    # Verify algorithm and config
    algo = args.algo
    if algo == "PPO":
        config = ppo_config
    else:
        raise ValueError("args.algo must be in [PPO]")
    config.num_envs = args.num_envs
    config.lr = args.lr
    config.entropy_loss_weight = args.entropy
    assert args.env_id in ["cPong-v0", "cCarRacing-v0"], args.env_id

    # Seed the environments and setup torch
    seed = args.seed
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_num_threads(1)

    # Create vectorized environments
    num_envs = args.num_envs
    env_id = args.env_id if not args.opponent else "cCarRacingDouble-v0"

    # Clean log directory
    log_dir = verify_log_dir(
        args.log_dir, "{}_{}_{}".format(
            env_id, algo, datetime.datetime.now().strftime("%m-%d_%H-%M")))

    if args.opponent:
        assert args.num_eval_envs == 0
        from competitive_rl.car_racing import make_competitive_car_racing
        from load_agents import PolicyAPI
        restore_log_dir = os.path.dirname(args.restore)
        restore_suffix = os.path.basename(
            args.restore).split("checkpoint-")[1].split(".pkl")[0]
        opponent_policy = PolicyAPI("cCarRacing-v0",
                                    num_envs=1,
                                    log_dir=restore_log_dir,
                                    suffix=restore_suffix)
        envs = make_competitive_car_racing(opponent_policy=opponent_policy,
                                           num_envs=num_envs,
                                           asynchronous=not args.test)
    else:
        envs = make_envs(env_id=env_id,
                         seed=seed,
                         log_dir=log_dir,
                         num_envs=num_envs,
                         asynchronous=not args.test,
                         resized_dim=config.resized_dim,
                         action_repeat=args.action_repeat)
    if args.num_eval_envs > 0:
        eval_envs = make_envs(env_id=env_id,
                              seed=seed,
                              log_dir=log_dir,
                              num_envs=args.num_eval_envs,
                              asynchronous=not args.test,
                              resized_dim=config.resized_dim,
                              action_repeat=args.action_repeat)
    else:
        eval_envs = None

    # Setup trainer
    if algo == "PPO":
        trainer = PPOTrainer(envs, config)
    else:
        raise ValueError("Unknown algorithm {}".format(algo))

    if args.restore:
        restore_log_dir = os.path.dirname(args.restore)
        restore_suffix = os.path.basename(
            args.restore).split("checkpoint-")[1].split(".pkl")[0]
        success = trainer.load_w(restore_log_dir, restore_suffix)
        if not success:
            raise ValueError(
                "We can't restore your agent. The log_dir is {} and the "
                "suffix is {}".format(restore_log_dir, restore_suffix))

    # Start training
    print("Start training!")
    obs = envs.reset()
    raw_obs = trainer.process_obs(obs)
    processed_obs = trainer.model.world_model(raw_obs)
    trainer.rollouts.before_update(obs, processed_obs)
    try:
        _train(trainer, envs, eval_envs, config, num_envs, algo, log_dir,
               False, False)
    except KeyboardInterrupt:
        print("The training is stopped by user. The log directory is {}. "
              "Now we finish the training.".format(log_dir))
    trainer.save_w(log_dir, "final")
    envs.close()
def test_base_trainer():
    from competitive_rl import make_envs

    class FakeConfig:
        def __init__(self):
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")
            self.num_envs = 1
            self.num_steps = 200
            self.gamma = 0.99
            self.lr = 5e-4

    class FakeTrainer(BaseTrainer):
        def setup_optimizer(self):
            pass

        def setup_rollouts(self):
            pass

    # ===== Discrete case =====
    env = make_envs("cPong-v0", asynchronous=False, num_envs=3)
    trainer = FakeTrainer(env, FakeConfig())
    obs = env.reset()

    # Input single observation
    values, actions, action_log_probs = trainer.compute_action(
        obs[0], deterministic=True)
    new_values, new_action_log_probs, dist_entropy = trainer.evaluate_actions(
        obs[0], actions)
    assert actions.shape == (1, 1), actions.shape
    assert values.shape == (1, 1), values.shape
    assert action_log_probs.shape == (1, 1), action_log_probs.shape
    assert dist_entropy.shape == ()
    assert (values == new_values).all()
    assert (action_log_probs == new_action_log_probs).all()

    # Input multiple observations
    values, actions, action_log_probs = trainer.compute_action(
        obs, deterministic=False)
    new_values, new_action_log_probs, dist_entropy = trainer.evaluate_actions(
        obs, actions)
    assert actions.shape == (3, 1), actions.shape
    assert values.shape == (3, 1), values.shape
    assert action_log_probs.shape == (3, 1), action_log_probs.shape
    assert dist_entropy.shape == ()
    assert (values == new_values).all()
    assert (action_log_probs == new_action_log_probs).all()
    print("Base trainer discrete case test passed!")
    env.close()

    # ===== Continuous case =====
    env = make_envs("cCarRacing-v0", asynchronous=False, num_envs=3)
    trainer = FakeTrainer(env, FakeConfig())
    obs = env.reset()

    # Input single observation
    values, actions, action_log_probs = trainer.compute_action(
        obs[0], deterministic=True)
    new_values, new_action_log_probs, dist_entropy = trainer.evaluate_actions(
        obs[0], actions)
    assert actions.shape == (1, 2), actions.shape
    assert values.shape == (1, 1), values.shape
    assert action_log_probs.shape == (1, 1), action_log_probs.shape
    assert dist_entropy.shape == ()
    assert (values == new_values).all()
    assert (action_log_probs == new_action_log_probs).all()

    # Input multiple observations
    values, actions, action_log_probs = trainer.compute_action(
        obs, deterministic=False)
    new_values, new_action_log_probs, dist_entropy = trainer.evaluate_actions(
        obs, actions)
    assert actions.shape == (3, 2), actions.shape
    assert values.shape == (3, 1), values.shape
    assert action_log_probs.shape == (3, 1), action_log_probs.shape
    assert dist_entropy.shape == ()
    assert (values == new_values).all()
    assert (action_log_probs == new_action_log_probs).all()
    print("Base trainer continuous case test passed!")
    env.close()
# ===== Load student policies =====
student_function_names = [
    function_name for function_name in dir(my_policy)
    if function_name.startswith("my_policy")
]
assert student_function_names
student_functions = {}
for f in student_function_names:
    student_policy_creator = my_policy.__dict__[f]
    student_id = f.split("my_policy_")[-1]
    student_functions[student_id] = student_policy_creator(num_envs)
print("Collected policies: ", student_functions.keys())

# ===== Setup environment =====
seed = np.random.randint(10000)
envs = make_envs("cCarRacingDouble-v0", num_envs=num_envs,
                 asynchronous=True, seed=seed)
print("Environment ready")

# ===== Run Matches =====
visited_agent = set()
result_list = []
for name, policy in student_functions.items():
    # Remove already-visited agents so each pair of agents only plays once
    opponent_functions = student_functions.copy()
    for opponent in visited_agent:
        opponent_functions.pop(opponent)
    print("Start match between agent {} and opponents {}.".format(
        name, opponent_functions.keys()))
    result = launch(name, policy, opponent_functions, envs, num_episodes)
    result_list.append(result)
    visited_agent.add(name)  # mark this agent as visited for later rounds
from competitive_rl import make_envs

if __name__ == '__main__':
    envs = make_envs(
        env_id="cPong-v0",
        seed=0,
        log_dir="demo",  # this will create a "demo" directory
        num_envs=1,
        asynchronous=False,
        resized_dim=42
    )
    env = envs.envs[0]  # access the underlying single environment if needed
    obs = envs.reset()
    print(obs.shape)
    envs.close()
"-N", default=10, type=int, help="Number of episodes to run.") args = parser.parse_args() agent_names = get_builtin_agent_names() + ["MY_AGENT"] print("Agent names: ", agent_names) print("Your chosen agents: left - {}, right - {}".format( args.left, args.right)) assert args.left in agent_names, agent_names assert args.right in agent_names, agent_names env = make_envs("cPongDouble-v0", num_envs=1, asynchronous=False).envs[0] if args.left != "MY_AGENT": left = get_compute_action_function(args.left) else: left = student_compute_action_function() if args.right != "MY_AGENT": right = get_compute_action_function(args.right) else: right = student_compute_action_function() result = evaluate_two_policies( left, right, env=env, render=False,
def __init__(self):
    # self.env = gym.make('CarRacing-v0')
    comp_envs = make_envs('cCarRacing-v0', num_envs=1, action_repeat=1)
    self.env = comp_envs.envs[0]
    self.env.seed(args.seed)
    self.reward_threshold = 910