def test_can_stack_frames_singleagent_env():
    num_stack = 3
    frame_stack = partial(FrameStack, num_stack=num_stack)
    pendulum_task = generate_task('Pendulum-v0')
    stack_pendulum_task = generate_task('Pendulum-v0', wrappers=[frame_stack])
    assert stack_pendulum_task.observation_dim == (
        num_stack, *pendulum_task.observation_dim)
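# A minimal illustration of the wrapper's effect, independent of regym's Task
# abstraction (a sketch, assuming a standard gym installation):
# gym.wrappers.FrameStack prepends the stack dimension to the observation
# shape, so Pendulum-v0's (3,)-shaped observation becomes (3, 3) here.
def sketch_frame_stack_observation_shape():
    import gym
    from gym.wrappers import FrameStack
    env = FrameStack(gym.make('Pendulum-v0'), num_stack=3)
    assert env.observation_space.shape == (3, 3)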
def test_can_stack_frames_sequential_multiagent_env():
    num_stack = 4
    frame_stack = partial(FrameStack, num_stack=num_stack)
    connect_4_task = generate_task('Connect4-v0',
                                   EnvType.MULTIAGENT_SEQUENTIAL_ACTION)
    stack_connect_4_task = generate_task('Connect4-v0',
                                         EnvType.MULTIAGENT_SEQUENTIAL_ACTION,
                                         wrappers=[frame_stack])
    assert stack_connect_4_task.observation_dim == (
        num_stack, *connect_4_task.observation_dim)

    num_envs = 3
    vector_env = RegymAsyncVectorEnv(stack_connect_4_task.name,
                                     num_envs=num_envs,
                                     wrappers=[frame_stack])
    actual_obs = vector_env.reset()

    # Standard Connect4 observation dimensions are (3, 7, 6)
    # NOTE: Think of the board as lying sideways (chips fall right-to-left).
    # On reset the first channel is all ones, the other two are all zeros.
    single_env_initial_observation = np.stack(
        [np.ones((7, 6)), np.zeros((7, 6)), np.zeros((7, 6))])
    # We extend by the number of stacked frames,
    # so that the per-environment observation shape is (num_stack, 3, 7, 6)
    stacked_single_env_initial_observation = np.array(
        [single_env_initial_observation for _ in range(num_stack)])
    # We extend by the number of environments,
    # so that each agent receives an observation of shape (num_envs, num_stack, 3, 7, 6)
    expected_player_obs = np.array(
        [stacked_single_env_initial_observation for _ in range(num_envs)])

    num_agents = 2
    for i in range(num_agents):
        np.testing.assert_array_equal(expected_player_obs, actual_obs[i])
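# The nesting of shapes asserted above, reproduced with plain numpy
# (a sketch; num_stack=4, num_envs=3 as in the test):
def sketch_stacked_vectorised_shapes():
    single = np.ones((3, 7, 6))           # one Connect4 observation
    stacked = np.stack([single] * 4)      # (num_stack, 3, 7, 6)
    vectorised = np.stack([stacked] * 3)  # (num_envs, num_stack, 3, 7, 6)
    assert vectorised.shape == (3, 4, 3, 7, 6)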
def generate_evaluation_matrix(cool_game_params, benchmarking_episodes,
                               mcts_budget):
    # 0: SawBot, 1: TorchBot, 2: NailBot
    import gym_cool_game
    saw_vs_torch_task = generate_task('CoolGame-v0',
                                      EnvType.MULTIAGENT_SIMULTANEOUS_ACTION,
                                      botA_type=0, botB_type=1,
                                      **cool_game_params)
    saw_vs_nail_task = generate_task('CoolGame-v0',
                                     EnvType.MULTIAGENT_SIMULTANEOUS_ACTION,
                                     botA_type=0, botB_type=2,
                                     **cool_game_params)
    torch_vs_nail_task = generate_task('CoolGame-v0',
                                       EnvType.MULTIAGENT_SIMULTANEOUS_ACTION,
                                       botA_type=1, botB_type=2,
                                       **cool_game_params)

    mcts_config = {
        'budget': mcts_budget,
        'rollout_budget': 1000,
        'selection_phase': 'ucb1',
        'exploration_factor_ucb1': 4  # Might need to tweak this?
    }
    mcts_agent = build_MCTS_Agent(saw_vs_torch_task, mcts_config,
                                  agent_name='MCTS agent')

    saw_vs_torch = compute_matchup_winrates(mcts_agent, saw_vs_torch_task,
                                            'Saw vs Torch',
                                            benchmarking_episodes, mcts_budget)
    saw_vs_nail = compute_matchup_winrates(mcts_agent, saw_vs_nail_task,
                                           'Saw vs Nail',
                                           benchmarking_episodes, mcts_budget)
    torch_vs_nail = compute_matchup_winrates(mcts_agent, torch_vs_nail_task,
                                             'Torch vs Nail',
                                             benchmarking_episodes, mcts_budget)

    bench_msg = f'episodes={benchmarking_episodes} MCTS_budget={mcts_budget}'
    winrates_msg = f'winrates=saw:[{saw_vs_torch}, {saw_vs_nail}] nail:[{torch_vs_nail}]'
    logger.info(bench_msg)
    logger.info(winrates_msg)
    logger.info(f'params={cool_game_params}')

    wandb.log({'Winrate_Saw_vs_Torch': saw_vs_torch,
               'Winrate_Saw_vs_Nail': saw_vs_nail,
               'Winrate_Torch_vs_Nail': torch_vs_nail})

    # Entry (i, j) is bot i's winrate against bot j (0: Saw, 1: Torch, 2: Nail),
    # so entry (j, i) is the complementary winrate 1 - w.
    return np.array([[0., saw_vs_torch, saw_vs_nail],
                     [1. - saw_vs_torch, 0., torch_vs_nail],
                     [1. - saw_vs_nail, 1. - torch_vs_nail, 0.]])
def generate_evaluation_matrix(cool_game_params, benchmarking_episodes,
                               mcts_budget, logger: logging.Logger):
    # 0: SawBot, 1: TorchBot, 2: NailBot
    import gym_cool_game
    saw_vs_torch_task = generate_task('CoolGame-v0',
                                      EnvType.MULTIAGENT_SIMULTANEOUS_ACTION,
                                      botA_type=0, botB_type=1,
                                      **cool_game_params)
    saw_vs_nail_task = generate_task('CoolGame-v0',
                                     EnvType.MULTIAGENT_SIMULTANEOUS_ACTION,
                                     botA_type=0, botB_type=2,
                                     **cool_game_params)
    torch_vs_nail_task = generate_task('CoolGame-v0',
                                       EnvType.MULTIAGENT_SIMULTANEOUS_ACTION,
                                       botA_type=1, botB_type=2,
                                       **cool_game_params)

    mcts_config = {'budget': mcts_budget, 'rollout_budget': 10}
    mcts_agent = build_MCTS_Agent(saw_vs_torch_task, mcts_config,
                                  agent_name='MCTS agent')

    saw_vs_torch = compute_matchup_winrates(mcts_agent, saw_vs_torch_task,
                                            'Saw vs Torch',
                                            benchmarking_episodes,
                                            mcts_budget, logger)
    saw_vs_nail = compute_matchup_winrates(mcts_agent, saw_vs_nail_task,
                                           'Saw vs Nail',
                                           benchmarking_episodes,
                                           mcts_budget, logger)
    torch_vs_nail = compute_matchup_winrates(mcts_agent, torch_vs_nail_task,
                                             'Torch vs Nail',
                                             benchmarking_episodes,
                                             mcts_budget, logger)

    bench_msg = f'episodes={benchmarking_episodes} MCTS_budget={mcts_budget}'
    winrates_msg = f'winrates=saw:[{saw_vs_torch}, {saw_vs_nail}] nail:[{torch_vs_nail}]'
    logger.info(bench_msg)
    logger.info(winrates_msg)
    logger.info(f'params={cool_game_params}')

    return np.array([[0., saw_vs_torch, saw_vs_nail],
                     [1. - saw_vs_torch, 0., torch_vs_nail],
                     [1. - saw_vs_nail, 1. - torch_vs_nail, 0.]])
def RandomWalkTask():
    from gym.envs.registration import register
    register(
        id='RandomWalk-v0',
        entry_point='regym.tests.rl_algorithms.random_walk_env:RandomWalkEnv')
    return generate_task('RandomWalk-v0',
                         EnvType.MULTIAGENT_SIMULTANEOUS_ACTION)
def main(population: List['Agent'], logger, num_stack: int):
    initial_mcts_config = {
        'budget': 20,
        'rollout_budget': 100,
        'selection_phase': 'ucb1',
        'exploration_factor_ucb1': 1.41,
        'use_dirichlet': False,
        'dirichlet_alpha': None
    }
    task = generate_task('Connect4-v0',
                         EnvType.MULTIAGENT_SEQUENTIAL_ACTION,
                         wrappers=create_wrapper(num_stack=num_stack))
    strength_estimation_df = pd.DataFrame(
        columns=('test_agent_id', 'mcts_budget',
                 'winrate_pos_0', 'winrate_pos_1', 'avg_winrate'))
    for agent in population:
        logger.info(f'Benchmarking agent with {agent.algorithm.num_updates} updates '
                    f'and {agent.finished_episodes} finished episodes')
        agent_strength, agent_specific_strength_estimation_df = estimate_agent_strength(
            agent, task, 0.5, initial_mcts_config, logger)
        strength_estimation_df = strength_estimation_df.append(
            agent_specific_strength_estimation_df, ignore_index=True)
        logger.info(f'Agent strength: {agent_strength} (MCTS budget)')
    strength_estimation_df.to_csv('mcts_equivalent_strength_estimation_df.csv')
def main(path: str, name: str):
    task = generate_task('Connect4-v0', EnvType.MULTIAGENT_SEQUENTIAL_ACTION)
    #sort_fn = lambda x: int(x.split('_')[-1][:-3])  # ExIt
    sort_fn = lambda x: int(x.split('/')[-1].split('_')[0])  # PPO test training
    sorted_population = load_population_from_path(path=path, sort_fn=sort_fn)
    for agent in sorted_population:
        print(agent.algorithm.num_updates)
        agent.requires_environment_model = False
        agent.training = False

    winrate_matrix = compute_winrate_matrix_metagame(
        population=sorted_population, episodes_per_matchup=1000, task=task)
    maxent_nash, nash_averaging = compute_nash_averaging(
        winrate_matrix, perform_logodds_transformation=True)
    winrate_matrix = np.array(winrate_matrix)

    print('Saving winrate_matrix, max-entropy Nash equilibrium for game '
          'defined by winrate matrix and Nash averaging')
    np.savetxt(f'{name}_winrate_matrix.csv', winrate_matrix, delimiter=', ')
    np.savetxt(f'{name}_maxent_nash.csv', maxent_nash, delimiter=', ')
    np.savetxt(f'{name}_nash_averaging.csv', nash_averaging, delimiter=', ')

    ax = plot_winrate_matrix(winrate_matrix)
    plt.show()
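# Sketch of the checkpoint filename formats the two sort_fn lambdas above
# assume (inferred from the parsing logic; the repo's actual filenames may
# differ):
def sketch_sort_fn_filename_formats():
    # ExIt checkpoints: trailing '_<episodes>.pt'
    assert int('agent_1500.pt'.split('_')[-1][:-3]) == 1500
    # PPO test training: basename starting with '<episodes>_'
    assert int('/checkpoints/1500_agent.pt'.split('/')[-1].split('_')[0]) == 1500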
def main():
    task = generate_task('CoolGame-v0',
                         EnvType.MULTIAGENT_SIMULTANEOUS_ACTION,
                         botA_type=1, botB_type=2)
    random_r1 = build_Random_Agent(task, {}, agent_name='random')
    random_r2 = deepcopy(random_r1)
    mcts_config = {
        'budget': 10,
        'rollout_budget': 1000,
        'selection_phase': 'ucb1',
        'exploration_factor_ucb1': 4  # Might need to tweak this?
    }
    mcts_r1 = build_MCTS_Agent(task, mcts_config, agent_name='P1: MCTS')
    mcts_r2 = build_MCTS_Agent(task, mcts_config, agent_name='P2: MCTS')
    human_r1 = HumanAgent(task.action_dim, name='P1')
    human_r2 = HumanAgent(task.action_dim, name='P2')
    # t = task.run_episode([mcts_r1, mcts_r2], training=False,
    #                      render_mode='rgb', save_gif=True)
    t = task.run_episode([mcts_r1, mcts_r2], training=False)
    print(t)
def initialize_experiment(experiment_config, agents_config, self_play_configs):
    env_name, requested_env_type = experiment_config['environment']
    task = generate_task(env_name, EnvType(requested_env_type))
    sp_schemes = initialize_training_schemes(self_play_configs, task)
    agents = initialize_agents(task, agents_config)
    return task, sp_schemes, agents
def main(population: List, name: str, num_stack: int):
    #task = generate_task('Connect4-v0', EnvType.MULTIAGENT_SEQUENTIAL_ACTION)
    task = generate_task('Connect4-v0',
                         EnvType.MULTIAGENT_SEQUENTIAL_ACTION,
                         wrappers=create_wrapper(num_stack=num_stack))

    winrate_matrix = compute_winrate_matrix_metagame(
        population=population,
        episodes_per_matchup=200,
        num_envs=-1,
        task=task,
        is_game_symmetrical=False,
        show_progress=True)
    maxent_nash, nash_averaging = compute_nash_averaging(
        winrate_matrix, perform_logodds_transformation=True)
    winrate_matrix = np.array(winrate_matrix)

    print('Saving winrate_matrix, max-entropy Nash equilibrium for game '
          'defined by winrate matrix and Nash averaging')
    np.savetxt(f'{name}/winrate_matrix.csv', winrate_matrix, delimiter=', ')
    np.savetxt(f'{name}/maxent_nash.csv', maxent_nash, delimiter=', ')
    np.savetxt(f'{name}/nash_averaging.csv', nash_averaging, delimiter=', ')

    ax = plot_winrate_matrix(winrate_matrix)
    plt.show()
def generate_evaluation_matrix(cool_game_params, logger):
    # 0: SawBot, 1: TorchBot, 2: NailBot
    benchmarking_episodes = 1
    mcts_budget = 1
    saw_vs_torch_task = generate_task('CoolGame-v0',
                                      EnvType.MULTIAGENT_SIMULTANEOUS_ACTION,
                                      botA_type=0, botB_type=1,
                                      **cool_game_params)
    saw_vs_nail_task = generate_task('CoolGame-v0',
                                     EnvType.MULTIAGENT_SIMULTANEOUS_ACTION,
                                     botA_type=0, botB_type=2,
                                     **cool_game_params)
    torch_vs_nail_task = generate_task('CoolGame-v0',
                                       EnvType.MULTIAGENT_SIMULTANEOUS_ACTION,
                                       botA_type=1, botB_type=2,
                                       **cool_game_params)

    mcts_config = {'budget': mcts_budget}
    mcts_agent = build_MCTS_Agent(saw_vs_torch_task, mcts_config,
                                  agent_name='MCTS agent')

    saw_winrates = benchmark_agents_on_tasks(
        tasks=[saw_vs_torch_task, saw_vs_nail_task],
        agents=[mcts_agent],
        populate_all_agents=True,
        num_episodes=benchmarking_episodes)
    nail_winrate = benchmark_agents_on_tasks(
        tasks=[torch_vs_nail_task],
        agents=[mcts_agent],
        populate_all_agents=True,
        num_episodes=benchmarking_episodes)

    bench_msg = f'episodes={benchmarking_episodes} MCTS_budget={mcts_budget}'
    winrates_msg = f'winrates=saw:{saw_winrates} nail:{nail_winrate}'
    logger.info(bench_msg)
    logger.info(winrates_msg)
    logger.info(f'params={cool_game_params}')

    # Antisymmetric evaluation matrix: a[j][i] = -a[i][j], with entry (i, j)
    # being bot i's winrate against bot j (0: Saw, 1: Torch, 2: Nail).
    return np.array([[0., saw_winrates[0], saw_winrates[1]],
                     [-saw_winrates[0], 0., nail_winrate[0]],
                     [-saw_winrates[1], -nail_winrate[0], 0.]])
def FixedLengthDummyTask():
    from regym.environments import generate_task, EnvType
    from gym.envs.registration import register
    register(
        id='FixedLengthDummy-v0',
        entry_point='regym.tests.test_utils.fixed_length_dummy_env:FixedLengthDummyEnv')
    return generate_task('FixedLengthDummy-v0',
                         EnvType.MULTIAGENT_SEQUENTIAL_ACTION)
def main(): task = generate_task("Yaniv-v0", EnvType.MULTIAGENT_SEQUENTIAL_ACTION) # random_r1 = build_Random_Agent(task, {}, agent_name="random") ppo = build_PPO_Agent(task, hyperparams, "ppo") traj = task.run_episode( [ppo, ppo], training=True, ) print(traj)
def test_can_parse_connect4_task():
    import gym_connect4
    task = generate_task('Connect4-v0', EnvType.MULTIAGENT_SEQUENTIAL_ACTION)

    expected_observation_dim = (3, 7, 6)
    expected_observation_size = 126  # 3 * 7 * 6
    expected_observation_type = 'Continuous'

    assert expected_observation_dim == task.observation_dim
    assert expected_observation_type == task.observation_type
    assert expected_observation_size == task.observation_size
def test_can_pass_kwargs_to_env():
    from gym.envs.registration import register
    register(
        id='DummyEnv-v0',
        entry_point='regym.tests.environments.params_test_env:ParamsTestEnv')
    params = {'param1': 1, 'param2': 2, 'param3': 3}
    task = generate_task('DummyEnv-v0', **params)
    assert task.env.param1 == 1
    assert task.env.param2 == 2
    assert task.env.param3 == 3
def initialize_experiment(experiment_config, agents_config, self_play_configs):
    env, env_type = experiment_config['environment']
    task = generate_task(env, EnvType(env_type))
    sp_schemes = initialize_training_schemes(self_play_configs, task)
    agents = initialize_agents(task, agents_config)

    seeds = list(map(int, experiment_config['seeds']))
    number_of_runs = experiment_config['number_of_runs']
    if len(seeds) < number_of_runs:
        print('Number of random seeds does not match "number_of_runs" config value. '
              'Generating new seeds')
        seeds = np.random.randint(0, 10000, number_of_runs).tolist()
    return task, sp_schemes, agents, seeds
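# An illustrative experiment_config matching the accesses above (a sketch;
# the EnvType string value and the seed/run values are hypothetical):
#
#   experiment_config = {
#       'environment': ('RockPaperScissors-v0', 'multiagent-simultaneous'),
#       'seeds': ['1', '2', '3'],
#       'number_of_runs': 3,
#   }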
def test_singleagent_tasks_run_faster_on_parallel(env_name):
    task = generate_task(env_name, EnvType.SINGLE_AGENT)
    random_agent = build_Random_Agent(task, {}, 'Test-Random')

    num_episodes = 50
    num_envs = 1
    start = time.time()
    trajectories = task.run_episodes([random_agent],
                                     num_episodes=num_episodes,
                                     num_envs=num_envs, training=False)
    total_single = time.time() - start

    start = time.time()
    num_envs = multiprocessing.cpu_count()
    trajectories = task.run_episodes([random_agent],
                                     num_episodes=num_episodes,
                                     num_envs=num_envs, training=False)
    total_multiple = time.time() - start

    assert total_multiple < total_single
def main():
    task = generate_task('RockPaperScissors-v0',
                         EnvType.MULTIAGENT_SIMULTANEOUS_ACTION)
    print('Initializing agent')
    agent = initialize_agent(task)
    print('Computing SP-induced trajectories')
    training_trajectories = compute_sp_training_trajectories(
        task=task,
        agent=agent,
        sp_scheme=NaiveSelfPlay,
    )
    print('Computing basis trajectories')
    basis_trajectories = compute_basis_trajectories(task)
    print('Merging trajectories')
    all_trajectories = merge_basis_and_trained_trajectories(
        basis_trajectories, training_trajectories)
    print('Number of basis trajectories:', len(basis_trajectories['trajectory']))

    # Compute trajectories from the training agent
    ts = copy.deepcopy(all_trajectories['trajectory'])
    print(f'Number of trajectories: {len(ts)} // Steps per trajectory: {len(ts[0])}')
    actions = [
        # We take the last observation of the first agent,
        # which contains the last joint action by both agents.
        [step.observation[0][-1]
         for idx, step in enumerate(t) if 0 < idx < 10]
        for t in ts
    ]
    embeddings = generate_t_sne_embedding(actions, all_trajectories)
def play_against_fixed_agent(agent, fixed_agent_action, agent_position,
                             max_reward, total_episodes=2000):
    '''
    Test used to make sure that the agent is learning a best response
    against an opponent that always plays the same fixed action in
    Kuhn poker (i.e. from random play, it learns to exploit that action).
    '''
    from play_against_fixed_opponent import learn_against_fix_opponent

    class FixedAgent(Agent):
        def __init__(self, action):
            super(FixedAgent, self).__init__(name=f'FixedAction: {action}')
            self.action = action

        def take_action(self, *args):
            return self.action

        def handle_experience(self, *args):
            pass

        def clone(self, *args):
            pass

    fixed_opponent = FixedAgent(fixed_agent_action)
    kuhn_task = generate_task('KuhnPoker-v0',
                              EnvType.MULTIAGENT_SEQUENTIAL_ACTION)
    assert agent.training
    learn_against_fix_opponent(agent,
                               fixed_opponent=fixed_opponent,
                               agent_position=agent_position,
                               task=kuhn_task,
                               total_episodes=total_episodes,
                               training_percentage=0.9,
                               reward_tolerance=1.,
                               maximum_average_reward=max_reward,
                               evaluation_method='last')
def test_multiagent_sequential_tasks_run_faster_on_parallel(env_name):
    task = generate_task(env_name, EnvType.MULTIAGENT_SEQUENTIAL_ACTION)
    random_agent = build_Random_Agent(task, {}, 'Test-Random')

    start = time.time()
    num_episodes = 100
    num_envs = 1
    _ = task.run_episodes([random_agent, random_agent],
                          num_episodes=num_episodes,
                          num_envs=num_envs, training=False)
    total_single = time.time() - start

    start = time.time()
    num_envs = multiprocessing.cpu_count()
    _ = task.run_episodes([random_agent, random_agent],
                          num_episodes=num_episodes,
                          num_envs=num_envs, training=False)
    total_multiple = time.time() - start

    print('Parallel: ', total_multiple, 'Sequential: ', total_single,
          'Diff: ', total_single - total_multiple)
    assert total_multiple < total_single
def main(path: str):
    initial_mcts_config = {
        'budget': 10,
        'rollout_budget': 100,
        'selection_phase': 'ucb1',
        'exploration_factor_ucb1': 1.41,
        'use_dirichlet': False,
        'dirichlet_alpha': None
    }
    task = generate_task('Connect4-v0', EnvType.MULTIAGENT_SEQUENTIAL_ACTION)
    for agent in load_population(path):
        logger.info(f'Benchmarking agent with {agent.algorithm.num_updates} updates')
        nn_agent = build_NeuralNet_Agent(
            task,
            {'neural_net': agent.algorithm.model,
             'pre_processing_fn': batch_vector_observation},
            agent_name='NeuralNet')
        agent_strength = estimate_agent_strength(nn_agent, task, 0.5,
                                                 initial_mcts_config)
        logger.info(f'Agent strength: {agent_strength} (MCTS budget)')
def play_against_fixed_agent(agent, fixed_agent_action, agent_position,
                             max_reward, total_episodes=2000):
    '''
    Test used to make sure that the agent is learning a best response
    against an opponent that always plays the same fixed action in
    Kuhn poker (i.e. from random play, it learns to exploit that action).
    '''
    kuhn_task = generate_task('KuhnPoker-v0',
                              EnvType.MULTIAGENT_SEQUENTIAL_ACTION)
    fixed_opponent = build_Deterministic_Agent(
        kuhn_task, {'action': fixed_agent_action},
        f'Fixed action: {fixed_agent_action}')
    assert agent.training
    parallel_learn_against_fix_opponent(
        agent,
        fixed_opponent,
        task=kuhn_task,
        agent_position=agent_position,
        training_episodes=total_episodes,
        test_episodes=100,
        reward_tolerance=max_reward * 0.1,  # 10% off maximum
        benchmark_every_n_episodes=3000,  # Has to be larger than total_episodes
        maximum_average_reward=max_reward,
        evaluation_method='last',
    )
def test_multiagent_sequential_tasks_with_model_based_agents_run_faster_on_parallel(
        env_name):
    task = generate_task(env_name, EnvType.MULTIAGENT_SEQUENTIAL_ACTION)
    mcts_config = {
        'budget': 10,
        'rollout_budget': 100,
        'use_dirichlet': False,
        'dirichlet_alpha': 1,
        'selection_phase': 'ucb1',
        'exploration_factor_ucb1': 1
    }
    agent_vector = [build_MCTS_Agent(task, mcts_config, 'Test-MCTS-Random')
                    for _ in range(task.num_agents)]

    start = time.time()
    num_episodes = 10
    num_envs = 1
    _ = task.run_episodes(agent_vector, num_episodes=num_episodes,
                          num_envs=num_envs, training=False)
    total_single = time.time() - start
    print('Sequential: ', total_single)

    start = time.time()
    num_envs = multiprocessing.cpu_count()
    _ = task.run_episodes(agent_vector, num_episodes=num_episodes,
                          num_envs=num_envs, training=False)
    total_multiple = time.time() - start

    print('Parallel: ', total_multiple, 'Sequential: ', total_single,
          'Diff: ', total_single - total_multiple)
    assert total_multiple < total_single
def initialize_experiment(experiment_config, agents_config):
    env_name, requested_env_type = experiment_config['environment']
    task = generate_task(env_name, EnvType(requested_env_type))
    agents = initialize_agents(task, agents_config)
    return task, agents
os.mkdir(args.name)

### To refactor at some point
#sort_fn = lambda x: int(x.split('_')[-1][:-3])  # ExIt
sort_fn = lambda x: int(x.split('/')[-1].split('_')[0])  # PPO test training
sorted_population = load_population_from_path(path=args.path, sort_fn=sort_fn)
sorted_population.sort(key=lambda agent: agent.finished_episodes)
for agent in sorted_population:
    agent.requires_environment_model = False
    agent.training = False
###

# Taken from MCTS equivalent strength benchmarking
mcts_budgets = [29, 42, 42, 38, 45, 56, 48, 49, 51, 42, 53, 46, 35, 49, 49,
                42, 45, 40, 45, 42, 47, 38, 42, 47, 45, 37, 42, 35, 39, 25,
                38, 34, 33, 38, 40]
mcts_population = []
for budget in mcts_budgets:
    initial_mcts_config = {
        'budget': budget,
        'rollout_budget': 100,
        'selection_phase': 'ucb1',
        'exploration_factor_ucb1': 1.41,
        'use_dirichlet': False,
        'dirichlet_alpha': None
    }
    mcts_population.append(
        build_MCTS_Agent(
            generate_task('Connect4-v0', EnvType.MULTIAGENT_SEQUENTIAL_ACTION),
            initial_mcts_config,
            agent_name=f'MCTS:{budget}'))

main(population=sorted_population + mcts_population, name=args.name)
def create_task_from_config(environment_config):
    wrappers = create_wrappers(environment_config)
    task = generate_task(environment_config['name'],
                         EnvType(environment_config['env_type']),
                         wrappers=wrappers)
    return task
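# Illustrative environment_config (a sketch; only 'name' and 'env_type' are
# read directly here, and the 'env_type' string is hypothetical; any
# wrapper-related keys are consumed by create_wrappers, which is not shown):
#
#   environment_config = {
#       'name': 'Connect4-v0',
#       'env_type': 'multiagent-sequential',
#   }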
def pendulum_task():
    return generate_task('Pendulum-v0')
def Connect4Task():
    import gym_connect4
    return generate_task('Connect4-v0', EnvType.MULTIAGENT_SEQUENTIAL_ACTION)
def RPSTask():
    import gym_rock_paper_scissors
    return generate_task('RockPaperScissors-v0',
                         EnvType.MULTIAGENT_SIMULTANEOUS_ACTION)
def run_parallel_task_with_random_agent(env_name, env_type, num_envs,
                                        num_episodes,
                                        model_based_agents=False):
    task = generate_task(env_name, env_type)
    # Random agents, either MCTS-based or purely random
    if model_based_agents:
        mcts_config = {
            'budget': 1,
            'rollout_budget': 0,
            'use_dirichlet': False,
            'dirichlet_alpha': 1,
            'selection_phase': 'ucb1',
            'exploration_factor_ucb1': 1,
            'expose_tree_in_predictions': True
        }
        agent_vector = [build_MCTS_Agent(task, mcts_config, 'Test-MCTS-Random')
                        for _ in range(task.num_agents)]
    else:
        agent_vector = [build_Random_Agent(task, {}, 'Test-Random')
                        for _ in range(task.num_agents)]

    # The number of environments can be larger than the number of episodes
    # because we want to test whether we can generate a specific number of
    # trajectories regardless of the number of environments used to generate them
    trajectories = task.run_episodes(agent_vector,
                                     num_episodes=num_episodes,
                                     num_envs=num_envs,
                                     training=True,
                                     store_extra_information=True)

    # We have at least the number of trajectories we asked for.
    # The count is lower-bounded by :param: num_episodes, but because multiple
    # environments can finish at the same time, there may be up to num_envs
    # extra trajectories.
    assert (len(trajectories) >= num_episodes) \
        and (len(trajectories) <= (num_episodes + num_envs))
    # All trajectories finish with a "done" flag
    assert all([t[-1].done for t in trajectories])
    # No timestep except the last one in each trajectory has "done" set
    for t in trajectories:
        assert all([not timestep.done for timestep in t[:-1]])
    # ASSUMPTION: observation and succ_observation are numpy arrays
    if env_type == EnvType.SINGLE_AGENT:
        # Each timestep's succ_observation matches the next timestep's observation
        assert all([(ex_1.succ_observation == ex_2.observation).all()
                    for t in trajectories for ex_1, ex_2 in zip(t, t[1:])])
    else:
        # Each timestep's succ_observation matches the next timestep's
        # observation, for every agent
        assert all([(ex_1.succ_observation[a_i] == ex_2.observation[a_i]).all()
                    for t in trajectories
                    for ex_1, ex_2 in zip(t, t[1:])
                    for a_i in range(task.num_agents)])
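# Example invocation (a sketch mirroring the other parallel-task tests in
# this suite; the exact pytest parametrisation may differ):
#
#   run_parallel_task_with_random_agent('Connect4-v0',
#                                       EnvType.MULTIAGENT_SEQUENTIAL_ACTION,
#                                       num_envs=multiprocessing.cpu_count(),
#                                       num_episodes=10,
#                                       model_based_agents=False)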