def test_scenario_1_s(self):
    # Smaller version of the corridor collisions scenario above
    # to facilitate DRL training
    scenario_1_mdp = OvercookedGridworld.from_layout_name(
        'scenario1_s', start_order_list=['any'], cook_time=5)
    mlp = MediumLevelPlanner.from_pickle_or_compute(
        scenario_1_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute)
    a0 = GreedyHumanModel(mlp)
    a1 = CoupledPlanningAgent(mlp)
    agent_pair = AgentPair(a0, a1)
    start_state = OvercookedState(
        [P((2, 1), s, Obj('onion', (2, 1))),
         P((4, 2), s)],
        {}, order_list=['onion'])
    env = OvercookedEnv.from_mdp(scenario_1_mdp, start_state_fn=lambda: start_state)
    trajectory, time_taken_hr, _, _ = env.run_agents(
        agent_pair, include_final_state=True, display=DISPLAY)
    env.reset()

    print("\n" * 5)
    print("-" * 50)

    a0 = CoupledPlanningAgent(mlp)
    a1 = CoupledPlanningAgent(mlp)
    agent_pair = AgentPair(a0, a1)
    trajectory, time_taken_rr, _, _ = env.run_agents(
        agent_pair, include_final_state=True, display=DISPLAY)

    print("H+R time taken: ", time_taken_hr)
    print("R+R time taken: ", time_taken_rr)
    self.assertGreater(time_taken_hr, time_taken_rr)
def get_bc_agent_from_model(model, bc_params, no_waits=False):
    mdp = OvercookedGridworld.from_layout_name(**bc_params["mdp_params"])
    mlp = MediumLevelPlanner.from_pickle_or_compute(mdp, NO_COUNTERS_PARAMS, force_compute=False)

    def encoded_state_policy(observations, include_waits=True, stochastic=False):
        action_probs_n = model.action_probability(observations)

        if not include_waits:
            # Drop the STAY action and renormalize the remaining probabilities
            action_probs_n = ImitationAgentFromPolicy.remove_indices_and_renormalize(
                action_probs_n, [Action.ACTION_TO_INDEX[Direction.STAY]])

        if stochastic:
            return [np.random.choice(len(action_probs_n[i]), p=action_probs_n[i])
                    for i in range(len(action_probs_n))]
        return action_probs_n

    def state_policy(mdp_states, agent_indices, include_waits, stochastic=False):
        # encode_fn = lambda s: mdp.preprocess_observation(s)
        encode_fn = lambda s: mdp.featurize_state(s, mlp)

        obs = []
        for agent_idx, s in zip(agent_indices, mdp_states):
            ob = encode_fn(s)[agent_idx]
            obs.append(ob)
        obs = np.array(obs)
        action_probs = encoded_state_policy(obs, include_waits, stochastic)
        return action_probs

    return ImitationAgentFromPolicy(state_policy, encoded_state_policy, no_waits=no_waits, mlp=mlp)
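# Usage sketch (not from the original source): wrap a trained behavioral-cloning
# model into Overcooked agents and roll them out against each other. Assumes
# `model` is any object exposing a stable-baselines-style `action_probability`,
# and that `bc_params` carries the "mdp_params" / "env_params" keys used above.
#
#   a0 = get_bc_agent_from_model(model, bc_params)
#   a1 = get_bc_agent_from_model(model, bc_params)
#   mdp = OvercookedGridworld.from_layout_name(**bc_params["mdp_params"])
#   env = OvercookedEnv(mdp, **bc_params["env_params"])
#   env.run_agents(AgentPair(a0, a1), display=True)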
def test_scenario_1(self):
    # Myopic corridor collision
    #
    # X X X X X O X D X X X X X
    # X   ↓Ho               X X
    # X X X X X X X X     ↓R  X
    # X                       X
    # X S X X X X X X X X P P X
    #
    # H on the left with an onion, farther from the tunnel entrance than R.
    # Optimal planner tells R to go first and that H will wait
    # for R to pass. H however, starts going through the tunnel
    # and they get stuck. The H plan is a bit extreme (it would probably
    # realize that it should retrace its steps at some point)
    scenario_1_mdp = OvercookedGridworld.from_layout_name(
        'small_corridor', start_order_list=['any'], cook_time=5)
    mlp = MediumLevelPlanner.from_pickle_or_compute(
        scenario_1_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute)
    a0 = GreedyHumanModel(mlp)
    a1 = CoupledPlanningAgent(mlp)
    agent_pair = AgentPair(a0, a1)
    start_state = OvercookedState(
        [P((2, 1), s, Obj('onion', (2, 1))),
         P((10, 2), s)],
        {}, order_list=['onion'])
    env = OvercookedEnv.from_mdp(scenario_1_mdp, start_state_fn=lambda: start_state)
    env.run_agents(agent_pair, include_final_state=True, display=DISPLAY)
def setUp(self):
    self.base_mdp = OvercookedGridworld.from_layout_name("cramped_room")
    self.mlp = MediumLevelPlanner.from_pickle_or_compute(
        self.base_mdp, NO_COUNTERS_PARAMS, force_compute=True)
    self.env = OvercookedEnv(self.base_mdp, **DEFAULT_ENV_PARAMS)
    self.rnd_agent_pair = AgentPair(GreedyHumanModel(self.mlp), GreedyHumanModel(self.mlp))
    np.random.seed(0)
@property
def mlp(self):
    # Lazily compute (or load from cache) the planner the first time it is accessed
    assert not self.variable_mdp, "Variable mdp is not currently supported for planning"
    if self._mlp is None:
        if self.debug:
            print("Computing Planner")
        self._mlp = MediumLevelPlanner.from_pickle_or_compute(
            self.env.mdp, self.mlp_params, force_compute=self.force_compute)
    return self._mlp
def init_gym_env(bc_params):
    env_setup_params = copy.deepcopy(bc_params)
    del env_setup_params["data_params"]  # Not necessary for setting up env
    mdp = OvercookedGridworld.from_layout_name(**bc_params["mdp_params"])
    env = OvercookedEnv(mdp, **bc_params["env_params"])
    gym_env = gym.make("Overcooked-v0")

    mlp = MediumLevelPlanner.from_pickle_or_compute(mdp, NO_COUNTERS_PARAMS, force_compute=False)
    gym_env.custom_init(env, featurize_fn=lambda x: mdp.featurize_state(x, mlp))
    return gym_env
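# Sketch of the `bc_params` shape this function assumes (hypothetical values):
# "data_params" is deleted above before env setup, while "mdp_params" and
# "env_params" configure the layout and the OvercookedEnv respectively.
#
#   bc_params = {
#       "data_params": {...},  # consumed by training code, not the env
#       "mdp_params": {"layout_name": "cramped_room"},
#       "env_params": {"horizon": 400},
#   }
#   gym_env = init_gym_env(bc_params)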
def joint_state_trajectory_to_single(trajectories, joint_traj_data, traj_metadata,
                                     player_indices_to_convert=None, processed=True):
    """
    Take a joint trajectory and split it into two single-agent trajectories,
    adding data to the `trajectories` dictionary

    player_indices_to_convert: which player indices' trajs we should return
    """
    from overcooked_ai_py.planning.planners import MediumLevelPlanner, NO_COUNTERS_PARAMS

    mdp = traj_metadata['mdp']
    mlp = MediumLevelPlanner.from_pickle_or_compute(
        mdp=mdp,
        mlp_params=NO_COUNTERS_PARAMS,
        force_compute=False
    )

    assert len(joint_traj_data['ep_observations']) == 1, "This method only takes in one trajectory"
    states, joint_actions = joint_traj_data['ep_observations'][0], joint_traj_data['ep_actions'][0]
    rewards, length = joint_traj_data['ep_rewards'][0], joint_traj_data['ep_lengths'][0]

    # Getting trajectory for each agent
    for agent_idx in player_indices_to_convert:
        ep_obs, ep_acts, ep_dones = [], [], []
        for i in range(len(states)):
            state, action = states[i], joint_actions[i][agent_idx]

            if processed:
                # Pre-processing (default is state featurization)
                action = np.array([Action.ACTION_TO_INDEX[action]]).astype(int)

                # NOTE: Could parallelize a bit more if slow
                # state = mdp.preprocess_observation(state)[agent_idx]
                state = mdp.featurize_state(state, mlp)[agent_idx]

            ep_obs.append(state)
            ep_acts.append(action)
            ep_dones.append(False)

        if len(ep_obs) == 0:
            # Skip empty trajectories: ep_dones[-1] below would fail on an empty list
            worker_id, layout_name = traj_metadata['workerid_num'], mdp.layout_name
            print("{} on layout {} had an empty traj?. Excluding from dataset.".format(worker_id, layout_name))
            continue

        ep_dones[-1] = True

        trajectories["ep_observations"].append(ep_obs)
        trajectories["ep_actions"].append(ep_acts)
        trajectories["ep_dones"].append(ep_dones)
        trajectories["ep_rewards"].append(rewards)
        trajectories["ep_returns"].append(sum(rewards))
        trajectories["ep_lengths"].append(length)
        trajectories["ep_agent_idxs"].append(agent_idx)
        trajectories["mdp_params"].append(mdp.mdp_params)
        trajectories["env_params"].append({})
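# The `trajectories` argument is an accumulator: a dict of lists keyed exactly by
# the fields appended above. A minimal empty accumulator looks like this (sketch):
#
#   trajectories = {k: [] for k in (
#       "ep_observations", "ep_actions", "ep_dones", "ep_rewards",
#       "ep_returns", "ep_lengths", "ep_agent_idxs", "mdp_params", "env_params")}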
def ppo_run(params):

    create_dir_if_not_exists(params["SAVE_DIR"])
    save_pickle(params, params["SAVE_DIR"] + "config")

    #############
    # PPO SETUP #
    #############

    train_infos = []

    for seed in params["SEEDS"]:
        reset_tf()
        set_global_seed(seed)

        curr_seed_dir = params["SAVE_DIR"] + "seed" + str(seed) + "/"
        create_dir_if_not_exists(curr_seed_dir)
        save_pickle(params, curr_seed_dir + "config")

        print("Creating env with params", params)
        # Configure mdp
        mdp = OvercookedGridworld.from_layout_name(**params["mdp_params"])
        env = OvercookedEnv(mdp, **params["env_params"])
        mlp = MediumLevelPlanner.from_pickle_or_compute(mdp, NO_COUNTERS_PARAMS, force_compute=True)

        # Configure gym env
        gym_env = get_vectorized_gym_env(
            env, 'Overcooked-v0', featurize_fn=lambda x: mdp.lossless_state_encoding(x), **params
        )
        gym_env.self_play_randomization = 0 if params["SELF_PLAY_HORIZON"] is None else 1
        gym_env.trajectory_sp = params["TRAJECTORY_SELF_PLAY"]
        gym_env.update_reward_shaping_param(1 if params["mdp_params"]["rew_shaping_params"] != 0 else 0)

        configure_other_agent(params, gym_env, mlp, mdp)

        # Create model
        with tf.device('/device:GPU:{}'.format(params["GPU_ID"])):
            model = create_model(gym_env, "ppo_agent", **params)

        # Train model
        params["CURR_SEED"] = seed
        train_info = update_model(gym_env, model, **params)

        # Save model
        save_ppo_model(model, curr_seed_dir + model.agent_name)
        print("Saved training info at", curr_seed_dir + "training_info")
        save_pickle(train_info, curr_seed_dir + "training_info")
        train_infos.append(train_info)

    return train_infos
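# Keys read directly by `ppo_run` (a sketch with hypothetical values, not the full
# config -- `create_model`, `update_model`, and `get_vectorized_gym_env` consume
# further entries from the same dict via **params):
#
#   params = {
#       "SAVE_DIR": "data/ppo_runs/",  # hypothetical path
#       "SEEDS": [0, 10, 20],
#       "GPU_ID": 0,
#       "mdp_params": {"layout_name": "cramped_room", "rew_shaping_params": None},
#       "env_params": {"horizon": 400},
#       "SELF_PLAY_HORIZON": None,
#       "TRAJECTORY_SELF_PLAY": True,
#   }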
def test_two_coupled_agents_coupled_pair(self):
    mlp_simple = MediumLevelPlanner.from_pickle_or_compute(
        simple_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute)
    cp_agent = CoupledPlanningAgent(mlp_simple)
    agent_pair = CoupledPlanningPair(cp_agent)
    start_state = OvercookedState(
        [P((2, 2), n),
         P((2, 1), n)],
        {}, order_list=['any'])
    env = OvercookedEnv(simple_mdp, start_state_fn=lambda: start_state)
    trajectory, time_taken, _, _ = env.run_agents(agent_pair, include_final_state=True, display=DISPLAY)
    end_state = trajectory[-1][0]
    self.assertEqual(end_state.order_list, [])
def test_two_greedy_human_open_map(self):
    scenario_2_mdp = OvercookedGridworld.from_layout_name(
        'scenario2', start_order_list=['any'], cook_time=5)
    mlp = MediumLevelPlanner.from_pickle_or_compute(
        scenario_2_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute)
    a0 = GreedyHumanModel(mlp)
    a1 = GreedyHumanModel(mlp)
    agent_pair = AgentPair(a0, a1)
    start_state = OvercookedState(
        [P((8, 1), s),
         P((1, 1), s)],
        {}, order_list=['onion'])
    env = OvercookedEnv.from_mdp(scenario_2_mdp, start_state_fn=lambda: start_state, horizon=100)
    trajectory, time_taken, _, _ = env.run_agents(agent_pair, include_final_state=True, display=DISPLAY)
    end_state = trajectory[-1][0]
    self.assertEqual(len(end_state.order_list), 0)
def setUp(self):
    self.mlp_large = MediumLevelPlanner.from_pickle_or_compute(
        large_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute_large)
simple_mdp = OvercookedGridworld.from_layout_name('simple_tomato', start_order_list=['any'], cook_time=5)

base_params = {
    'start_orientations': False,
    'wait_allowed': False,
    'counter_goals': simple_mdp.terrain_pos_dict['X'],
    'counter_drop': simple_mdp.terrain_pos_dict['X'][1:2],
    'counter_pickup': [],
    'same_motion_goals': True
}
action_manager_filename = "simple_1_am.pkl"

ml_planner_simple = MediumLevelPlanner.from_pickle_or_compute(
    simple_mdp, mlp_params=base_params,
    custom_filename=action_manager_filename, force_compute=force_compute)
ml_planner_simple.env = OvercookedEnv(simple_mdp)

base_params_start_or = {
    'start_orientations': True,
    'wait_allowed': False,
    'counter_goals': simple_mdp.terrain_pos_dict['X'],
    'counter_drop': [],
    'counter_pickup': [],
    'same_motion_goals': False
}
action_manager_filename = "simple_2_am.pkl"

or_ml_planner_simple = MediumLevelPlanner.from_pickle_or_compute(
    simple_mdp, mlp_params=base_params_start_or,
    custom_filename=action_manager_filename, force_compute=force_compute)