示例#1
0
    def test_random_layout(self):
        mdp_gen_params = {"prop_feats": (1, 1)}
        mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(**mdp_gen_params)
        env = OvercookedEnv(mdp=mdp_fn, **DEFAULT_ENV_PARAMS)
        start_terrain = env.mdp.terrain_mtx

        for _ in range(3):
            env.reset()
            print(env)
            curr_terrain = env.mdp.terrain_mtx
            self.assertFalse(np.array_equal(start_terrain, curr_terrain))

        mdp_gen_params = {
            "mdp_choices": ['cramped_room', 'asymmetric_advantages']
        }
        mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(**mdp_gen_params)
        env = OvercookedEnv(mdp=mdp_fn, **DEFAULT_ENV_PARAMS)

        layouts_seen = []
        for _ in range(10):
            layouts_seen.append(env.mdp.terrain_mtx)
            env.reset()
        all_same_layout = all([
            np.array_equal(env.mdp.terrain_mtx, terrain)
            for terrain in layouts_seen
        ])
        self.assertFalse(all_same_layout)
    def test_scenario_1_s(self):
        # Smaller version of the corridor collisions scenario above
        # to facilitate DRL training
        scenario_1_mdp = OvercookedGridworld.from_layout_name(
            'scenario1_s', start_order_list=['any'], cook_time=5)
        mlp = MediumLevelPlanner.from_pickle_or_compute(
            scenario_1_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute)
        a0 = GreedyHumanModel(mlp)
        a1 = CoupledPlanningAgent(mlp)
        agent_pair = AgentPair(a0, a1)
        start_state = OvercookedState(
            [P((2, 1), s, Obj('onion', (2, 1))),
             P((4, 2), s)], {},
            order_list=['onion'])
        env = OvercookedEnv(scenario_1_mdp, start_state_fn=lambda: start_state)
        trajectory, time_taken_hr, _, _ = env.run_agents(
            agent_pair, include_final_state=True, display=DISPLAY)
        env.reset()

        print("\n" * 5)
        print("-" * 50)

        a0 = CoupledPlanningAgent(mlp)
        a1 = CoupledPlanningAgent(mlp)
        agent_pair = AgentPair(a0, a1)
        trajectory, time_taken_rr, _, _ = env.run_agents(
            agent_pair, include_final_state=True, display=DISPLAY)

        print("H+R time taken: ", time_taken_hr)
        print("R+R time taken: ", time_taken_rr)
        self.assertGreater(time_taken_hr, time_taken_rr)
示例#3
0
    def test_random_layout_feature_types(self):
        mandatory_features = {POT, DISH_DISPENSER, SERVING_LOC}
        optional_features = {ONION_DISPENSER, TOMATO_DISPENSER}
        optional_features_combinations = [{ONION_DISPENSER, TOMATO_DISPENSER}, {ONION_DISPENSER}, {TOMATO_DISPENSER}]

        for optional_features_combo in optional_features_combinations:
            left_out_optional_features = optional_features - optional_features_combo
            used_features = list(optional_features_combo | mandatory_features)
            mdp_gen_params = {"prop_feats": 0.9,
                            "feature_types": used_features,
                            "prop_empty": 0.1,
                            "inner_shape": (6, 5),
                            "display": False,
                            "start_all_orders" : [
                                { "ingredients" : ["onion", "onion", "onion"]}
                            ]}
            mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(mdp_gen_params, outer_shape=(6, 5))
            env = OvercookedEnv(mdp_fn, **DEFAULT_ENV_PARAMS)
            for _ in range(10):
                env.reset()
                curr_terrain = env.mdp.terrain_mtx
                terrain_features = set.union(*(set(line) for line in curr_terrain))
                self.assertTrue(all(elem in terrain_features for elem in used_features)) # all used_features are actually used
                if left_out_optional_features:
                    self.assertFalse(any(elem in terrain_features for elem in left_out_optional_features)) # all left_out optional_features are not used
示例#4
0
 def test_starting_obj_randomization(self):
     self.base_mdp = OvercookedGridworld.from_layout_name("cramped_room")
     start_state_fn = self.base_mdp.get_random_start_state_fn(
         random_start_pos=False, rnd_obj_prob_thresh=0.8)
     env = OvercookedEnv(self.base_mdp, start_state_fn)
     start_state = env.state.all_objects_list
     for _ in range(3):
         env.reset()
         print(env)
         curr_terrain = env.state.all_objects_list
         self.assertFalse(np.array_equal(start_state, curr_terrain))
示例#5
0
 def test_starting_position_randomization(self):
     self.base_mdp = OvercookedGridworld.from_layout_name("simple")
     start_state_fn = self.base_mdp.get_random_start_state_fn(
         random_start_pos=True, rnd_obj_prob_thresh=0.0)
     env = OvercookedEnv(self.base_mdp, start_state_fn)
     start_state = env.state.players_pos_and_or
     for _ in range(3):
         env.reset()
         print(env)
         curr_terrain = env.state.players_pos_and_or
         self.assertFalse(np.array_equal(start_state, curr_terrain))
示例#6
0
    def test_random_layout(self):
        mdp_gen_params = {
            "inner_shape": (5, 4),
            "prop_empty": 0.8,
            "prop_feats": 0.2,
            "start_all_orders": [{
                "ingredients": ["onion", "onion", "onion"]
            }],
            "recipe_values": [20],
            "recipe_times": [20],
            "display": False
        }
        mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(mdp_gen_params,
                                                      outer_shape=(5, 4))
        env = OvercookedEnv(mdp_fn, **DEFAULT_ENV_PARAMS)
        start_terrain = env.mdp.terrain_mtx

        for _ in range(3):
            env.reset()
            curr_terrain = env.mdp.terrain_mtx
            self.assertFalse(np.array_equal(start_terrain, curr_terrain))

        mdp_gen_params = {"layout_name": 'cramped_room'}
        mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(mdp_gen_params)
        env = OvercookedEnv(mdp_fn, **DEFAULT_ENV_PARAMS)

        layouts_seen = []
        for _ in range(5):
            layouts_seen.append(env.mdp.terrain_mtx)
            env.reset()

        all_same_layout = all([
            np.array_equal(env.mdp.terrain_mtx, terrain)
            for terrain in layouts_seen
        ])
        self.assertTrue(all_same_layout)

        mdp_gen_params = {"layout_name": 'asymmetric_advantages'}
        mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(mdp_gen_params)
        env = OvercookedEnv(mdp_fn, **DEFAULT_ENV_PARAMS)
        for _ in range(5):
            layouts_seen.append(env.mdp.terrain_mtx)
            env.reset()

        all_same_layout = all([
            np.array_equal(env.mdp.terrain_mtx, terrain)
            for terrain in layouts_seen
        ])
        self.assertFalse(all_same_layout)
示例#7
0
    def test_random_layout_generated_recipes(self):
        only_onions_recipes = [Recipe(["onion", "onion"]), Recipe(["onion", "onion", "onion"])]
        only_onions_dict_recipes = [r.to_dict() for r in only_onions_recipes]

        # checking if recipes are generated from mdp_params
        mdp_gen_params = {"generate_all_orders": {"n":2, "ingredients": ["onion"], "min_size":2, "max_size":3},
                        "prop_feats": 0.9,
                        "prop_empty": 0.1,
                        "inner_shape": (6, 5),
                        "display": False}
        mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(mdp_gen_params, outer_shape=(6, 5))
        env = OvercookedEnv(mdp_fn, **DEFAULT_ENV_PARAMS)
        for _ in range(10):
            env.reset()
            self.assertCountEqual(env.mdp.start_all_orders, only_onions_dict_recipes)
            self.assertEqual(len(env.mdp.start_bonus_orders), 0)
        
        # checking if bonus_orders is subset of all_orders even if not specified

        mdp_gen_params = {"generate_all_orders": {"n":2, "ingredients": ["onion"], "min_size":2, "max_size":3},
                        "generate_bonus_orders": {"n":1, "min_size":2, "max_size":3},
                        "prop_feats": 0.9,
                        "prop_empty": 0.1,
                        "inner_shape": (6, 5),
                        "display": False}
        mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(mdp_gen_params, outer_shape=(6,5))
        env = OvercookedEnv(mdp_fn, **DEFAULT_ENV_PARAMS)
        for _ in range(10):
            env.reset()
            self.assertCountEqual(env.mdp.start_all_orders, only_onions_dict_recipes)
            self.assertEqual(len(env.mdp.start_bonus_orders), 1)
            self.assertTrue(env.mdp.start_bonus_orders[0] in only_onions_dict_recipes)

        # checking if after reset there are new recipes generated
        mdp_gen_params = {"generate_all_orders": {"n":3, "min_size":2, "max_size":3},
                        "prop_feats": 0.9,
                        "prop_empty": 0.1,
                        "inner_shape": (6, 5),
                        "display": False,
                        "feature_types": [POT, DISH_DISPENSER, SERVING_LOC, ONION_DISPENSER, TOMATO_DISPENSER]
                        }
        mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(mdp_gen_params, outer_shape=(6,5))
        env = OvercookedEnv(mdp_fn, **DEFAULT_ENV_PARAMS)
        generated_recipes_strings = set()
        for _ in range(20):
            env.reset()
            generated_recipes_strings |= {json.dumps(o, sort_keys=True) for o in env.mdp.start_all_orders}
        self.assertTrue(len(generated_recipes_strings) > 3)
示例#8
0
def learn(*,
          network,
          env,
          total_timesteps,
          early_stopping=False,
          eval_env=None,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          model_fn=None,
          scope='',
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of timesteps between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller or equal than number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of timesteps between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''
    additional_params = network_kwargs["network_kwargs"]
    from baselines import logger

    # set_global_seeds(seed) We deal with seeds upstream

    if "LR_ANNEALING" in additional_params.keys():
        lr_reduction_factor = additional_params["LR_ANNEALING"]
        start_lr = lr
        lr = lambda prop: (start_lr / lr_reduction_factor) + (
            start_lr - (start_lr / lr_reduction_factor
                        )) * prop  # Anneals linearly from lr to lr/red factor

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    bestrew = 0
    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     scope=scope)

    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env,
                             model=model,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.perf_counter()

    best_rew_per_step = 0

    run_info = defaultdict(list)
    nupdates = total_timesteps // nbatch
    print("TOT NUM UPDATES", nupdates)
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0, "Have {} total batch size and want {} minibatches, can't split evenly".format(
            nbatch, nminibatches)
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)
        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632

        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run(
            )  #pylint: disable=E0632

        eplenmean = safemean([epinfo['ep_length'] for epinfo in epinfos])
        eprewmean = safemean([epinfo['r'] for epinfo in epinfos])
        rew_per_step = eprewmean / eplenmean

        print("Curr learning rate {} \t Curr reward per step {}".format(
            lrnow, rew_per_step))

        if rew_per_step > best_rew_per_step and early_stopping:
            # Avoid updating best model at first iteration because the means might be a bit off because
            # of how the multithreaded batch simulation works
            best_rew_per_step = eprewmean / eplenmean
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            model.save(checkdir + ".temp_best_model")
            print("Saved model as best", best_rew_per_step, "avg rew/step")

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in tqdm.trange(0,
                                         nbatch,
                                         nbatch_train,
                                         desc="{}/{}".format(_, noptepochs)):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))

        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))

        if update % log_interval == 0 or update == 1:
            # Calculates if value function is a good predicator of the returns (ev > 1)
            # or if it's just worse than predicting nothing (ev =< 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))

            eprewmean = safemean([epinfo['r'] for epinfo in epinfobuf])
            ep_dense_rew_mean = safemean(
                [epinfo['ep_shaped_r'] for epinfo in epinfobuf])
            ep_sparse_rew_mean = safemean(
                [epinfo['ep_sparse_r'] for epinfo in epinfobuf])
            eplenmean = safemean([epinfo['ep_length'] for epinfo in epinfobuf])
            run_info['eprewmean'].append(eprewmean)
            run_info['ep_dense_rew_mean'].append(ep_dense_rew_mean)
            run_info['ep_sparse_rew_mean'].append(ep_sparse_rew_mean)
            run_info['eplenmean'].append(eplenmean)
            run_info['explained_variance'].append(float(ev))

            logger.logkv(
                'true_eprew',
                safemean([epinfo['ep_sparse_r'] for epinfo in epinfobuf]))
            logger.logkv('eprewmean', eprewmean)
            logger.logkv('eplenmean', eplenmean)
            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))

            time_elapsed = tnow - tfirststart
            logger.logkv('time_elapsed', time_elapsed)

            time_per_update = time_elapsed / update
            time_remaining = (nupdates - update) * time_per_update
            logger.logkv('time_remaining', time_remaining / 60)

            for (lossval, lossname) in zip(lossvals, model.loss_names):
                run_info[lossname].append(lossval)

                logger.logkv(lossname, lossval)

            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()

            # Update current logs
            if additional_params["RUN_TYPE"] in ["ppo", "joint_ppo"]:
                from overcooked_ai_py.utils import save_dict_to_file
                save_dict_to_file(run_info,
                                  additional_params["SAVE_DIR"] + "logs")

                # Linear annealing of reward shaping
                if additional_params["REW_SHAPING_HORIZON"] != 0:
                    # Piecewise linear annealing schedule
                    # annealing_thresh: until when we should stop doing 100% reward shaping
                    # annealing_horizon: when we should reach doing 0% reward shaping
                    annealing_horizon = additional_params[
                        "REW_SHAPING_HORIZON"]
                    annealing_thresh = 0

                    def fn(x):
                        if annealing_thresh != 0 and annealing_thresh - (
                                annealing_horizon / annealing_thresh) * x > 1:
                            return 1
                        else:
                            fn = lambda x: -1 * (x - annealing_thresh) * 1 / (
                                annealing_horizon - annealing_thresh) + 1
                            return max(fn(x), 0)

                    curr_timestep = update * nbatch
                    curr_reward_shaping = fn(curr_timestep)
                    env.update_reward_shaping_param(curr_reward_shaping)
                    print("Current reward shaping", curr_reward_shaping)

                sp_horizon = additional_params["SELF_PLAY_HORIZON"]

                # Save/overwrite best model if past a certain threshold
                if ep_sparse_rew_mean > bestrew and ep_sparse_rew_mean > additional_params[
                        "SAVE_BEST_THRESH"]:
                    # Don't save best model if still doing some self play and it's supposed to be a BC model
                    if additional_params[
                            "OTHER_AGENT_TYPE"][:
                                                2] == "bc" and sp_horizon != 0 and env.self_play_randomization > 0:
                        pass
                    else:
                        from human_aware_rl.ppo.ppo import save_ppo_model
                        print("BEST REW", ep_sparse_rew_mean,
                              "overwriting previous model with", bestrew)
                        save_ppo_model(
                            model, "{}seed{}/best".format(
                                additional_params["SAVE_DIR"],
                                additional_params["CURR_SEED"]))
                        bestrew = max(ep_sparse_rew_mean, bestrew)

                # If not sp run, and horizon is not None,
                # vary amount of self play over time, either with a sigmoidal feedback loop
                # or with a fixed piecewise linear schedule.
                if additional_params[
                        "OTHER_AGENT_TYPE"] != "sp" and sp_horizon is not None:
                    if type(sp_horizon) is not list:
                        # Sigmoid self-play schedule based on current performance (not recommended)
                        curr_reward = ep_sparse_rew_mean

                        rew_target = sp_horizon
                        shift = rew_target / 2
                        t = (1 / rew_target) * 10
                        fn = lambda x: -1 * (np.exp(t * (x - shift)) /
                                             (1 + np.exp(t * (x - shift)))) + 1

                        env.self_play_randomization = fn(curr_reward)
                        print("Current self-play randomization",
                              env.self_play_randomization)
                    else:
                        assert len(sp_horizon) == 2
                        # Piecewise linear self-play schedule

                        # self_play_thresh: when we should stop doing 100% self-play
                        # self_play_timeline: when we should reach doing 0% self-play
                        self_play_thresh, self_play_timeline = sp_horizon

                        def fn(x):
                            if self_play_thresh != 0 and self_play_timeline - (
                                    self_play_timeline /
                                    self_play_thresh) * x > 1:
                                return 1
                            else:
                                fn = lambda x: -1 * (
                                    x - self_play_thresh) * 1 / (
                                        self_play_timeline - self_play_thresh
                                    ) + 1
                                return max(fn(x), 0)

                        curr_timestep = update * nbatch
                        env.self_play_randomization = fn(curr_timestep)
                        print("Current self-play randomization",
                              env.self_play_randomization)

        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir() and (
                                  MPI is None
                                  or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

        # Visualization of rollouts with actual other agent
        run_type = additional_params["RUN_TYPE"]
        if run_type in ["ppo", "joint_ppo"
                        ] and update % additional_params["VIZ_FREQUENCY"] == 0:
            from overcooked_ai_py.mdp.overcooked_env import OvercookedEnv
            from overcooked_ai_py.mdp.overcooked_mdp import OvercookedGridworld
            from overcooked_ai_py.agents.agent import AgentPair
            from overcooked_ai_py.agents.benchmarking import AgentEvaluator
            from human_aware_rl.baselines_utils import get_agent_from_model
            print(additional_params["SAVE_DIR"])

            mdp = OvercookedGridworld.from_layout_name(
                **additional_params["mdp_params"])
            overcooked_env = OvercookedEnv(mdp,
                                           **additional_params["env_params"])
            agent = get_agent_from_model(
                model,
                additional_params["sim_threads"],
                is_joint_action=(run_type == "joint_ppo"))
            agent.set_mdp(mdp)

            if run_type == "ppo":
                if additional_params["OTHER_AGENT_TYPE"] == 'sp':
                    agent_pair = AgentPair(agent,
                                           agent,
                                           allow_duplicate_agents=True)
                else:
                    print("PPO agent on index 0:")
                    env.other_agent.set_mdp(mdp)
                    agent_pair = AgentPair(agent, env.other_agent)
                    trajectory, time_taken, tot_rewards, tot_shaped_rewards = overcooked_env.run_agents(
                        agent_pair, display=True, display_until=100)
                    overcooked_env.reset()
                    agent_pair.reset()
                    print("tot rew", tot_rewards, "tot rew shaped",
                          tot_shaped_rewards)

                    print("PPO agent on index 1:")
                    agent_pair = AgentPair(env.other_agent, agent)

            else:
                agent_pair = AgentPair(agent)

            trajectory, time_taken, tot_rewards, tot_shaped_rewards = overcooked_env.run_agents(
                agent_pair, display=True, display_until=100)
            overcooked_env.reset()
            agent_pair.reset()
            print("tot rew", tot_rewards, "tot rew shaped", tot_shaped_rewards)
            print(additional_params["SAVE_DIR"])

    if nupdates > 0 and early_stopping:
        checkdir = osp.join(logger.get_dir(), 'checkpoints')
        print("Loaded best model", best_rew_per_step)
        model.load(checkdir + ".temp_best_model")
    return model, run_info
示例#9
0
def pbt_one_run(params, seed):
    # Iterating noptepochs over same batch data but shuffled differently
    # dividing each batch in `nminibatches` and doing a gradient step for each one
    create_dir_if_not_exists(params["SAVE_DIR"])
    save_dict_to_file(params, params["SAVE_DIR"] + "config")

    #######
    # pbt #
    #######

    mdp = OvercookedGridworld.from_layout_name(**params["mdp_params"])
    overcooked_env = OvercookedEnv(mdp, **params["env_params"])

    print("Sample training environments:")
    for _ in range(5):
        overcooked_env.reset()
        print(overcooked_env)

    gym_env = get_vectorized_gym_env(
        overcooked_env,
        'Overcooked-v0',
        featurize_fn=lambda x: mdp.lossless_state_encoding(x),
        **params)
    gym_env.update_reward_shaping_param(1.0)  # Start reward shaping from 1

    annealer = LinearAnnealer(horizon=params["REW_SHAPING_HORIZON"])

    # ppo_expert_model = load_model("data/expert_agent/", "agent0", actual_agent_name="agent0")
    # pbt_expert_model = load_model("data/expert_agent/", "agent2", actual_agent_name="agent2")

    # AGENT POPULATION INITIALIZATION
    population_size = params["POPULATION_SIZE"]
    pbt_population = []
    pbt_agent_names = ['agent' + str(i) for i in range(population_size)]
    for agent_name in pbt_agent_names:
        agent = PBTAgent(agent_name, params, gym_env=gym_env)

        # overwrite_model(ppo_expert_model, model)

        pbt_population.append(agent)

    print("Initialized agent models")

    all_pairs = []
    for i in range(population_size):
        for j in range(i + 1, population_size):
            all_pairs.append((i, j))

    # MAIN LOOP

    def pbt_training():
        best_sparse_rew_avg = [-np.Inf] * population_size
        print(params['NUM_PBT_ITER'])
        for pbt_iter in range(1, params["NUM_PBT_ITER"] + 1):
            print("\n\n\nPBT ITERATION NUM {}".format(pbt_iter))

            # TRAINING PHASE
            assert params["ITER_PER_SELECTION"] == population_size**2
            pairs_to_train = list(
                itertools.product(range(population_size),
                                  range(population_size)))

            for sel_iter in range(params["ITER_PER_SELECTION"]):
                # Randomly select agents to be trained
                pair_idx = np.random.choice(len(pairs_to_train))
                idx0, idx1 = pairs_to_train.pop(pair_idx)
                pbt_agent0, pbt_agent1 = pbt_population[idx0], pbt_population[
                    idx1]

                # Training agent 1, leaving agent 0 fixed
                print(
                    "Training agent {} ({}) with agent {} ({}) fixed (pbt #{}/{}, sel #{}/{})"
                    .format(idx1, pbt_agent1.num_ppo_runs, idx0,
                            pbt_agent0.num_ppo_runs, pbt_iter,
                            params["NUM_PBT_ITER"], sel_iter,
                            params["ITER_PER_SELECTION"]))

                agent_env_steps = pbt_agent1.num_ppo_runs * params[
                    "PPO_RUN_TOT_TIMESTEPS"]
                reward_shaping_param = annealer.param_value(agent_env_steps)
                print("Current reward shaping:", reward_shaping_param,
                      "\t Save_dir", params["SAVE_DIR"])
                pbt_agent1.logs["reward_shaping"].append(reward_shaping_param)
                gym_env.update_reward_shaping_param(reward_shaping_param)

                gym_env.other_agent = pbt_agent0.get_agent()
                pbt_agent1.update(gym_env)

                save_folder = params["SAVE_DIR"] + pbt_agent1.agent_name + '/'
                pbt_agent1.save(save_folder)

                agent_pair = AgentPair(pbt_agent0.get_agent(),
                                       pbt_agent1.get_agent())
                overcooked_env.get_rollouts(
                    agent_pair,
                    num_games=1,
                    final_state=True,
                    reward_shaping=reward_shaping_param)

            assert len(pairs_to_train) == 0

            # SELECTION PHASE
            # Overwrite worst agent with best model's agent, according to
            # a proxy for generalization performance (avg dense reward across population)
            print("\nSELECTION PHASE\n")

            # Dictionary with average returns for each agent when matched with each other agent
            avg_ep_returns_dict = defaultdict(list)
            avg_ep_returns_sparse_dict = defaultdict(list)

            for i, pbt_agent in enumerate(pbt_population):
                # Saving each agent model at the end of the pbt iteration
                pbt_agent.update_pbt_iter_logs()

                if pbt_iter == params["NUM_PBT_ITER"]:
                    save_folder = params[
                        "SAVE_DIR"] + pbt_agent.agent_name + '/'
                    pbt_agent.save_predictor(save_folder +
                                             "pbt_iter{}/".format(pbt_iter))
                    pbt_agent.save(save_folder +
                                   "pbt_iter{}/".format(pbt_iter))

                for j in range(i, population_size):
                    # Pairs each agent with all other agents including itself in assessing generalization performance
                    print("Evaluating agent {} and {}".format(i, j))
                    pbt_agent_other = pbt_population[j]

                    agent_pair = AgentPair(pbt_agent.get_agent(),
                                           pbt_agent_other.get_agent())
                    trajs = overcooked_env.get_rollouts(
                        agent_pair,
                        params["NUM_SELECTION_GAMES"],
                        reward_shaping=reward_shaping_param)
                    dense_rews, sparse_rews, lens = trajs["ep_returns"], trajs[
                        "ep_returns_sparse"], trajs["ep_lengths"]
                    rew_per_step = np.sum(dense_rews) / np.sum(lens)
                    avg_ep_returns_dict[i].append(rew_per_step)
                    avg_ep_returns_sparse_dict[i].append(sparse_rews)
                    if j != i:
                        avg_ep_returns_dict[j].append(rew_per_step)
                        avg_ep_returns_sparse_dict[j].append(sparse_rews)

            print("AVG ep rewards dict", avg_ep_returns_dict)

            for i, pbt_agent in enumerate(pbt_population):
                pbt_agent.update_avg_rew_per_step_logs(avg_ep_returns_dict[i])

                avg_sparse_rew = np.mean(avg_ep_returns_sparse_dict[i])
                if avg_sparse_rew > best_sparse_rew_avg[i]:
                    best_sparse_rew_avg[i] = avg_sparse_rew
                    agent_name = pbt_agent.agent_name
                    print(
                        "New best avg sparse rews {} for agent {}, saving...".
                        format(best_sparse_rew_avg, agent_name))
                    best_save_folder = params[
                        "SAVE_DIR"] + agent_name + '/best/'
                    delete_dir_if_exists(best_save_folder, verbose=True)
                    pbt_agent.save_predictor(best_save_folder)
                    pbt_agent.save(best_save_folder)

            # Get best and worst agents when averageing rew per step across all agents
            best_agent_idx = max(
                avg_ep_returns_dict,
                key=lambda key: np.mean(avg_ep_returns_dict[key]))
            worst_agent_idx = min(
                avg_ep_returns_dict,
                key=lambda key: np.mean(avg_ep_returns_dict[key]))

            # MUTATION PHASE

            pbt_population[worst_agent_idx].explore_from(
                pbt_population[best_agent_idx])
            print(
                "Overwrote worst model {} ({} rew) with best model {} ({} rew)"
                .format(worst_agent_idx, avg_ep_returns_dict[worst_agent_idx],
                        best_agent_idx, avg_ep_returns_dict[best_agent_idx]))

            best_agent = pbt_population[best_agent_idx].get_agent()
            best_agent_copy = pbt_population[best_agent_idx].get_agent()
            agent_pair = AgentPair(best_agent, best_agent_copy)
            overcooked_env.get_rollouts(agent_pair,
                                        num_games=1,
                                        final_state=True,
                                        display=True,
                                        reward_shaping=reward_shaping_param)

    pbt_training()
    reset_tf()
    print(params["SAVE_DIR"])
示例#10
0
class AgentEvaluator(object):
    """
    Class used to get rollouts and evaluate performance of various types of agents.
    """
    def __init__(self,
                 mdp_params,
                 env_params={},
                 mdp_fn_params=None,
                 force_compute=False,
                 mlp_params=NO_COUNTERS_PARAMS,
                 debug=False):
        """
        mdp_params (dict): params for creation of an OvercookedGridworld instance through the `from_layout_name` method
        env_params (dict): params for creation of an OvercookedEnv
        mdp_fn_params (dict): params to setup random MDP generation
        force_compute (bool): whether should re-compute MediumLevelPlanner although matching file is found
        mlp_params (dict): params for MediumLevelPlanner
        """
        assert type(mdp_params) is dict, "mdp_params must be a dictionary"

        if mdp_fn_params is None:
            self.variable_mdp = False
            self.mdp_fn = lambda: OvercookedGridworld.from_layout_name(
                **mdp_params)
        else:
            self.variable_mdp = True
            self.mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(
                mdp_params, **mdp_fn_params)

        self.env = OvercookedEnv(self.mdp_fn, **env_params)
        self.force_compute = force_compute
        self.debug = debug
        self.mlp_params = mlp_params
        self._mlp = None

    @property
    def mlp(self):
        assert not self.variable_mdp, "Variable mdp is not currently supported for planning"
        if self._mlp is None:
            if self.debug: print("Computing Planner")
            self._mlp = MediumLevelPlanner.from_pickle_or_compute(
                self.env.mdp,
                self.mlp_params,
                force_compute=self.force_compute)
        return self._mlp

    def evaluate_random_pair(self, interact=True, display=False):
        agent_pair = AgentPair(RandomAgent(interact=interact),
                               RandomAgent(interact=interact))
        return self.evaluate_agent_pair(agent_pair, display=display)

    def evaluate_human_model_pair(self, display=True, num_games=1):
        a0 = GreedyHumanModel(self.mlp)
        a1 = GreedyHumanModel(self.mlp)
        agent_pair = AgentPair(a0, a1)
        return self.evaluate_agent_pair(agent_pair,
                                        display=display,
                                        num_games=num_games)

    def evaluate_optimal_pair(self, display=True, delivery_horizon=2):
        a0 = CoupledPlanningAgent(self.mlp, delivery_horizon=delivery_horizon)
        a1 = CoupledPlanningAgent(self.mlp, delivery_horizon=delivery_horizon)
        a0.mlp.env = self.env
        a1.mlp.env = self.env
        agent_pair = AgentPair(a0, a1)
        return self.evaluate_agent_pair(agent_pair, display=display)

    def evaluate_one_optimal_one_random(self, display=True):
        a0 = CoupledPlanningAgent(self.mlp)
        a1 = RandomAgent()
        agent_pair = AgentPair(a0, a1)
        return self.evaluate_agent_pair(agent_pair, display=display)

    def evaluate_one_optimal_one_greedy_human(self, h_idx=0, display=True):
        h = GreedyHumanModel(self.mlp)
        r = CoupledPlanningAgent(self.mlp)
        agent_pair = AgentPair(h, r) if h_idx == 0 else AgentPair(r, h)
        return self.evaluate_agent_pair(agent_pair, display=display)

    def evaluate_agent_pair(self,
                            agent_pair,
                            num_games=1,
                            display=False,
                            info=True):
        self.env.reset()
        return self.env.get_rollouts(agent_pair,
                                     num_games,
                                     display=display,
                                     info=info)

    def get_agent_pair_trajs(self, a0, a1=None, num_games=100, display=False):
        """Evaluate agent pair on both indices, and return trajectories by index"""
        if a1 is None:
            ap = AgentPair(a0, a0, allow_duplicate_agents=True)
            trajs_0 = trajs_1 = self.evaluate_agent_pair(ap,
                                                         num_games=num_games,
                                                         display=display)
        else:
            trajs_0 = self.evaluate_agent_pair(AgentPair(a0, a1),
                                               num_games=num_games,
                                               display=display)
            trajs_1 = self.evaluate_agent_pair(AgentPair(a1, a0),
                                               num_games=num_games,
                                               display=display)
        return trajs_0, trajs_1

    @staticmethod
    def check_trajectories(trajectories):
        """
        Checks that of trajectories are in standard format and are consistent with dynamics of mdp.
        """
        AgentEvaluator._check_standard_traj_keys(set(trajectories.keys()))
        AgentEvaluator._check_right_types(trajectories)
        AgentEvaluator._check_trajectories_dynamics(trajectories)
        # TODO: Check shapes?

    @staticmethod
    def _check_standard_traj_keys(traj_keys_set):
        assert traj_keys_set == set(
            DEFAULT_TRAJ_KEYS
        ), "Keys of traj dict did not match standard form.\nMissing keys: {}\nAdditional keys: {}".format(
            [k for k in DEFAULT_TRAJ_KEYS if k not in traj_keys_set],
            [k for k in traj_keys_set if k not in DEFAULT_TRAJ_KEYS])

    @staticmethod
    def _check_right_types(trajectories):
        for idx in range(len(trajectories["ep_observations"])):
            states, actions, rewards = trajectories["ep_observations"][
                idx], trajectories["ep_actions"][idx], trajectories[
                    "ep_rewards"][idx]
            mdp_params, env_params = trajectories["mdp_params"][
                idx], trajectories["env_params"][idx]
            assert all(type(j_a) is tuple for j_a in actions)
            assert all(type(s) is OvercookedState for s in states)
            assert type(mdp_params) is dict
            assert type(env_params) is dict
            # TODO: check that are all lists

    @staticmethod
    def _check_trajectories_dynamics(trajectories):
        _, envs = AgentEvaluator.mdps_and_envs_from_trajectories(trajectories)

        for idx in range(len(trajectories["ep_observations"])):
            states, actions, rewards = trajectories["ep_observations"][
                idx], trajectories["ep_actions"][idx], trajectories[
                    "ep_rewards"][idx]
            simulation_env = envs[idx]

            assert len(states) == len(actions) == len(
                rewards), "# states {}\t# actions {}\t# rewards {}".format(
                    len(states), len(actions), len(rewards))

            # Checking that actions would give rise to same behaviour in current MDP
            for i in range(len(states) - 1):
                curr_state = states[i]
                simulation_env.state = curr_state

                next_state, reward, done, info = simulation_env.step(
                    actions[i])

                assert states[
                    i +
                    1] == next_state, "States differed (expected vs actual): {}".format(
                        simulation_env.display_states(states[i + 1],
                                                      next_state))
                assert rewards[i] == reward, "{} \t {}".format(
                    rewards[i], reward)

    @staticmethod
    def mdps_and_envs_from_trajectories(trajectories):
        mdps, envs = [], []
        for idx in range(len(trajectories["ep_lengths"])):
            mdp_params, env_params = trajectories["mdp_params"][
                idx], trajectories["env_params"][idx]
            mdp = OvercookedGridworld.from_layout_name(**mdp_params)
            env = OvercookedEnv(mdp, **env_params)
            mdps.append(mdp)
            envs.append(env)
        return mdps, envs

    ### I/O METHODS ###

    @staticmethod
    def save_trajectory(trajectory, filename):
        AgentEvaluator.check_trajectories(trajectory)
        save_pickle(trajectory, filename)

    @staticmethod
    def load_trajectory(filename):
        traj = load_pickle(filename)
        AgentEvaluator.check_trajectories(traj)
        return traj

    @staticmethod
    def save_traj_in_stable_baselines_format(rollout_trajs, filename):
        # Converting episode dones to episode starts
        eps_starts = [
            np.zeros(len(traj)) for traj in rollout_trajs["ep_dones"]
        ]
        for ep_starts in eps_starts:
            ep_starts[0] = 1
        eps_starts = [ep_starts.astype(np.bool) for ep_starts in eps_starts]

        stable_baselines_trajs_dict = {
            'actions': np.concatenate(rollout_trajs["ep_actions"]),
            'obs': np.concatenate(rollout_trajs["ep_observations"]),
            'rewards': np.concatenate(rollout_trajs["ep_rewards"]),
            'episode_starts': np.concatenate(eps_starts),
            'episode_returns': rollout_trajs["ep_returns"]
        }
        stable_baselines_trajs_dict = {
            k: np.array(v)
            for k, v in stable_baselines_trajs_dict.items()
        }
        np.savez(filename, **stable_baselines_trajs_dict)

    @staticmethod
    def save_traj_as_json(trajectory, filename):
        """Saves the `idx`th trajectory as a list of state action pairs"""
        assert set(DEFAULT_TRAJ_KEYS) == set(
            trajectory.keys()), "{} vs\n{}".format(DEFAULT_TRAJ_KEYS,
                                                   trajectory.keys())
        AgentEvaluator.check_trajectories(trajectory)
        trajectory = AgentEvaluator.make_trajectories_json_serializable(
            trajectory)
        save_as_json(trajectory, filename)

    @staticmethod
    def make_trajectories_json_serializable(trajectories):
        """
        Cannot convert np.arrays or special types of ints to JSON.
        This method converts all components of a trajectory to standard types.
        """
        dict_traj = copy.deepcopy(trajectories)
        dict_traj["ep_observations"] = [[
            ob.to_dict() for ob in one_ep_obs
        ] for one_ep_obs in trajectories["ep_observations"]]
        for k in dict_traj.keys():
            dict_traj[k] = list(dict_traj[k])
        dict_traj['ep_actions'] = [
            list(lst) for lst in dict_traj['ep_actions']
        ]
        dict_traj['ep_rewards'] = [
            list(lst) for lst in dict_traj['ep_rewards']
        ]
        dict_traj['ep_dones'] = [int(lst) for lst in dict_traj['ep_dones']]
        dict_traj['ep_returns'] = [int(val) for val in dict_traj['ep_returns']]
        dict_traj['ep_lengths'] = [int(val) for val in dict_traj['ep_lengths']]
        return dict_traj

    @staticmethod
    def load_traj_from_json(filename):
        traj_dict = load_from_json(filename)
        traj_dict["ep_observations"] = [[
            OvercookedState.from_dict(ob) for ob in curr_ep_obs
        ] for curr_ep_obs in traj_dict["ep_observations"]]
        traj_dict["ep_actions"] = [[
            tuple(tuple(a) if type(a) is list else a for a in j_a)
            for j_a in ep_acts
        ] for ep_acts in traj_dict["ep_actions"]]
        return traj_dict

    ### VIZUALIZATION METHODS ###

    @staticmethod
    def interactive_from_traj(trajectories, traj_idx=0):
        """
        Displays ith trajectory of trajectories (in standard format) 
        interactively in a Jupyter notebook.
        """
        from ipywidgets import widgets, interactive_output

        states = trajectories["ep_observations"][traj_idx]
        joint_actions = trajectories["ep_actions"][traj_idx]
        cumulative_rewards = cumulative_rewards_from_rew_list(
            trajectories["ep_rewards"][traj_idx])
        mdp_params = trajectories["mdp_params"][traj_idx]
        env_params = trajectories["env_params"][traj_idx]
        env = AgentEvaluator(mdp_params, env_params=env_params).env

        def update(t=1.0):
            env.state = states[int(t)]
            joint_action = joint_actions[int(t -
                                             1)] if t > 0 else (Action.STAY,
                                                                Action.STAY)
            print(env)
            print("Joint Action: {} \t Score: {}".format(
                Action.joint_action_to_char(joint_action),
                cumulative_rewards[t]))

        t = widgets.IntSlider(min=0, max=len(states) - 1, step=1, value=0)
        out = interactive_output(update, {'t': t})
        display(out, t)