Example #1
    def __init__(self, config):
        self.cur_pos = 0
        self.action_space = Discrete(4)

        # Represents an item.
        self.item_space = Discrete(5)

        # Represents an effect on the player.
        self.effect_space = Box(9000, 9999, shape=(4, ))

        # Represents a player.
        self.player_space = Dict({
            "location": Box(-100, 100, shape=(2, )),
            "status": Box(-1, 1, shape=(10, )),
            "items": Repeated(self.item_space, max_len=MAX_ITEMS),
            "effects": Repeated(self.effect_space, max_len=MAX_EFFECTS),
        })

        # Observation is a list of players.
        self.observation_space = Repeated(self.player_space,
                                          max_len=MAX_PLAYERS)
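A minimal usage sketch for the space above (hedged: it assumes RLlib's Repeated space and borrows illustrative values for the MAX_* constants, which are defined elsewhere in the original file):

# Sketch: sampling the player-list observation space shown above.
# The MAX_* values below are illustrative assumptions.
from gym.spaces import Box, Dict, Discrete
from ray.rllib.utils.spaces.repeated import Repeated

MAX_PLAYERS, MAX_ITEMS, MAX_EFFECTS = 4, 7, 2
player_space = Dict({
    "location": Box(-100, 100, shape=(2, )),
    "status": Box(-1, 1, shape=(10, )),
    "items": Repeated(Discrete(5), max_len=MAX_ITEMS),
    "effects": Repeated(Box(9000, 9999, shape=(4, )), max_len=MAX_EFFECTS),
})
observation_space = Repeated(player_space, max_len=MAX_PLAYERS)

sample = observation_space.sample()  # a list of up to MAX_PLAYERS player dicts
print(len(sample))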
Example #2
    def make_obs_space(embed_dim=768,
                       max_steps=None,
                       max_utterances=5,
                       max_command_length=5,
                       max_variables=10,
                       max_actions=10,
                       **kwargs):
        true_obs = {
            'dialog_history': Repeated(
                Dict({
                    'sender': Discrete(3),
                    'utterance': Box(-10, 10, shape=(embed_dim, )),
                }),
                max_len=max_utterances),
            'partial_command': Repeated(
                Box(-10, 10, shape=(embed_dim, )),
                max_len=max_command_length),
            'variables': Repeated(
                Box(-10, 10, shape=(embed_dim, )),
                max_len=max_variables),
        }
        if max_steps:
            true_obs['steps'] = Discrete(max_steps)

        # return Dict(true_obs)  # for calculating the true_obs shape

        return Dict({
            "true_obs": Dict(true_obs),
            '_action_mask': MultiDiscrete([2 for _ in range(max_actions)]),
            '_action_embeds': Box(-10, 10, shape=(max_actions, embed_dim)),
        })
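The '_action_mask' / '_action_embeds' keys follow the parametric-actions pattern: the mask marks which of the max_actions candidates are currently valid, and a custom model typically drives the logits of invalid candidates toward negative infinity before sampling. A rough sketch of that masking step (standalone and illustrative; the helper name and values are not part of the original code):

# Sketch: applying an action mask to raw logits, as a custom model might.
import numpy as np

def mask_logits(logits, action_mask):
    # Invalid actions (mask == 0) get a very large negative logit so the
    # softmax assigns them ~zero probability.
    return logits + np.where(action_mask > 0, 0.0, -1e10)

logits = np.zeros(10)
mask = np.array([1, 1, 0, 0, 1, 0, 0, 0, 0, 0], dtype=np.float32)
print(mask_logits(logits, mask))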
Example #3
    def test_repeated(self):
        space = Repeated(gym.spaces.Box(low=-1, high=1, shape=(1, 200)), max_len=8)

        d = gym_space_to_dict(space)
        sp = gym_space_from_dict(d)

        self.assertTrue(isinstance(sp.child_space, gym.spaces.Box))
        self.assertEqual(space.max_len, sp.max_len)
        self.assertEqual(space.dtype, sp.dtype)
Example #4
class SimpleRPG(gym.Env):
    """Example of a custom env with a complex, structured observation.

    The observation is a list of players, each of which is a Dict of
    attributes, and may further hold a list of items (categorical space).

    Note that the env doesn't train; it's just a dummy example to show how to
    use spaces.Repeated in a custom model (see CustomRPGModel below).
    """
    def __init__(self, config):
        self.cur_pos = 0
        self.action_space = Discrete(4)

        # Represents an item.
        self.item_space = Discrete(5)

        # Represents an effect on the player.
        self.effect_space = Box(9000, 9999, shape=(4, ))

        # Represents a player.
        self.player_space = Dict({
            "location": Box(-100, 100, shape=(2, )),
            "status": Box(-1, 1, shape=(10, )),
            "items": Repeated(self.item_space, max_len=MAX_ITEMS),
            "effects": Repeated(self.effect_space, max_len=MAX_EFFECTS),
        })

        # Observation is a list of players.
        self.observation_space = Repeated(self.player_space,
                                          max_len=MAX_PLAYERS)

    def reset(self):
        return self.observation_space.sample()

    def step(self, action):
        return self.observation_space.sample(), 1, True, {}
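A quick way to exercise the env with the old gym API it implements (a sketch; the empty config dict is just a placeholder):

# Sketch: one episode with a random action. SimpleRPG terminates after a
# single step, since step() always returns done=True.
env = SimpleRPG(config={})
obs = env.reset()
print("players observed:", len(obs))
obs, reward, done, info = env.step(env.action_space.sample())
assert done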
Example #5
    def __init__(self, environment):
        self.environment = environment
        self.machine_state = MultiDiscrete(
            [2] * self.environment.schedule_length)
        # np.int was deprecated in NumPy 1.20 and removed in 1.24; plain int
        # is the documented replacement and keeps the same dtype.
        self.job_state = Repeated(
            Box(low=-self.environment.max_steps_per_iterations,
                high=self.environment.max_steps_per_iterations,
                shape=(3, ),
                dtype=int),
            max_len=self.environment.max_job_slots)

        self.observation_space = Dict({
            'machine_state': self.machine_state,
            'job_state': self.job_state
        })
Example #6
def _repeated(d: Dict) -> Repeated:
    child_space = gym_space_from_dict(d["child_space"])
    return Repeated(child_space=child_space, max_len=d["max_len"])
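Read together with Example #3, this implies the serialized form of a Repeated space carries at least a "child_space" entry and a "max_len" entry. A hedged round-trip sketch (assuming the helpers live in ray.rllib.utils.serialization, as in recent RLlib versions):

# Sketch: round-tripping a Repeated space through the dict form that
# _repeated() consumes. Dict contents beyond these two keys are assumed.
from gym.spaces import Box
from ray.rllib.utils.serialization import gym_space_from_dict, gym_space_to_dict
from ray.rllib.utils.spaces.repeated import Repeated

space = Repeated(Box(low=-1, high=1, shape=(4, )), max_len=8)
d = gym_space_to_dict(space)
restored = gym_space_from_dict(d)
assert restored.max_len == space.max_len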
Example #7
                  spaces.Box(low=0, high=1, shape=(10, 10, 3)))),
    spaces.Discrete(5),
])
TUPLE_SAMPLES = [TUPLE_SPACE.sample() for _ in range(10)]

# Constraints on the Repeated space.
MAX_PLAYERS = 4
MAX_ITEMS = 7
MAX_EFFECTS = 2
ITEM_SPACE = spaces.Box(-5, 5, shape=(1, ))
EFFECT_SPACE = spaces.Box(9000, 9999, shape=(4, ))
PLAYER_SPACE = spaces.Dict({
    "location": spaces.Box(-100, 100, shape=(2, )),
    "items": Repeated(ITEM_SPACE, max_len=MAX_ITEMS),
    "effects": Repeated(EFFECT_SPACE, max_len=MAX_EFFECTS),
    "status": spaces.Box(-1, 1, shape=(10, )),
})
REPEATED_SPACE = Repeated(PLAYER_SPACE, max_len=MAX_PLAYERS)
REPEATED_SAMPLES = [REPEATED_SPACE.sample() for _ in range(10)]


def one_hot(i, n):
    out = [0.0] * n
    out[i] = 1.0
    return out
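For example:

print(one_hot(2, 5))  # -> [0.0, 0.0, 1.0, 0.0, 0.0]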

Example #8
class NanoworldEnv(MultiAgentEnv):
    # Constants
    agents = ('passenger', 'driver')

    max_num_actions = 8
    destination = ["", "starbucks", "peets"]

    # Action spaces
    passenger_actions = [
        "wait for driver", "say starbucks", "say peets", "mental starbucks",
        "mental peets"
    ]
    # passenger_actions = ["wait for driver", "say starbucks", "say peets"]
    passenger_action_space = Discrete(len(passenger_actions))

    driver_actions = ["wait for passenger", "drive starbucks", "drive peets"]
    driver_action_space = Discrete(len(driver_actions))

    # observation spaces
    # wait, say starbucks, say peets -- can be repeated up to max_num_actions
    # times, plus the passenger's mental state (none, starbucks, peets)
    passenger_observation_space = Dict({
        'dialog_history': Repeated(Discrete(3), max_len=max_num_actions),
        'destination': Discrete(3)
    })
    # wait, say starbucks, say peets -- can be repeated up to max_num_actions
    # times
    driver_observation_space = Dict(
        {'dialog_history': Repeated(Discrete(3), max_len=max_num_actions)})

    def __init__(self, config):
        destination_id = random.randint(1, 2)
        self.state = DialogStateNano(
            NanoworldEnv.max_num_actions,
            desired_destination=NanoworldEnv.destination[destination_id])
        self.num_episodes = 0
        # self.is_supervised = config['is_supervised']

    def reset(self):
        '''
        Called before each episode, returns the first observation
        '''
        if self.num_episodes % 1000 == 0:
            logger.warning("completed {} episodes.".format(self.num_episodes))

        if self.num_episodes >= 10000:
            logger.warning('episode ' + str(self.num_episodes))
            logger.warning('------------')
            _, _, history, _ = self.state.get_global_state()
            for h in history:
                logger.warning(h)
            logger.warning('-------------')
        self.num_episodes += 1

        destination_id = random.randint(1, 2)
        if self.num_episodes >= 10000:
            logger.warning('set destination: ' +
                           NanoworldEnv.destination[destination_id])
        self.state = DialogStateNano(
            NanoworldEnv.max_num_actions,
            desired_destination=NanoworldEnv.destination[destination_id])
        self.obs = {
            'driver': self.state.make_driver_observation(),
            'passenger': self.state.make_passenger_observation()
        }
        return self.obs

    def driver_step(self, action):
        self.state.update_state(NanoworldEnv.driver_actions[action])
        obs = self.state.make_driver_observation()
        return obs

    def passenger_step(self, action):
        self.state.update_state(NanoworldEnv.passenger_actions[action])
        obs = self.state.make_passenger_observation()
        return obs

    def compute_passenger_reward(self):
        # if self.is_supervised:
        #     return self.compute_episode_reward_supervised()
        # else:
        return self.compute_episode_reward()

    def compute_driver_reward(self):
        # return self.compute_episode_reward()

        (desired_destination, verbal_history, all_actions,
         driven_destination) = self.state.get_global_state()
        if self.state.dialog_complete:  # to compute at the very end
            if driven_destination:
                # Driver drives before the passenger says anything.
                if len(verbal_history) == 0:
                    return -1
                else:
                    last_uttered_destination = verbal_history[-1].split(" ")[1]
                    if driven_destination == last_uttered_destination:
                        return 1
                    else:
                        return -1
            else:  # timeout
                return -10
        else:
            return 0

    def compute_episode_reward(self):
        (desired_destination, verbal_history, all_actions,
         driven_destination) = self.state.get_global_state()
        if self.state.dialog_complete:  # to compute at the very end
            if driven_destination:
                if desired_destination == driven_destination:
                    return 1
                else:
                    return -1
            else:  # timeout
                return -10
        else:
            return 0

    def step(self, action_dict):
        '''
        Given an action_dict, compute the next observation, rewards, and dones
        '''
        # Initialize both observations so the lookups below cannot raise a
        # NameError when only one agent is present in action_dict.
        driver_obs = None
        passenger_obs = None

        if 'driver' in action_dict:
            driver_obs = self.driver_step(action_dict['driver'])
            if self.state.is_done():
                driver_reward = self.compute_driver_reward()
                return {'driver': driver_obs, 'passenger': self.state.make_passenger_observation()}, \
                       {'driver': driver_reward, 'passenger': self.compute_passenger_reward()}, \
                       {'__all__': self.state.is_done()}, {}

        if 'passenger' in action_dict:
            passenger_obs = self.passenger_step(action_dict['passenger'])

        self.obs = {'driver': driver_obs, 'passenger': passenger_obs}
        self.rewards = {
            'driver': self.compute_driver_reward(),
            'passenger': self.compute_passenger_reward()
        }
        self.dones = {'__all__': self.state.is_done()}
        self.infos = {}
        return self.obs, self.rewards, self.dones, self.infos
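A sketch of driving this environment by hand, with both agents acting every step (which is what step() above assumes); it relies on the DialogStateNano and logger objects defined elsewhere in the original module:

# Sketch: random rollout until the dialog is done.
env = NanoworldEnv(config={})
obs = env.reset()
dones = {'__all__': False}
while not dones['__all__']:
    action_dict = {
        'passenger': NanoworldEnv.passenger_action_space.sample(),
        'driver': NanoworldEnv.driver_action_space.sample(),
    }
    obs, rewards, dones, infos = env.step(action_dict)
print(rewards)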
Example #9
class NanoworldEnv(MultiAgentEnv):
    # Constants
    agents = ('passenger', 'driver')

    max_num_actions = 10
    parameters = [
        ".", "yes", "no", "starbucks", "peets"
    ]  # dummy destination '.' to be paired with 'OVER', 'YES', 'NO'
    # parameters = [
    #                ".",
    #                "yes",
    #                "no",
    #                "starbucks",
    #                "peets",
    #                "ralphs",
    #                "traderjoes",
    #                "wholefoods",
    #                "walmart",
    #                "cvs",
    #                "toysrus",
    #                "applestore",
    #                "bestbuy",
    #         ]

    # Action spaces
    passenger_actions = ["SAY", "OVER"]
    passenger_action_space = Tuple(
        [Discrete(len(passenger_actions)),
         Discrete(len(parameters))])

    driver_actions = ["CONFIRM", "DRIVE"]
    driver_action_space = Tuple(
        [Discrete(len(driver_actions)),
         Discrete(len(parameters))])

    # observation spaces
    # Each dialog-history entry is a (speaker, (action, parameter)) tuple;
    # Repeated accepts a single child space, so the pieces are wrapped in a
    # Tuple here.
    passenger_observation_space = Dict({
        'dialog_history': Repeated(
            Tuple([
                Discrete(len(agents)),
                Tuple([
                    Discrete(len(passenger_actions)),
                    Discrete(len(parameters))
                ])
            ]),
            max_len=max_num_actions),
        'destination': Discrete(len(parameters))
    })
    driver_observation_space = Dict({
        'dialog_history': Repeated(
            Tuple([
                Discrete(len(agents)),
                Tuple([
                    Discrete(len(passenger_actions)),
                    Discrete(len(parameters))
                ])
            ]),
            max_len=max_num_actions)
    })

    perfect_dialogs = [
        # ("starbucks", [('SAY', 'starbucks'), ('OVER', '.'), ('DRIVE', 'starbucks')]),
        # ("peets", [('SAY', 'peets'), ('OVER', '.'), ('DRIVE', 'peets')]),
        ("starbucks", [('SAY', 'starbucks'), ('OVER', '.'),
                       ('CONFIRM', 'starbucks'), ('DRIVE', 'starbucks')]),
        ("peets", [('SAY', 'peets'), ('OVER', '.'), ('CONFIRM', 'peets'),
                   ('DRIVE', 'peets')]),

        # ("starbucks", [('SAY', 'starbucks'),
        #                ('OVER', '.'),
        #                ('CONFIRM', 'starbucks'),
        #                ('OVER', '.'),
        #                ('YES', '.'),
        #                ('OVER', '.'),
        #                ('DRIVE', 'starbucks')]),
        #
        # ("peets", [('SAY', 'peets'),
        #             ('OVER', '.'),
        #             ('CONFIRM', 'peets'),
        #             ('OVER', '.'),
        #             ('YES', '.'),
        #             ('OVER', '.'),
        #             ('DRIVE', 'peets')]),
    ]

    def __init__(self, config):
        self.is_supervised = False
        destination_id = random.randint(3, len(NanoworldEnv.parameters) - 1)
        self.state = DialogStateNano(
            NanoworldEnv.max_num_actions,
            desired_destination=NanoworldEnv.parameters[destination_id])
        self.num_episodes = 0
        self.supervised_episodes = 10000
        self.rewards = dict()
        self.print_episodes = 10000

    def reset(self):
        '''
        Called before each episode, returns the first observation
        '''
        if self.num_episodes % 1000 == 0:
            logger.warning("completed {} episodes.".format(self.num_episodes))

        if self.num_episodes >= self.print_episodes:
            logger.warning('episode ' + str(self.num_episodes))
            logger.warning('------------')
            _, _, history, _ = self.state.get_global_state()
            for h in history:
                logger.warning(h)
            logger.warning('-------------')
        self.num_episodes += 1

        # select the destination

        if self.is_supervised and self.num_episodes < self.supervised_episodes:
            a_list = [3, 4]
            distribution = [.5, .5]
            destination_id = random.choices(a_list, distribution)[0]
        else:
            destination_id = random.randint(3,
                                            len(NanoworldEnv.parameters) - 1)

        if self.num_episodes >= self.print_episodes:
            logger.warning('set destination: ' +
                           NanoworldEnv.parameters[destination_id])

        self.state = DialogStateNano(
            NanoworldEnv.max_num_actions,
            desired_destination=NanoworldEnv.parameters[destination_id])

        self.obs = {'passenger': self.state.make_passenger_observation()}
        return self.obs

    def driver_step(self, action):
        a1, a2 = action
        self.state.update_state(NanoworldEnv.driver_actions[a1],
                                NanoworldEnv.parameters[a2])
        obs = self.state.make_driver_observation()
        return obs

    def passenger_step(self, action):
        a1, a2 = action
        self.state.update_state(NanoworldEnv.passenger_actions[a1],
                                NanoworldEnv.parameters[a2])
        obs = self.state.make_passenger_observation()
        return obs

    def compute_driver_reward(self):
        driver_reward = 0
        (_, verbal_history, _,
         driven_destination) = self.state.get_global_state()
        if self.state.is_done():  # to compute at the very end
            if self.state.dialog_complete:
                if driven_destination:  # completion through a final drive action
                    # Driver drives before the passenger says anything.
                    if len(verbal_history) == 0:
                        driver_reward += -1
                    else:
                        last_uttered_destination = verbal_history[-1].split(
                            " ")[1]
                        if driven_destination == last_uttered_destination:
                            driver_reward += 1
                        else:
                            driver_reward += -1
            else:  # timeout
                driver_reward += -10
        else:  # dialog not yet over
            driver_reward += 0

        if self.is_supervised:  # and self.num_episodes < self.supervised_episodes:
            driver_reward += self.compositional_supervision_reward()

        return driver_reward

    def compute_passenger_reward(self):
        (desired_destination, verbal_history, _,
         driven_destination) = self.state.get_global_state()
        passenger_reward = 0
        if self.state.is_done():  # to compute at the very end
            if self.state.dialog_complete:  # completion through a final drive action
                if desired_destination == driven_destination:
                    passenger_reward += 1
                else:
                    passenger_reward += -1
            else:  # timeout
                passenger_reward += -10
        else:  # dialog not yet over
            passenger_reward += 0

        if self.is_supervised:  # and self.num_episodes < self.supervised_episodes:
            passenger_reward += self.compositional_supervision_reward()

        return passenger_reward

    def compute_supervision_reward(self):
        desired_dest, _, all_actions, _ = self.state.get_global_state()
        dialog_so_far = ", ".join(all_actions)
        for dest, dialog_raw in NanoworldEnv.perfect_dialogs:
            dialog = ", ".join([a + " " + p for a, p in dialog_raw])
            if dest == desired_dest and dialog.startswith(
                    dialog_so_far):  # and len(all_actions) > 2:
                return 1
        return 0

    def compositional_supervision_reward(self):
        desired_dest, _, all_actions_sofar, _ = self.state.get_global_state()
        all_actions_sofar = " ".join(
            [action.split(" ")[0] for action in all_actions_sofar])
        perfect_dialog = " ".join(
            [x[0] for x in NanoworldEnv.perfect_dialogs[0][1]])
        if perfect_dialog.startswith(all_actions_sofar):
            return 1
        else:
            return 0

    # Any kind of exploration is punished, so a negative reward during
    # supervision is undesirable.
    def compute_supervision_reward_negative(self):
        desired_dest, _, all_actions, _ = self.state.get_global_state()
        dialog_so_far = ", ".join(all_actions)
        for dest, dialog_raw in NanoworldEnv.perfect_dialogs:
            dialog = ", ".join([a + " " + p for a, p in dialog_raw])
            if dest == desired_dest and dialog.startswith(
                    dialog_so_far):  # and len(all_actions) > 2:
                return 1
        return -1

    def step(self, action_dict):
        '''
        Given an action_dict, compute the next observation, rewards, and dones
        '''
        # pdb.set_trace()
        driver_obs = None
        passenger_obs = None

        if 'driver' in action_dict:
            driver_obs = self.driver_step(action_dict['driver'])
        if 'passenger' in action_dict:
            passenger_obs = self.passenger_step(action_dict['passenger'])

        if self.state.turn == 0:
            passenger_obs = self.state.make_passenger_observation()
            driver_obs = None
        elif self.state.turn == 1:
            driver_obs = self.state.make_driver_observation()
            passenger_obs = None

        self.obs = {}
        self.rewards = {}

        if passenger_obs:
            self.obs['passenger'] = passenger_obs
            self.rewards['passenger'] = self.compute_passenger_reward()

        if driver_obs:
            self.obs['driver'] = driver_obs
            self.rewards['driver'] = self.compute_driver_reward()

        self.dones = {'__all__': self.state.is_done()}

        if self.state.is_done():
            self.obs['passenger'] = self.state.make_passenger_observation()
            self.rewards['passenger'] = self.compute_passenger_reward()
            self.obs['driver'] = self.state.make_driver_observation()
            self.rewards['driver'] = self.compute_driver_reward()

        self.infos = {}

        return self.obs, self.rewards, self.dones, self.infos
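For intuition, compositional_supervision_reward() compares only the action names (not their parameters) taken so far against the first scripted dialog in perfect_dialogs; any prefix match earns +1. A standalone illustration of the same prefix check (not part of the original class):

# The first perfect dialog reduces to the action-name string
# "SAY OVER CONFIRM DRIVE"; a partial dialog that is a prefix of it scores 1.
perfect = " ".join(a for a, _ in [('SAY', 'starbucks'), ('OVER', '.'),
                                  ('CONFIRM', 'starbucks'),
                                  ('DRIVE', 'starbucks')])
so_far = "SAY OVER"                # action names taken so far
print(perfect.startswith(so_far))  # True -> reward of 1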