Example #1
from gym import Env
from gym.spaces import Discrete


class IdentityEnv(Env):
    def __init__(
            self,
            dim,
            ep_length=100,
    ):

        self.action_space = Discrete(dim)
        self.reset()

    def reset(self):
        self._choose_next_state()
        self.observation_space = self.action_space

        return self.state

    def step(self, actions):
        rew = self._get_reward(actions)
        self._choose_next_state()
        return self.state, rew, False, {}

    def _choose_next_state(self):
        self.state = self.action_space.sample()

    def _get_reward(self, actions):
        return 1 if self.state == actions else 0
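
A minimal rollout sketch for the class above; it assumes nothing beyond the two gym imports added at the top of this example.

env = IdentityEnv(dim=4)
obs = env.reset()
total_reward = 0
for _ in range(10):
    action = env.action_space.sample()          # guess the identity of the current state
    obs, reward, done, info = env.step(action)  # reward is 1 only when the guess matches
    total_reward += reward
print("reward over 10 random steps:", total_reward)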
Example #2
    def __init__(
            self,
            dim,
            ep_length=100,
    ):

        self.action_space = Discrete(dim)
        self.reset()
Example #3
class AdaptiveLearningEnv(gym.Env):

    metadata = { 'render.modes': ['human', 'rgb_array'] }

    def __init__(self, filename='activities.pkl'):
        self.filename = filename
        self.assets_dir = os.path.dirname(os.path.abspath(__file__))
        self.reward_range = (0, 1)
        self.viewer = None
        self.circle_indexs = []
        self.ob = None
        self._configure()
        self._seed()
        self._reset()

    def _configure(self):
        self._load_activities()
        self.action_space = Discrete(len(self.activities))
        self.observation_space = Box(0, 1, len(self.knowledges))
        self.simulator = StudentSimulator()

    def _load_activities(self):
        data_file = os.path.join(self.assets_dir, 'assets/%s' % self.filename)
        pkl_file = open(data_file, 'rb')
        self.knowledges = pickle.load(pkl_file)
        self.activities = pickle.load(pkl_file)
        pkl_file.close()

    def _step(self, action):
        assert self.action_space.contains(action)
        a = Activity(self.activities[action], self.knowledges)
        ob, reward, done = self.simulator.progress(self.ob, a)
        self.ob = ob
        return ob, reward, done, {}

    def _reset(self):
        self.ob = Box(0.1, 0.1, len(self.knowledges)).sample()
        return self.ob

    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return

        screen_width = 600
        screen_height = 400
        radius = 10
        init_alpha = 0.1
        margin = radius * 2.5
        max_per_line = (screen_width - margin * 2) / margin
        colors = np.array([[78,191,126], [254,178,45], [175,101,194]])/255.

        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(screen_width, screen_height)

            for (i, x) in enumerate(sorted(self.knowledges, key=lambda tup:(tup.level(), tup.group), reverse=True)):
                h, w = divmod(i, max_per_line)
                self.circle_indexs.append(x._id)
                w = screen_width - 20  - w * (margin + 10)
                h = screen_height - 20 - h * (margin + 10)
                t = self.viewer.draw_circle(radius)
                t.add_attr(rendering.Transform((w, h)))
                r, g, b = colors[x.level() - 1]
                t.set_color(r, g, b, init_alpha)
                self.viewer.add_geom(t)

        for i, x in enumerate(self.ob):
            if len(self.circle_indexs) != 0:
                t = self.viewer.geoms[self.circle_indexs.index(i)]
                k = self.knowledges[i]
                r, g, b = colors[k.level() - 1]
                t.set_color(r, g, b, x)
        return self.viewer.render(return_rgb_array = mode=='rgb_array')
Example #4
 def action(self, state: Box, action_space: Discrete) -> int:
     if self._exploration_policy.should_explore():
         return action_space.sample()
     else:
         predict = self._model.predict(np.array([state]))
         return np.argmax(predict).item()
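
The `_exploration_policy` used above is not part of this snippet; a minimal epsilon-greedy stand-in (the `EpsilonGreedy` name and `epsilon` parameter are assumptions, not the project's actual helper) might look like:

import random

class EpsilonGreedy:
    """Hypothetical exploration policy: explore with probability epsilon."""

    def __init__(self, epsilon: float = 0.1):
        self.epsilon = epsilon

    def should_explore(self) -> bool:
        # True -> caller samples a random action; False -> caller exploits the model
        return random.random() < self.epsilon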
Example #5
 def getActionSpace(self, agentIDs):
     actSpace = {}
     for agent in agentIDs:
         actSpace[agent] = Discrete(len(ACTION_MAP))
     #actSpace['state'] = Discrete(len(ACTION_MAP))
     return actSpace
Example #6
    def __init__(self,
                 initial_stacks=100,
                 small_blind=1,
                 big_blind=2,
                 render=False,
                 funds_plot=True,
                 max_raising_rounds=2,
                 use_cpp_montecarlo=False):
        """
        The table needs to be initialized once at the beginning

        Args:
            num_of_players (int): number of players that need to be added
            initial_stacks (real): initial stacks per player
            small_blind (real)
            big_blind (real)
            render (bool): render table after each move in graphical format
            funds_plot (bool): show plot of funds history at end of each episode
            max_raising_rounds (int): max raises per round per player
            use_cpp_montecarlo (bool): use the C++ Monte Carlo equity calculator instead of the Python one

        """
        if use_cpp_montecarlo:
            import cppimport
            calculator = cppimport.imp("tools.montecarlo_cpp.pymontecarlo")
            get_equity = calculator.montecarlo
        else:
            from tools.montecarlo_python import get_equity
        self.get_equity = get_equity
        self.use_cpp_montecarlo = use_cpp_montecarlo
        self.num_of_players = 0
        self.small_blind = small_blind
        self.big_blind = big_blind
        self.render_switch = render
        self.players = []
        self.table_cards = None
        self.dealer_pos = None
        self.player_status = []  # one hot encoded
        self.current_player = None
        self.player_cycle = None  # cycle iterator
        self.stage = None
        self.last_player_pot = None
        self.viewer = None
        self.player_max_win = None  # used for side pots
        self.second_round = False
        self.last_caller = None
        self.last_raiser = None
        self.raisers = []
        self.callers = []
        self.played_in_round = None
        self.min_call = None
        self.community_data = None
        self.player_data = None
        self.stage_data = None
        self.deck = None
        self.action = None
        self.winner_ix = None
        self.initial_stacks = initial_stacks
        self.acting_agent = None
        self.funds_plot = funds_plot
        self.max_round_raising = max_raising_rounds

        # pots
        self.community_pot = 0
        self.current_round_pot = 9
        self.player_pots = None  # individual player pots

        self.observation = None
        self.reward = None
        self.info = None
        self.done = False
        self.funds_history = None
        self.array_everything = None
        self.legal_moves = None
        self.illegal_move_reward = -1_000_000
        self.action_space = Discrete(len(Action) - 2)
        self.first_action_for_hand = None
Example #7
File: test_utils.py, Project: AminHP/gym
from collections import OrderedDict

import numpy as np
import pytest

from gym.spaces import Box, Dict, Discrete, MultiBinary, MultiDiscrete, Tuple, utils

spaces = [
    Discrete(3),
    Box(low=0.0, high=np.inf, shape=(2, 2)),
    Box(low=0.0, high=np.inf, shape=(2, 2), dtype=np.float16),
    Tuple([Discrete(5), Discrete(10)]),
    Tuple([
        Discrete(5),
        Box(low=np.array([0, 0]), high=np.array([1, 5]), dtype=np.float32),
    ]),
    Tuple((Discrete(5), Discrete(2), Discrete(2))),
    MultiDiscrete([2, 2, 10]),
    MultiBinary(10),
    Dict({
        "position":
        Discrete(5),
        "velocity":
        Box(low=np.array([0, 0]), high=np.array([1, 5]), dtype=np.float32),
    }),
]

flatdims = [3, 4, 4, 15, 7, 9, 14, 10, 7]


@pytest.mark.parametrize(["space", "flatdim"], zip(spaces, flatdims))
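
The test body is cut off here; a sketch of the kind of check this parametrization typically drives, using `gym.spaces.utils` (the expected sizes are the `flatdims` listed above):

from gym.spaces import Discrete, utils

space = Discrete(3)
assert utils.flatdim(space) == 3           # first entry of `flatdims`
sample = space.sample()
flat = utils.flatten(space, sample)        # one-hot vector of length 3
assert utils.unflatten(space, flat) == sample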
Example #8
File: base.py, Project: Shushman/metaworld
 def discretize_goal_space(self, goals):
     assert False
     assert len(goals) >= 1
     self.discrete_goals = goals
     # update the goal_space to a Discrete space
     self.discrete_goal_space = Discrete(len(self.discrete_goals))
Example #9
class RockEnv(Env):
    metadata = {"render.modes": ["human", "ansi"]}

    def __init__(self,
                 board_size=7,
                 num_rocks=8,
                 use_heuristic=False,
                 observation='o',
                 stay_inside=False):
        """

        :param board_size: int board is a square of board_size x board_size
        :param num_rocks: int number of rocks on board
        :param use_heuristic: bool usage unclear
        :param observation: str must be one of
                                'o': observed value only
                                'oa': observed value + the action taken
                                'po': position of the agent + observed value
                                'poa': position of the agent + observed value + the action taken
        """

        assert board_size in list(config.keys()) and \
               num_rocks == len(config[board_size]["rock_pos"])

        self.num_rocks = num_rocks
        self._use_heuristic = use_heuristic

        self._rock_pos = \
            [Coord(*rock) for rock in config[board_size]['rock_pos']]
        self._agent_pos = Coord(*config[board_size]['init_pos'])
        self.grid = Grid(board_size, board_size)

        for idx, rock in enumerate(self._rock_pos):
            self.grid.board[rock] = idx

        self.action_space = Discrete(len(Action) + self.num_rocks)
        self._discount = .95
        self._reward_range = 20
        self._penalization = -100
        self._query = 0
        if stay_inside:
            self._out_of_bounds_penalty = 0
        else:
            self._out_of_bounds_penalty = self._penalization

        self.state = None
        self.last_action = None
        self.done = False

        self.gui = None

        assert observation in ['o', 'oa', 'po', 'poa']
        if observation == 'o':
            self._make_obs = lambda obs, a: obs
            self.observation_space = Discrete(len(Obs))
        elif observation == 'oa':
            self._make_obs = self._oa
            self.observation_space =\
                Box(low=0,
                    high=np.append(max(Obs), np.ones(self.action_space.n)),
                    dtype=np.int)

        elif observation == 'po':
            self._make_obs = self._po
            self.observation_space = \
                Box(low=0,
                    high=np.append(np.ones(self.grid.n_tiles), max(Obs)),
                    dtype=np.int)

        elif observation == 'poa':
            self._make_obs = self._poa
            self.observation_space = \
                Box(low=0,
                    high=np.concatenate((np.ones(self.grid.n_tiles),
                                         [max(Obs)],
                                        np.ones(self.action_space.n))),
                    dtype=np.int)

    def seed(self, seed=None):
        np.random.seed(seed)

    def step(self, action: int):
        err_msg = "%r (%s) invalid" % (action, type(action))
        assert self.action_space.contains(action), err_msg
        assert self.done is False

        self.last_action = action
        self._query += 1

        reward = 0
        ob = Obs.NULL

        if action < Action.SAMPLE:
            if action == Action.EAST:
                if self.state.agent_pos.x + 1 < self.grid.x_size:
                    self.state.agent_pos += Moves.EAST.value
                else:
                    reward = 10
                    self.done = True
                    ob = self._make_obs(ob, action)
                    return ob, reward, self.done, {
                        "state": self._encode_state(self.state)
                    }
            elif action == Action.NORTH:
                if self.state.agent_pos.y + 1 < self.grid.y_size:
                    self.state.agent_pos += Moves.NORTH.value
                else:
                    reward = self._out_of_bounds_penalty
            elif action == Action.SOUTH:
                if self.state.agent_pos.y - 1 >= 0:
                    self.state.agent_pos += Moves.SOUTH.value
                else:
                    reward = self._out_of_bounds_penalty
            elif action == Action.WEST:
                if self.state.agent_pos.x - 1 >= 0:
                    self.state.agent_pos += Moves.WEST.value
                else:
                    reward = self._out_of_bounds_penalty
            else:
                raise NotImplementedError()

        if action == Action.SAMPLE:
            rock = self.grid[self.state.agent_pos]
            if rock >= 0 and not self.state.rocks[
                    rock].status == 0:  # collected
                if self.state.rocks[rock].status == 1:
                    reward = 10
                else:
                    reward = -10
                self.state.rocks[rock].status = 0
            else:
                reward = self._penalization

        if action > Action.SAMPLE:
            rock = action - Action.SAMPLE - 1
            assert rock < self.num_rocks

            ob = self._sample_ob(self.state.agent_pos, self.state.rocks[rock])

            self.state.rocks[rock].measured += 1

            eff = self._efficiency(self.state.agent_pos,
                                   self.state.rocks[rock].pos)

            if ob == Obs.GOOD:
                self.state.rocks[rock].count += 1
                self.state.rocks[rock].lkv *= eff
                self.state.rocks[rock].lkw *= (1 - eff)
            else:
                self.state.rocks[rock].count -= 1
                self.state.rocks[rock].lkw *= eff
                self.state.rocks[rock].lkv *= (1 - eff)

                denominator = (.5 * self.state.rocks[rock].lkv) + (
                    .5 * self.state.rocks[rock].lkw) + 1e-10
                self.state.rocks[rock].prob_valuable = \
                    (.5 * self.state.rocks[rock].lkv) / denominator

        self.done = self._penalization == reward
        ob = self._make_obs(ob, action)
        return ob, reward, self.done, {"state": self._encode_state(self.state)}

    def _decode_state(self, state, as_array=False):

        agent_pos = Coord(*state['agent_pos'])
        rock_state = RockState(agent_pos)
        for r in state['rocks']:
            rock = Rock(pos=0)
            rock.__dict__.update(r)
            rock_state.rocks.append(rock)

        if as_array:
            rocks = []
            for rock in rock_state.rocks:
                rocks.append(rock.status)

            return np.concatenate([[self.grid.get_index(agent_pos)], rocks])

        return rock_state

    @staticmethod
    def _encode_state(state):
        # use dictionary for state encoding

        return _encode_dict(state)
        # rocks can take 3 values: -1, 1, 0 if collected

    def render(self, mode='human', close=False):
        if close:
            return
        if mode == "human":
            msg = None
            if self.gui is None:
                start_pos = self.grid.get_index(self.state.agent_pos)
                obj_pos = [(self.grid.get_index(rock.pos), rock.status)
                           for rock in self.state.rocks]
                self.gui = RockGui((self.grid.x_size, self.grid.y_size),
                                   start_pos=start_pos,
                                   obj=obj_pos)

            if self.last_action > Action.SAMPLE:
                rock = self.last_action - Action.SAMPLE - 1
                msg = "Rock S: {} P:{}".format(self.state.rocks[rock].status,
                                               self.state.rocks[rock].pos)
            agent_pos = self.grid.get_index(self.state.agent_pos)
            self.gui.render(agent_pos, msg)

    def reset(self):
        self.done = False
        self._query = 0
        self.last_action = Action.SAMPLE
        self.state = self._get_init_state(should_encode=False)
        return self._make_obs(Obs.NULL, self.last_action)

    def _set_state(self, state):
        self.done = False
        self.state = self._decode_state(state)

    def close(self):
        self.render(close=True)

    def _compute_prob(self, action, next_state, ob):

        next_state = self._decode_state(next_state)

        if action <= Action.SAMPLE:
            return int(ob == Obs.NULL)

        eff = self._efficiency(
            next_state.agent_pos,
            next_state.rocks[action - Action.SAMPLE - 1].pos)

        if ob == Obs.GOOD and next_state.rocks[action - Action.SAMPLE -
                                               1].status == 1:
            return eff
        elif ob == Obs.BAD and next_state.rocks[action - Action.SAMPLE -
                                                1].status == -1:
            return eff
        else:
            return 1 - eff

    def _get_init_state(self, should_encode=True):

        rock_state = RockState(self._agent_pos)
        for idx in range(self.num_rocks):
            rock_state.rocks.append(Rock(self._rock_pos[idx]))
        return self._encode_state(rock_state) if should_encode else rock_state

    def _generate_legal(self):
        legal = [Action.EAST]  # can always go east
        if self.state.agent_pos.y + 1 < self.grid.y_size:
            legal.append(Action.NORTH)

        if self.state.agent_pos.y - 1 >= 0:
            legal.append(Action.SOUTH)
        if self.state.agent_pos.x - 1 >= 0:
            legal.append(Action.WEST)

        rock = self.grid[self.state.agent_pos]
        if rock >= 0 and self.state.rocks[rock].status != 0:
            legal.append(Action.SAMPLE)

        for rock in self.state.rocks:
            assert self.grid[rock.pos] != -1
            if rock.status != 0:
                legal.append(self.grid[rock.pos] + 1 + Action.SAMPLE)
        return legal

    def _generate_preferred(self, history):
        if not self._use_heuristic:
            return self._generate_legal()

        actions = []

        # sample rocks with high likelihood of being good
        rock = self.grid[self.state.agent_pos]
        if rock >= 0 and self.state.rocks[rock].status != 0 and history.size:
            total = 0
            # history
            for t in range(history.size):
                if history[t].action == rock + 1 + Action.SAMPLE:
                    if history[t].ob == Obs.GOOD:
                        total += 1
                    elif history[t].ob == Obs.BAD:
                        total -= 1
            if total > 0:
                actions.append(Action.SAMPLE)
                return actions

        # process the rocks

        all_bad = True
        direction = {
            "north": False,
            "south": False,
            "west": False,
            "east": False
        }
        for idx in range(self.num_rocks):
            rock = self.state.rocks[idx]
            if rock.status != 0:
                total = 0
                for t in range(history.size):
                    if history[t].action == idx + 1 + Action.SAMPLE:
                        if history[t].ob == Obs.GOOD:
                            total += 1
                        elif history[t].ob == Obs.BAD:
                            total -= 1
                if total >= 0:
                    all_bad = False

                    if rock.pos.y > self.state.agent_pos.y:
                        direction['north'] = True
                    elif rock.pos.y < self.state.agent_pos.y:
                        direction['south'] = True
                    elif rock.pos.x < self.state.agent_pos.x:
                        direction['west'] = True
                    elif rock.pos.x > self.state.agent_pos.x:
                        direction['east'] = True

        if all_bad:
            actions.append(Action.EAST)
            return actions

        # generate a random legal move
        # do not measure a collected rock
        # do not measure a rock too often
        # do not measure clearly bad rocks
        # don't move in a direction that puts you closer to bad rocks
        # never sample a rock

        if self.state.agent_pos.y + 1 < self.grid.y_size and\
                direction['north']:
            actions.append(Action.NORTH)

        if direction['east']:
            actions.append(Action.EAST)

        if self.state.agent_pos.y - 1 >= 0 and direction['south']:
            actions.append(Action.SOUTH)

        if self.state.agent_pos.x - 1 >= 0 and direction['west']:
            actions.append(Action.WEST)

        for idx, rock in enumerate(self.state.rocks):
            if not rock.status == 0 and rock.measured < 5 and abs(
                    rock.count) < 2 and 0 < rock.prob_valuable < 1:
                actions.append(idx + 1 + Action.SAMPLE)

        if len(actions) == 0:
            return self._generate_legal()

        return actions

    def __dict2np__(self, state):
        idx = self.grid.get_index(Coord(*state['agent_pos']))
        rocks = []
        for rock in state['rocks']:
            rocks.append(rock['status'])
        return np.concatenate([[idx], rocks])

    @staticmethod
    def _efficiency(agent_pos, rock_pos, hed=20):
        # TODO check me
        d = Grid.euclidean_distance(agent_pos, rock_pos)
        eff = (1 + pow(2, -d / hed)) * .5
        return eff

    @staticmethod
    def _select_target(rock_state, x_size):
        best_dist = x_size * 2
        best_rock = -1  # Coord(-1, -1)
        for idx, rock in enumerate(rock_state.rocks):
            if rock.status != 0 and rock.count >= 0:
                d = Grid.manhattan_distance(rock_state.agent_pos, rock.pos)
                if d < best_dist:
                    best_dist = d
                    best_rock = idx  # rock.pos
        return best_rock

    @staticmethod
    def _sample_ob(agent_pos, rock, hed=20):
        eff = RockEnv._efficiency(agent_pos, rock.pos, hed=hed)
        if np.random.binomial(1, eff):
            return Obs.GOOD if rock.status == 1 else Obs.BAD
        else:
            return Obs.BAD if rock.status == 1 else Obs.GOOD

    def _po(self, o, _):
        obs = np.zeros(self.observation_space.shape[0])
        obs[self.grid.x_size * self.state.agent_pos.y +
            self.state.agent_pos.x] = 1.
        obs[self.grid.n_tiles] = o
        return obs

    def _poa(self, o, a):
        obs = self._po(o, a)
        obs[self.grid.n_tiles + a] = 1.
        return obs

    def _oa(self, o, a):
        obs = np.zeros(self.observation_space.shape[0])
        obs[0] = o
        obs[1 + a] = 1.
        return obs
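
A hedged rollout sketch for `RockEnv`; it assumes the module-level `config`, `Coord`, `Grid`, `Action` and `Obs` helpers that the class relies on, which are not shown here.

env = RockEnv(board_size=7, num_rocks=8, observation='o')
ob = env.reset()
done = False
while not done:
    ob, reward, done, info = env.step(env.action_space.sample())  # random move/sample/check action
print("final reward:", reward)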
Example #10
class NanoworldEnv(MultiAgentEnv):
    # Constants
    agents = ('passenger', 'driver')

    max_num_actions = 10
    parameters = [
        ".", "yes", "no", "starbucks", "peets"
    ]  # dummy destination '.' to be paired with 'OVER', 'YES', 'NO'
    # parameters = [
    #                ".",
    #                "yes",
    #                "no",
    #                "starbucks",
    #                "peets",
    #                "ralphs",
    #                "traderjoes",
    #                "wholefoods",
    #                "walmart",
    #                "cvs",
    #                "toysrus",
    #                "applestore",
    #                "bestbuy",
    #         ]

    # Action spaces
    passenger_actions = ["SAY", "OVER"]
    passenger_action_space = Tuple(
        [Discrete(len(passenger_actions)),
         Discrete(len(parameters))])

    driver_actions = ["CONFIRM", "DRIVE"]
    driver_action_space = Tuple(
        [Discrete(len(driver_actions)),
         Discrete(len(parameters))])

    # observation spaces
    passenger_observation_space = Dict({
        'dialog_history':
        Repeated(Discrete(len(agents)),
                 Tuple([
                     Discrete(len(passenger_actions)),
                     Discrete(len(parameters))
                 ]),
                 max_len=max_num_actions),
        'destination':
        Discrete(len(parameters))
    })
    driver_observation_space = Dict({
        'dialog_history':
        Repeated(Discrete(len(agents)),
                 Tuple([
                     Discrete(len(passenger_actions)),
                     Discrete(len(parameters))
                 ]),
                 max_len=max_num_actions)
    })

    perfect_dialogs = [
        # ("starbucks", [('SAY', 'starbucks'), ('OVER', '.'), ('DRIVE', 'starbucks')]),
        # ("peets", [('SAY', 'peets'), ('OVER', '.'), ('DRIVE', 'peets')]),
        ("starbucks", [('SAY', 'starbucks'), ('OVER', '.'),
                       ('CONFIRM', 'starbucks'), ('DRIVE', 'starbucks')]),
        ("peets", [('SAY', 'peets'), ('OVER', '.'), ('CONFIRM', 'peets'),
                   ('DRIVE', 'peets')]),

        # ("starbucks", [('SAY', 'starbucks'),
        #                ('OVER', '.'),
        #                ('CONFIRM', 'starbucks'),
        #                ('OVER', '.'),
        #                ('YES', '.'),
        #                ('OVER', '.'),
        #                ('DRIVE', 'starbucks')]),
        #
        # ("peets", [('SAY', 'peets'),
        #             ('OVER', '.'),
        #             ('CONFIRM', 'peets'),
        #             ('OVER', '.'),
        #             ('YES', '.'),
        #             ('OVER', '.'),
        #             ('DRIVE', 'peets')]),
    ]

    def __init__(self, config):
        self.is_supervised = False
        destination_id = random.randint(3, len(NanoworldEnv.parameters) - 1)
        self.state = DialogStateNano(
            NanoworldEnv.max_num_actions,
            desired_destination=NanoworldEnv.parameters[destination_id])
        self.num_episodes = 0
        self.supervised_episodes = 10000
        self.rewards = dict()
        self.print_episodes = 10000

    def reset(self):
        '''
        Called before each episode, returns the first observation
        '''
        if self.num_episodes % 1000 == 0:
            logger.warning("completed {} episodes.".format(self.num_episodes))

        if self.num_episodes >= self.print_episodes:
            logger.warning('episode ' + str(self.num_episodes))
            logger.warning('------------')
            _, _, history, _ = self.state.get_global_state()
            for h in history:
                logger.warning(h)
            logger.warning('-------------')
        self.num_episodes += 1

        # select the destination

        if self.is_supervised and self.num_episodes < self.supervised_episodes:
            a_list = [3, 4]
            distribution = [.5, .5]
            destination_id = random.choices(a_list, distribution)[0]
        else:
            destination_id = random.randint(3,
                                            len(NanoworldEnv.parameters) - 1)

        if self.num_episodes >= self.print_episodes:
            logger.warning('set destination: ' +
                           NanoworldEnv.parameters[destination_id])

        self.state = DialogStateNano(
            NanoworldEnv.max_num_actions,
            desired_destination=NanoworldEnv.parameters[destination_id])

        self.obs = {'passenger': self.state.make_passenger_observation()}
        return self.obs

    def driver_step(self, action):
        a1, a2 = action
        self.state.update_state(NanoworldEnv.driver_actions[a1],
                                NanoworldEnv.parameters[a2])
        obs = self.state.make_driver_observation()
        return obs

    def passenger_step(self, action):
        a1, a2 = action
        self.state.update_state(NanoworldEnv.passenger_actions[a1],
                                NanoworldEnv.parameters[a2])
        obs = self.state.make_passenger_observation()
        return obs

    def compute_driver_reward(self):
        driver_reward = 0
        _, verbal_history, _, driven_destination = self.state.get_global_state(
        )
        if self.state.is_done():  # to compute at the very end
            if self.state.dialog_complete:
                if driven_destination:  # completion through a final drive action
                    if len(verbal_history
                           ) == 0:  # driver drives before user says anything
                        driver_reward += -1
                    else:
                        last_uttered_destination = verbal_history[-1].split(
                            " ")[1]
                        if driven_destination == last_uttered_destination:
                            driver_reward += 1
                        else:
                            driver_reward += -1
            else:  # timeout
                driver_reward += -10
        else:  # dialog not yet over
            driver_reward += 0

        if self.is_supervised:  # and self.num_episodes < self.supervised_episodes:
            driver_reward += self.compositional_supervision_reward()

        return driver_reward

    def compute_passenger_reward(self):
        desired_destination, verbal_history, _, driven_destination = self.state.get_global_state(
        )
        passenger_reward = 0
        if self.state.is_done():  # to compute at the very end
            if self.state.dialog_complete:  # completion through a final drive action
                if desired_destination == driven_destination:
                    passenger_reward += 1
                else:
                    passenger_reward += -1
            else:  # timeout
                passenger_reward += -10
        else:  # dialog not yet over
            passenger_reward += 0

        if self.is_supervised:  # and self.num_episodes < self.supervised_episodes:
            passenger_reward += self.compositional_supervision_reward()

        return passenger_reward

    def compute_supervision_reward(self):
        desired_dest, _, all_actions, _ = self.state.get_global_state()
        dialog_so_far = ", ".join(all_actions)
        for dest, dialog_raw in NanoworldEnv.perfect_dialogs:
            dialog = ", ".join([a + " " + p for a, p in dialog_raw])
            if dest == desired_dest and dialog.startswith(
                    dialog_so_far):  # and len(all_actions) > 2:
                return 1
        return 0

    def compositional_supervision_reward(self):
        desired_dest, _, all_actions_sofar, _ = self.state.get_global_state()
        all_actions_sofar = " ".join(
            [action.split(" ")[0] for action in all_actions_sofar])
        perfect_dialog = " ".join(
            [x[0] for x in NanoworldEnv.perfect_dialogs[0][1]])
        if perfect_dialog.startswith(all_actions_sofar):
            return 1
        else:
            return 0

    # any kind of exploration is punished, so a negative reward in supervision is bad
    def compute_supervision_reward_negative(self):
        desired_dest, _, all_actions, _ = self.state.get_global_state()
        dialog_so_far = ", ".join(all_actions)
        for dest, dialog_raw in NanoworldEnv.perfect_dialogs:
            dialog = ", ".join([a + " " + p for a, p in dialog_raw])
            if dest == desired_dest and dialog.startswith(
                    dialog_so_far):  # and len(all_actions) > 2:
                return 1
        return -1

    def step(self, action_dict):
        '''
        Given an action_dict, compute the next observation, rewards, and dones
        '''
        # pdb.set_trace()
        driver_obs = None
        passenger_obs = None

        if 'driver' in action_dict:
            driver_obs = self.driver_step(action_dict['driver'])
        if 'passenger' in action_dict:
            passenger_obs = self.passenger_step(action_dict['passenger'])

        if self.state.turn == 0:
            passenger_obs = self.state.make_passenger_observation()
            driver_obs = None
        elif self.state.turn == 1:
            driver_obs = self.state.make_driver_observation()
            passenger_obs = None

        self.obs = {}
        self.rewards = {}

        if passenger_obs:
            self.obs['passenger'] = passenger_obs
            self.rewards['passenger'] = self.compute_passenger_reward()

        if driver_obs:
            self.obs['driver'] = driver_obs
            self.rewards['driver'] = self.compute_driver_reward()

        self.dones = {'__all__': self.state.is_done()}

        if self.state.is_done():
            self.obs['passenger'] = self.state.make_passenger_observation()
            self.rewards['passenger'] = self.compute_passenger_reward()
            self.obs['driver'] = self.state.make_driver_observation()
            self.rewards['driver'] = self.compute_driver_reward()

        self.infos = {}

        return self.obs, self.rewards, self.dones, self.infos
Example #11
import gym
from gym.spaces import Box, Discrete, Tuple, Dict
from gym.envs.registration import EnvSpec
import numpy as np
import sys

import ray
from ray.rllib.agents.registry import get_agent_class
from ray.rllib.test.test_multi_agent_env import MultiCartpole, MultiMountainCar
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.tune.registry import register_env

ACTION_SPACES_TO_TEST = {
    "discrete":
    Discrete(5),
    "vector":
    Box(-1.0, 1.0, (5, ), dtype=np.float32),
    "tuple":
    Tuple([Discrete(2),
           Discrete(3),
           Box(-1.0, 1.0, (5, ), dtype=np.float32)]),
}

OBSERVATION_SPACES_TO_TEST = {
    "discrete":
    Discrete(5),
    "vector":
    Box(-1.0, 1.0, (5, ), dtype=np.float32),
    "image":
    Box(-1.0, 1.0, (84, 84, 1), dtype=np.float32),
Example #12
    print(f"Running with following CLI args: {args}")
    return args


if __name__ == "__main__":
    args = get_cli_args()

    ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)

    # main part: configure the ActionMaskEnv and ActionMaskModel
    config = {
        # random env with 100 discrete actions and 5x [-1,1] observations
        # some actions are declared invalid and lead to errors
        "env": ActionMaskEnv,
        "env_config": {
            "action_space": Discrete(100),
            "observation_space": Box(-1.0, 1.0, (5, )),
        },
        # the ActionMaskModel retrieves the invalid actions and avoids them
        "model": {
            "custom_model": ActionMaskModel
            if args.framework != "torch" else TorchActionMaskModel,
            # disable action masking according to CLI
            "custom_model_config": {
                "no_masking": args.no_masking
            },
        },
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "framework": args.framework,
        # Run with tracing enabled for tfe/tf2?
Example #13
 def action_space(self):
     return Discrete(5)
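
For reference, a short sketch of what the returned `Discrete(5)` space exposes (standard `gym.spaces` behaviour):

from gym.spaces import Discrete

space = Discrete(5)          # integer actions 0, 1, 2, 3, 4
assert space.n == 5
action = space.sample()      # uniform random integer in [0, 5)
assert space.contains(action)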
Example #14
class ConnectFourEnv(Env):
    r"""
    An adversarial environment for playing the `Connect-Four game
    <https://en.wikipedia.org/wiki/Connect_Four>`_.

    Attributes
    ----------
    action_space : gym.spaces.Discrete(7)
        The action space.

    observation_space : MultiDiscrete(nvec)

        The state observation space, representing the position of the current
        player's tokens (``s[1:,:,0]``) and the other player's tokens
        (``s[1:,:,1]``) as well as a mask over the space of actions, indicating
        which actions are available to the current player (``s[0,:,0]``) or the
        other player (``s[0,:,1]``).

        **Note:** The "current" player is relative to whose turn it is, which
        means that the entries ``s[:,:,0]`` and ``s[:,:,1]`` swap between
        turns.

    max_time_steps : int
        Maximum number of timesteps within each episode.

    available_actions : array of int
        Array of available actions. This list shrinks when columns saturate.

    win_reward : 1.0
        The reward associated with a win.

    loss_reward : -1.0
        The reward associated with a loss.

    draw_reward : 0.0
        The reward associated with a draw.

    """  # noqa: E501
    # class attributes
    num_rows = 6
    num_cols = 7
    num_players = 2
    win_reward = 1.0
    loss_reward = -win_reward
    draw_reward = 0.0
    action_space = Discrete(num_cols)
    observation_space = MultiDiscrete(
        nvec=np.full((num_rows + 1, num_cols, num_players), 2, dtype='uint8'))
    max_time_steps = int(num_rows * num_cols)
    filters = np.array([
        [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1]],
        [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [0, 0, 0, 0]],
        [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
        [[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], [1, 0, 0, 0]],
        [[0, 0, 0, 0], [1, 1, 1, 1], [0, 0, 0, 0], [0, 0, 0, 0]],
        [[1, 1, 1, 1], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
        [[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0]],
        [[0, 1, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0]],
        [[0, 0, 1, 0], [0, 0, 1, 0], [0, 0, 1, 0], [0, 0, 1, 0]],
        [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1]],
    ],
                       dtype='uint8')

    def __init__(self):
        self._init_state()

    def reset(self):
        r"""
        Reset the environment to the starting position.

        Returns
        -------
        s : 3d-array, shape: [num_rows + 1, num_cols, num_players]

            A state observation, representing the position of the current
            player's tokens (``s[1:,:,0]``) and the other player's tokens
            (``s[1:,:,1]``) as well as a mask over the space of actions,
            indicating which actions are available to the current player
            (``s[0,:,0]``) or the other player (``s[0,:,1]``).

            **Note:** The "current" player is relative to whose turn it is,
            which means that the entries ``s[:,:,0]`` and ``s[:,:,1]`` swap
            between turns.

        """
        self._init_state()
        return self.state

    def step(self, a):
        r"""
        Take one step in the MDP, following the single-player convention from
        gym.

        Parameters
        ----------
        a : int, options: {0, 1, 2, 3, 4, 5, 6}
            The action to be taken. The action is the zero-based count of the
            possible insertion slots, starting from the left of the board.

        Returns
        -------
        s_next : array, shape [6, 7, 2]

            A next-state observation, representing the position of the current
            player's tokens (``s[1:,:,0]``) and the other player's tokens
            (``s[1:,:,1]``) as well as a mask over the space of actions,
            indicating which actions are available to the current player
            (``s[0,:,0]``) or the other player (``s[0,:,1]``).

            **Note:** The "current" player is relative to whose turn it is,
            which means that the entries ``s[:,:,0]`` and ``s[:,:,1]`` swap
            between turns.

        r : float
            Reward associated with the transition
            :math:`(s, a)\to s_\text{next}`.

            **Note:** Since "current" player is relative to whose turn it is,
            you need to be careful about aligning the rewards with the correct
            state or state-action pair. In particular, this reward :math:`r` is
            the one associated with the :math:`s` and :math:`a`, i.e. *not*
            aligned with :math:`s_\text{next}`.

        done : bool
            Whether the episode is done.

        info : dict or None
            A dict with some extra information (or None).

        """
        if self.done:
            raise EpisodeDoneError("please reset env to start new episode")
        if not self.action_space.contains(a):
            raise ValueError(f"invalid action: {repr(a)}")
        if a not in self.available_actions:
            raise UnavailableActionError("action is not available")

        # swap players
        self._players = np.roll(self._players, -1)

        # update state
        self._state[self._levels[a], a] = self._players[0]
        self._prev_action = a

        # run logic
        self.done, reward = self._done_reward(a)
        return self.state, reward, self.done, {'state_id': self.state_id}

    def render(self, *args, **kwargs):
        r"""
        Render the current state of the environment.

        """
        # lookup for symbols
        symbol = {
            1: u'\u25CF',  # player 1 token (agent)
            2: u'\u25CB',  # player 2 token (adversary)
            -1: u'\u25BD',  # indicator for player 1's last action
            -2: u'\u25BC',  # indicator for player 2's last action
        }

        # render board
        hrule = '+---' * self.num_cols + '+\n'
        board = "  "
        board += "   ".join(
            symbol.get(-(a == self._prev_action) * self._players[1], " ")
            for a in range(self.num_cols))
        board += "  \n"
        board += hrule
        for i in range(self.num_rows):
            board += "| "
            board += " | ".join(
                symbol.get(self._state[i, j], " ")
                for j in range(self.num_cols))
            board += " |\n"
            board += hrule
        board += "  0   1   2   3   4   5   6  \n"  # actions

        print(board)

    @property
    def state(self):
        stacked_layers = np.stack(
            (
                (self._state == self._players[0]).astype('uint8'),
                (self._state == self._players[1]).astype('uint8'),
            ),
            axis=-1)  # shape: [num_rows, num_cols, num_players]
        available_actions_mask = np.zeros((1, self.num_cols, self.num_players),
                                          dtype='uint8')
        available_actions_mask[0, self.available_actions, :] = 1
        return np.concatenate((available_actions_mask, stacked_layers), axis=0)

    @property
    def state_id(self):
        p = str(self._players[0])
        d = '1' if self.done else '0'
        if self._prev_action is None:
            a = str(self.num_cols)
        else:
            a = str(self._prev_action)
        s = ''.join(self._state.ravel().astype('str'))  # base-3 string
        s = '{:017x}'.format(int(s, 3))  # 17-char hex string
        return p + d + a + s  # 20-char hex string

    def set_state(self, state_id):
        # decode state id
        p = int(state_id[0], 16)
        d = int(state_id[1], 16)
        a = int(state_id[2], 16)
        assert p in (1, 2)
        assert d in (0, 1)
        assert self.action_space.contains(a) or a == self.num_cols
        self._players[0] = p  # 1 or 2
        self._players[1] = 3 - p  # 2 or 1
        self.done = d == 1
        self._prev_action = None if a == self.num_cols else a
        s = np.base_repr(int(state_id[3:], 16), 3)
        z = np.zeros(self.num_rows * self.num_cols, dtype='uint8')
        z[-len(s):] = np.array(list(s), dtype='uint8')
        self._state = z.reshape((self.num_rows, self.num_cols))
        self._levels = np.full(self.num_cols, self.num_rows - 1, dtype='uint8')
        for j in range(self.num_cols):
            for i in self._state[::-1, j]:
                if i == 0:
                    break
                self._levels[j] -= 1

    @property
    def available_actions(self):
        actions = np.argwhere((self._levels >= 0)
                              & (self._levels < self.num_rows)).ravel()
        assert actions.size <= self.num_cols
        return actions

    @property
    def available_actions_mask(self):
        mask = np.zeros(self.num_cols, dtype='bool')
        mask[self.available_actions] = True
        return mask

    def _init_state(self):
        self._prev_action = None
        self._players = np.array([1, 2], dtype='uint8')
        self._state = np.zeros((self.num_rows, self.num_cols), dtype='uint8')
        self._levels = np.full(self.num_cols, self.num_rows - 1, dtype='uint8')
        self.done = False

    def _done_reward(self, a):
        r"""
        Check whether the last action `a` by the current player resulted in a
        win or draw for player 1 (the agent). This contains the main logic and
        implements the rules of the game.

        """
        assert self.action_space.contains(a)

        # update filling levels
        self._levels[a] -= 1

        s = self._state == self._players[0]
        for i0 in range(2, -1, -1):
            i1 = i0 + 4
            for j0 in range(4):
                j1 = j0 + 4
                if np.any(np.tensordot(self.filters, s[i0:i1, j0:j1]) == 4):
                    return True, 1.0

        # check for a draw
        if len(self.available_actions) == 0:
            return True, 0.0

        # this is what's returned throughout the episode
        return False, 0.0
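
A hedged self-play sketch for the class above: both sides pick uniformly from `available_actions` until a win or draw (assumes `numpy` and the class definition are in scope).

import numpy as np

env = ConnectFourEnv()
s = env.reset()
done = False
while not done:
    a = int(np.random.choice(env.available_actions))   # random legal column
    s, r, done, info = env.step(a)
env.render()                                            # print the final board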
Example #15
    def __init__(self, env_name, config, bootstrap_env=None):
        """
        :param env_name: Name of the environment to create
        :param config:  Configuration to use
        :param bootstrap_env: Optional pre-constructed environment to reuse instead of creating a new one
        """

        self.tolerance = 0.5
        self.env_type = None
        self.env_name = env_name
        self.config = config
        

        if env_name == 'MAB':
            # Mario Brother environment
            raise NotImplementedError()

        elif env_name == 'combolock':
            # Deterministic Combination Lock

            self.env_type = GenerateEnvironmentWrapper.RL_ACID
            self.thread_safe = True

            assert config["obs_dim"] == 3 * config["horizon"] + 2, "Set obs_dim to -1 in config for auto selection"
            if bootstrap_env is not None:
                self.env = bootstrap_env
            else:
                self.env = CombinationLock(horizon=config["horizon"])

            # Reach both states at a given time step with probability at least 0.5 (minus some tolerance)
            self.homing_policy_validation_fn = lambda dist, step: str((0, step)) in dist and str((1, step)) in dist and \
                                                                  dist[str((0, step))] > 50 - self.tolerance and \
                                                                  dist[str((1, step))] > 50 - self.tolerance

        elif env_name == 'stochcombolock':
            # Stochastic Combination Lock

            self.env_type = GenerateEnvironmentWrapper.RL_ACID
            self.thread_safe = True

            if config["noise"] == "bernoulli":
                self.noise_type = Environment.BERNOULLI
                assert config["obs_dim"] == 4 * config["horizon"] + 3, "Set obs_dim to -1 in config for auto selection"
            elif config["noise"] == "gaussian":
                self.noise_type = Environment.GAUSSIAN
                assert config["obs_dim"] == 3 * config["horizon"] + 3, "Set obs_dim to -1 in config for auto selection"
            else:
                raise AssertionError("Unhandled noise type %r" % self.noise_type)

            if bootstrap_env is not None:
                self.env = bootstrap_env
            else:
                self.env = StochasticCombinationLock(horizon=config["horizon"], swap=0.5, noise_type=self.noise_type)

            # Reach the two states with probability at least 0.25 each and the third state with probability at least 0.5
            self.homing_policy_validation_fn = lambda dist, step: \
                str((0, step)) in dist and str((1, step)) in dist and str((2, step)) in dist and \
                dist[str((0, step))] + dist[str((1, step))] > 50 - self.tolerance and \
                dist[str((2, step))] > 50 - self.tolerance

        elif env_name == 'diabcombolock':
            # Diabolical Stochastic Combination Lock

            self.env_type = GenerateEnvironmentWrapper.RL_ACID
            self.thread_safe = True
            self.trajectories = []
            self.trajectory_cntr = 0
            self.num_envs = 1

            if config["noise"] == "bernoulli":

                self.noise_type = Environment.BERNOULLI
                assert config["obs_dim"] == 2 * config["horizon"] + 4, "Set obs_dim to -1 in config for auto selection"

            elif config["noise"] == "gaussian":

                self.noise_type = Environment.GAUSSIAN
                assert config["obs_dim"] == config["horizon"] + 4, "Set obs_dim to -1 in config for auto selection"

            elif config["noise"] == "hadamhard":

                self.noise_type = Environment.HADAMHARD
                assert config["obs_dim"] == get_sylvester_hadamhard_matrix_dim(config["horizon"] + 4), \
                    "Set obs_dim to -1 in config for auto selection"

            elif config["noise"] == "hadamhardg":

                self.noise_type = Environment.HADAMHARDG
                assert config["obs_dim"] == get_sylvester_hadamhard_matrix_dim(config["horizon"] + 4), \
                    "Set obs_dim to -1 in config for auto selection"

            else:
                raise AssertionError("Unhandled noise type %r" % config["noise"])

            if bootstrap_env is not None:
                self.env = bootstrap_env
            else:
                self.env = DiabolicalCombinationLock(horizon=config["horizon"], swap=0.5,
                                                     num_actions=10, anti_shaping_reward=0.1,
                                                     noise_type=self.noise_type)

            self.action_space = Discrete(10)
            self.reward_range = (0.0,1.0)
            self.state_space = MultiBinary((config["horizon"]+1)*3)
            self.observation_space = Box(low=0.0, high=1.0, shape=(config["obs_dim"],),dtype=np.float)
            self.metadata = None
            setattr(self.observation_space, 'n', config["obs_dim"])

                
            # Reach the two states with probability at least 0.25 each and the third state with probability at least 0.5
            self.homing_policy_validation_fn = lambda dist, step: \
                str((0, step)) in dist and str((1, step)) in dist and str((2, step)) in dist and \
                dist[str((0, step))] + dist[str((1, step))] > 50 - self.tolerance and \
                dist[str((2, step))] > 50 - self.tolerance

        elif env_name == 'maze':
            # Maze world

            self.env_type = GenerateEnvironmentWrapper.RL_ACID
            self.thread_safe = True

            if bootstrap_env is not None:
                self.env = bootstrap_env
            else:
                self.env = RandomGridWorld(M=3, swap=0.1, dim=2, noise=0.0)

            self.homing_policy_validation_fn = None

        elif env_name == 'montezuma':
            # Montezuma Revenge

            self.env_type = GenerateEnvironmentWrapper.OpenAIGym
            self.thread_safe = True
            self.num_repeat_action = 4  # Repeat each action this many times.

            if bootstrap_env is not None:
                self.env = bootstrap_env
            else:
                self.env = gym.make('MontezumaRevengeDeterministic-v4')

            # Since we don't have access to the underlying state in this problem, we cannot define a validation function
            self.homing_policy_validation_fn = None

        elif env_name == 'gridworld' or env_name == 'gridworld-feat':
            # Grid World

            self.env_type = GenerateEnvironmentWrapper.GRIDWORLD
            self.thread_safe = True

            if bootstrap_env is not None:
                self.env = bootstrap_env
            else:
                self.env = GridWorld(num_grid_row=4, num_grid_col=4, horizon=config["horizon"], obs_dim=config["obs_dim"])

            reachable_states = self.env.get_reachable_states()
            num_states = self.env.get_num_states()

            self.homing_policy_validation_fn = lambda dist, step: all(
                [str(state) in dist and dist[str(state)] >= 1.0 / float(max(1, num_states)) - self.tolerance
                 for state in reachable_states[step]])

        else:
            raise AssertionError("Environment name %r not in RL Acid Environments " % env_name)
Example #16
class FREnv(Env):
    """ Flamme Rouge Environment """

    TRACKS = tuple(track for track in ALL_TRACKS if len(track) == 78)

    game: Game
    _track: Optional[Track]
    track: Track
    opponents: Tuple[Team, ...] = (
        Peloton(colors="red"),
        Muscle(colors="green"),
        # Simple(colors='black'),
        Heuristic(colors="white"),
    )

    reward_range = (-1, len(opponents))
    action_space = Discrete(len(FRAction))
    observation_space = AvailableActions(
        nb_actions=action_space.n,
        space=Box(low=-1, high=77, shape=(524, )),
    )

    def __init__(
        self,
        team: Team,
        opponents: Optional[Tuple[Team, ...]] = None,
        track: Optional[Track] = None,
    ) -> None:
        super().__init__()
        self.team = team
        self.opponents = opponents or self.opponents
        self._track = track

    def _play_others(self) -> None:
        while True:
            teams = [
                team for team in self.game.active_teams if team != self.team
            ]
            if not teams:
                return
            team = choice(teams)
            team_action = team.select_action(self.game)
            assert team_action is not None
            self.game.take_action(team, team_action)

    def reset(self) -> np.ndarray:
        self.track = self._track if self._track is not None else choice(
            FREnv.TRACKS)
        teams = (self.team, ) + self.opponents
        self.game = Game(track=self.track, teams=teams)

        while self.game.phase is Phase.START:
            self.game.play_action()
        assert self.game.phase is Phase.RACE
        self._play_others()

        LOGGER.debug(self.game)

        return self.observation

    def step(self, action: int) -> Tuple[np.ndarray, float, bool, dict]:
        assert not self.game.finished
        assert self.game.phase is Phase.RACE
        assert self.game.active_teams == (self.team, )

        try:
            act = _to_action(action, self.team)
            assert act is not None
            self.game.take_action(self.team, act)

        except Exception as exp:
            LOGGER.debug("encountered exception: %r", exp, exc_info=True)
            LOGGER.debug(
                "action: %d / %s / %s, available actions: %s",
                action,
                FR_ACTIONS[action],
                act,
                self.game.available_actions(self.team),
            )
            return self.observation, -1, True, {}

        if self.game.finished:
            winner = self.game.winner
            assert winner is not None
            assert winner.team is not None
            teams = self.game.sorted_teams
            assert teams[0] == winner.team
            position = teams.index(self.team) + 1
            reward = len(self.game.teams) - position
            return self.observation, reward, True, {}

        self._play_others()

        assert not self.game.finished
        assert self.game.phase is Phase.RACE
        assert self.game.active_teams == (self.team, )
        return self.observation, 0, False, {}

    def render(self, mode="human", close=False):
        print(self.game)

    def close(self):
        del self.game

    def seed(self, seed=None):
        pass

    def configure(self, *args, **kwargs):
        pass

    @property
    def observation(self):
        """ game observation """
        available = frozenset(
            filter(
                None,
                map(FRAction.from_action,
                    self.game.available_actions(self.team))))
        return {
            "actions": np.array([a in available for a in FR_ACTIONS],
                                dtype=bool),
            "values": FRData.from_game(self.game, self.team).to_array(),
        }
Example #17
class StringGameEnvV1(Env):
    def __init__(self, max_steps=MAX_STEP):
        np.random.seed(123)
        torch.manual_seed(123)
        self.max_steps = max_steps
        self.reward_map = defaultdict(float)
        self.terminal_probs = defaultdict(float)
        self._init_reward_and_terminal_probs()
        self.recent_actions = deque([], maxlen=MAX_STEP)
        self.action_space = Discrete(ACTION_DIM)
        self.observation_space = Box(low=0, high=1, shape=(STATE_DIM, ))
        self.step_cnt = 0
        self.reset()

    def _init_reward_and_terminal_probs(self):
        self.reward_map["AAA"] = 5.0
        self.reward_map["BA"] = 4.0
        self.terminal_probs["A"] = 0.5
        self.terminal_probs["B"] = 0.1

    def seed(self, seed=None):
        np.random.seed(seed)
        torch.manual_seed(seed)

    @staticmethod
    def random_action():
        return np.random.randint(0, ACTION_DIM)

    def get_reward(self):
        """
        The function you can write to customize rewards. In this
        specific environment, the reward only depends on action history
        """
        recent_characters = [CHARACTERS[c] for c in list(self.recent_actions)]
        string = "".join(recent_characters)
        if not self.done:
            reward = 0
        else:
            reward = self.reward_map[string]
        return reward, string

    def step(self, action):
        assert self.action_space.contains(action)
        assert self.done is False

        self.step_cnt += 1
        self.recent_actions.append(action)
        if self.step_cnt >= self.max_steps:
            self.done = True
        else:
            self.done = self.sample_terminal(action)
        reward, info = self.get_reward()
        ob = self.get_observation()

        return ob, reward, self.done, {"reward_str": info}

    def sample_terminal(self, action):
        terminal_probability = self.terminal_probs[CHARACTERS[action]]
        if np.random.rand() < terminal_probability:
            return True
        return False

    def get_observation(self):
        """
        The function you can write to customize transitions. In this
        specific environment, the next state is exactly the latest action taken.
        The initial observation is all zeros.
        """
        ob = np.zeros(STATE_DIM)
        if len(self.recent_actions) > 0:
            ob[self.recent_actions[-1]] = 1
        return ob

    def reset(self):
        self.done = False
        self.recent_actions = deque([], maxlen=MAX_STEP)
        self.step_cnt = 0
        ob = self.get_observation()
        return ob

    def print_internal_state(self):
        action_str = "".join([CHARACTERS[c] for c in self.recent_actions])
        logger.debug(
            f"Step {self.step_cnt}, recent actions {action_str}, terminal={self.done}"
        )

    @staticmethod
    def print_ob(ob):
        return str(ob)

    @staticmethod
    def print_action(action):
        return CHARACTERS[action]
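A minimal rollout sketch for StringGameEnvV1, assuming the module-level names it references (MAX_STEP, ACTION_DIM, STATE_DIM, CHARACTERS, logger) are defined alongside the class:

env = StringGameEnvV1()
ob = env.reset()
done = False
while not done:
    action = env.random_action()              # uniform over the ACTION_DIM characters
    ob, reward, done, info = env.step(action)
print("final string:", info["reward_str"], "terminal reward:", reward)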
Example #18
0
 def __init__(self):
     self.observation_space = Tuple(
         [Discrete(5),
          Box(0, 5, shape=(3, ), dtype=np.float32)])
Example #19
0
    def __init__(self,
                 board_size=7,
                 num_rocks=8,
                 use_heuristic=False,
                 observation='o',
                 stay_inside=False):
        """

        :param board_size: int board is a square of board_size x board_size
        :param num_rocks: int number of rocks on board
        :param use_heuristic: bool usage unclear
        :param observation: str must be one of
                                'o': observed value only
                                'oa': observed value + the action taken
                                'po': position of the agent + the observed value
                                'poa': position + observed value + the action taken
        """

        assert board_size in list(config.keys()) and \
               num_rocks == len(config[board_size]["rock_pos"])

        self.num_rocks = num_rocks
        self._use_heuristic = use_heuristic

        self._rock_pos = \
            [Coord(*rock) for rock in config[board_size]['rock_pos']]
        self._agent_pos = Coord(*config[board_size]['init_pos'])
        self.grid = Grid(board_size, board_size)

        for idx, rock in enumerate(self._rock_pos):
            self.grid.board[rock] = idx

        self.action_space = Discrete(len(Action) + self.num_rocks)
        self._discount = .95
        self._reward_range = 20
        self._penalization = -100
        self._query = 0
        if stay_inside:
            self._out_of_bounds_penalty = 0
        else:
            self._out_of_bounds_penalty = self._penalization

        self.state = None
        self.last_action = None
        self.done = False

        self.gui = None

        assert observation in ['o', 'oa', 'po', 'poa']
        if observation == 'o':
            self._make_obs = lambda obs, a: obs
            self.observation_space = Discrete(len(Obs))
        elif observation == 'oa':
            self._make_obs = self._oa
            self.observation_space =\
                Box(low=0,
                    high=np.append(max(Obs), np.ones(self.action_space.n)),
                    dtype=np.int)

        elif observation == 'po':
            self._make_obs = self._po
            self.observation_space = \
                Box(low=0,
                    high=np.append(np.ones(self.grid.n_tiles), max(Obs)),
                    dtype=np.int)

        elif observation == 'poa':
            self._make_obs = self._poa
            self.observation_space = \
                Box(low=0,
                    high=np.concatenate((np.ones(self.grid.n_tiles),
                                         [max(Obs)],
                                        np.ones(self.action_space.n))),
                    dtype=np.int)
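The Box definitions above already pin down the layout of the composite observations. A hedged sketch of that layout, inferred only from those shapes (the real _oa/_po/_poa methods are not shown in this snippet; the helper names below are illustrative):

import numpy as np

def one_hot(n, idx):
    v = np.zeros(n, dtype=int)
    v[idx] = 1
    return v

# 'oa': raw observation value followed by a one-hot of the action taken
def make_oa(obs_value, action, n_actions):
    return np.append([obs_value], one_hot(n_actions, action))

# 'po': one-hot of the agent's tile followed by the raw observation value
def make_po(obs_value, tile_index, n_tiles):
    return np.append(one_hot(n_tiles, tile_index), [obs_value])

# 'poa': position one-hot + observation value + action one-hot
def make_poa(obs_value, tile_index, n_tiles, action, n_actions):
    return np.concatenate((one_hot(n_tiles, tile_index),
                           [obs_value],
                           one_hot(n_actions, action)))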
Example #20
0
 @property
 def action_space(self) -> Discrete:
     """The discrete action space produced by the action scheme."""
     return Discrete(len(self.actions))
Example #21
0
File: base.py  Project: Shushman/metaworld
class SawyerXYZEnv(SawyerMocapBase, metaclass=abc.ABCMeta):
    _HAND_SPACE = Box(np.array([-0.51, .38, -.05]), np.array([+0.51, 1.0,
                                                              .51]))

    def __init__(
        self,
        model_name,
        frame_skip=5,
        hand_low=(-0.2, 0.55, 0.05),
        hand_high=(0.2, 0.75, 0.3),
        mocap_low=None,
        mocap_high=None,
        action_scale=1. / 100,
        action_rot_scale=1.,
    ):
        super().__init__(model_name, frame_skip=frame_skip)
        self.random_init = True
        self.action_scale = action_scale
        self.action_rot_scale = action_rot_scale
        self.hand_low = np.array(hand_low)
        self.hand_high = np.array(hand_high)
        if mocap_low is None:
            mocap_low = hand_low
        if mocap_high is None:
            mocap_high = hand_high
        self.mocap_low = np.hstack(mocap_low)
        self.mocap_high = np.hstack(mocap_high)
        self.curr_path_length = 0
        self._freeze_rand_vec = True
        self._last_rand_vec = None

        # We use continuous goal space by default and
        # can discretize the goal space by calling
        # the `discretize_goal_space` method.
        self.discrete_goal_space = None
        self.discrete_goals = []
        self.active_discrete_goal = None

        self.action_space = Box(
            np.array([-1, -1, -1, -1]),
            np.array([+1, +1, +1, +1]),
        )

        self._pos_obj_max_len = 6
        self._pos_obj_possible_lens = (3, 6)

        self._set_task_called = False
        self._partially_observable = True

        self._state_goal = None  # OVERRIDE ME
        self._random_reset_space = None  # OVERRIDE ME

    def _set_task_inner(self):
        # Doesn't absorb "extra" kwargs, to ensure nothing's missed.
        pass

    def set_task(self, task):
        self._set_task_called = True
        data = pickle.loads(task.data)
        assert isinstance(self, data['env_cls'])
        del data['env_cls']
        self._last_rand_vec = data['rand_vec']
        self._freeze_rand_vec = True
        del data['rand_vec']
        self._partially_observable = data['partially_observable']
        del data['partially_observable']
        self._set_task_inner(**data)

    def set_xyz_action(self, action):
        action = np.clip(action, -1, 1)
        pos_delta = action * self.action_scale
        new_mocap_pos = self.data.mocap_pos + pos_delta[None]

        new_mocap_pos[0, :] = np.clip(
            new_mocap_pos[0, :],
            self.mocap_low,
            self.mocap_high,
        )
        self.data.set_mocap_pos('mocap', new_mocap_pos)
        self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0]))

    def discretize_goal_space(self, goals):
        assert False
        assert len(goals) >= 1
        self.discrete_goals = goals
        # update the goal_space to a Discrete space
        self.discrete_goal_space = Discrete(len(self.discrete_goals))

    # Below are methods for using the new wrappers.
    # `sample_goals` is implemented across the sawyer_xyz
    # environments as sampling from the task lists. This will be
    # done with the new `discrete_goals`. Once all the algorithms
    # conform to this API (i.e. use the new wrappers), we can
    # simply drop the trailing underscore from these method signatures.
    def sample_goals_(self, batch_size):
        assert False
        if self.discrete_goal_space is not None:
            return [
                self.discrete_goal_space.sample() for _ in range(batch_size)
            ]
        else:
            return [self.goal_space.sample() for _ in range(batch_size)]

    def set_goal_(self, goal):
        assert False
        if self.discrete_goal_space is not None:
            self.active_discrete_goal = goal
            self.goal = self.discrete_goals[goal]
            self._state_goal_idx = np.zeros(len(self.discrete_goals))
            self._state_goal_idx[goal] = 1.
        else:
            self.goal = goal

    def _set_obj_xyz(self, pos):
        qpos = self.data.qpos.flat.copy()
        qvel = self.data.qvel.flat.copy()
        qpos[9:12] = pos.copy()
        qvel[9:15] = 0
        self.set_state(qpos, qvel)

    def get_site_pos(self, siteName):
        _id = self.model.site_names.index(siteName)
        return self.data.site_xpos[_id].copy()

    def _get_pos_objects(self):
        """Retrieves object position(s) from mujoco properties or instance vars

        Returns:
            np.ndarray: Flat array (usually 3 elements) representing the
                object(s)' position(s)
        """
        # Throw error rather than making this an @abc.abstractmethod so that
        # V1 environments don't have to implement it
        raise NotImplementedError

    def _get_pos_goal(self):
        """Retrieves goal position from mujoco properties or instance vars

        Returns:
            np.ndarray: Flat array (3 elements) representing the goal position
        """
        assert isinstance(self._state_goal, np.ndarray)
        assert self._state_goal.ndim == 1
        return self._state_goal

    def _get_obs(self):
        """Combines positions of the end effector, object(s) and goal into a
        single flat observation

        Returns:
            np.ndarray: The flat observation array (12 elements)
        """
        pos_hand = self.get_endeff_pos()

        pos_obj_padded = np.zeros(self._pos_obj_max_len)
        pos_obj = self._get_pos_objects()
        assert len(pos_obj) in self._pos_obj_possible_lens
        pos_obj_padded[:len(pos_obj)] = pos_obj

        pos_goal = self._get_pos_goal()
        if self._partially_observable:
            pos_goal = np.zeros_like(pos_goal)

        return np.hstack((pos_hand, pos_obj_padded, pos_goal))

    def _get_obs_dict(self):
        obs = self._get_obs()
        return dict(
            state_observation=obs,
            state_desired_goal=self._get_pos_goal(),
            state_achieved_goal=obs[3:-3],
        )

    @property
    def observation_space(self):
        obj_low = np.full(6, -np.inf)
        obj_high = np.full(6, +np.inf)
        return Box(
            np.hstack((self._HAND_SPACE.low, obj_low, self.goal_space.low)),
            np.hstack((self._HAND_SPACE.high, obj_high, self.goal_space.high)))

    def reset(self):
        self.curr_path_length = 0
        return super().reset()

    def _get_state_rand_vec(self):
        if self._freeze_rand_vec:
            assert self._last_rand_vec is not None
            return self._last_rand_vec
        else:
            rand_vec = np.random.uniform(
                self._random_reset_space.low,
                self._random_reset_space.high,
                size=self._random_reset_space.low.size)
            self._last_rand_vec = rand_vec
            return rand_vec
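set_task() above unpickles task.data and pops env_cls, rand_vec and partially_observable from the resulting dict, forwarding any remaining keys to _set_task_inner(). A minimal sketch of building a compatible payload, assuming nothing beyond what that method reads (the Task tuple and make_task helper below are illustrative, not Metaworld's own API):

import pickle
from collections import namedtuple

Task = namedtuple("Task", ["data"])

def make_task(env_cls, rand_vec, partially_observable=True, **inner_kwargs):
    # Keys mirror exactly what set_task() pops from the unpickled dict.
    payload = dict(env_cls=env_cls,
                   rand_vec=rand_vec,
                   partially_observable=partially_observable,
                   **inner_kwargs)
    return Task(data=pickle.dumps(payload))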
Example #22
    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        super(AutoregressiveActionsModel, self).__init__(
            obs_space, action_space, num_outputs, model_config, name)
        if action_space != Tuple([Discrete(2), Discrete(2)]):
            raise ValueError(
                "This model only supports the [2, 2] action space")

        # Inputs
        obs_input = tf.keras.layers.Input(
            shape=obs_space.shape, name="obs_input")
        a1_input = tf.keras.layers.Input(shape=(1, ), name="a1_input")
        ctx_input = tf.keras.layers.Input(
            shape=(num_outputs, ), name="ctx_input")

        # Output of the model (normally 'logits', but for an autoregressive
        # dist this is more like a context/feature layer encoding the obs)
        context = tf.keras.layers.Dense(
            num_outputs,
            name="hidden",
            activation=tf.nn.tanh,
            kernel_initializer=normc_initializer(1.0))(obs_input)

        # V(s)
        value_out = tf.keras.layers.Dense(
            1,
            name="value_out",
            activation=None,
            kernel_initializer=normc_initializer(0.01))(context)

        # P(a1 | obs)
        a1_logits = tf.keras.layers.Dense(
            2,
            name="a1_logits",
            activation=None,
            kernel_initializer=normc_initializer(0.01))(ctx_input)

        # P(a2 | a1)
        # --note: typically you'd want to implement P(a2 | a1, obs) as follows:
        # a2_context = tf.keras.layers.Concatenate(axis=1)(
        #     [ctx_input, a1_input])
        a2_context = a1_input
        a2_hidden = tf.keras.layers.Dense(
            16,
            name="a2_hidden",
            activation=tf.nn.tanh,
            kernel_initializer=normc_initializer(1.0))(a2_context)
        a2_logits = tf.keras.layers.Dense(
            2,
            name="a2_logits",
            activation=None,
            kernel_initializer=normc_initializer(0.01))(a2_hidden)

        # Base layers
        self.base_model = tf.keras.Model(obs_input, [context, value_out])
        self.register_variables(self.base_model.variables)
        self.base_model.summary()

        # Autoregressive action sampler
        self.action_model = tf.keras.Model([ctx_input, a1_input],
                                           [a1_logits, a2_logits])
        self.action_model.summary()
        self.register_variables(self.action_model.variables)
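A hedged sketch (not part of the snippet above) of how the two Keras sub-models are meant to be used at sampling time: base_model maps the observation to a context vector, then action_model decodes a1 from that context and a2 from the sampled a1. Greedy argmax is used here purely for illustration:

import numpy as np

def greedy_autoregressive_sample(model, obs_batch):
    # obs_batch: np.ndarray of shape (batch_size,) + obs_space.shape
    context, value = model.base_model(obs_batch)
    batch_size = obs_batch.shape[0]
    # a1 depends only on the observation context; the dummy a1 input is ignored here.
    a1_logits, _ = model.action_model(
        [context, np.zeros((batch_size, 1), dtype=np.float32)])
    a1 = np.argmax(a1_logits, axis=-1)
    # a2 is conditioned on the chosen a1 (and, in the commented-out variant, on the context too).
    _, a2_logits = model.action_model(
        [context, a1.reshape(-1, 1).astype(np.float32)])
    a2 = np.argmax(a2_logits, axis=-1)
    return a1, a2, value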
Example #23
0
import unittest
import traceback

import gym
from gym.spaces import Box, Discrete, Tuple
from gym.envs.registration import EnvSpec

import ray
from ray.rllib.agent import get_agent_class
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.tune.registry import register_env

ACTION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
    "vector": Box(0.0, 1.0, (5, )),
    "simple_tuple": Tuple([Box(0.0, 1.0, (5, )),
                           Box(0.0, 1.0, (5, ))]),
    "implicit_tuple": [Box(0.0, 1.0, (5, )),
                       Box(0.0, 1.0, (5, ))],
}

OBSERVATION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
    "vector": Box(0.0, 1.0, (5, )),
    "image": Box(0.0, 1.0, (80, 80, 1)),
    "atari": Box(0.0, 1.0, (210, 160, 3)),
    "atari_ram": Box(0.0, 1.0, (128, )),
    "simple_tuple": Tuple([Box(0.0, 1.0, (5, )),
                           Box(0.0, 1.0, (5, ))]),
    "mixed_tuple": Tuple([Discrete(10), Box(0.0, 1.0, (5, ))]),
}
Example #24
 def __init__(self, _):
     self.observation_space = Discrete(2)
     self.action_space = Tuple([Discrete(2), Discrete(2)])
Example #25
0
parser = argparse.ArgumentParser()
parser.add_argument(
    "--framework",
    choices=["tf", "tf2", "tfe", "torch"],
    default="tf",
    help="The DL framework specifier.",
)

if __name__ == "__main__":
    args = parser.parse_args()

    # Test API wrapper for dueling Q-head.

    obs_space = Box(-1.0, 1.0, (3, ))
    action_space = Discrete(3)

    # Run in eager mode for value checking and debugging.
    tf1.enable_eager_execution()

    # __sphinx_doc_model_construct_1_begin__
    my_dueling_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=MODEL_DEFAULTS,
        framework=args.framework,
        # Providing the `model_interface` arg will make the factory
        # wrap the chosen default model with our new model API class
        # (DuelingQModel). This way, both `forward` and `get_q_values`
        # are available in the returned class.
Example #26
0
File: rock.py  Project: d3sm0/gym_pomdp
class RockEnv(Env):
    metadata = {"render.modes": ["human", "ansi"]}

    def __init__(self, board_size=7, num_rocks=8, use_heuristic=False):

        assert board_size in list(
            config.keys()) and num_rocks in config[board_size]['size']

        self.num_rocks = num_rocks
        self._use_heuristic = use_heuristic

        self._rock_pos = [
            Coord(*rock) for rock in config[board_size]['rock_pos']
        ]
        self._agent_pos = Coord(*config[board_size]['init_pos'])
        self.grid = Grid(board_size, board_size)

        for idx, rock in enumerate(self._rock_pos):
            self.grid.board[rock] = idx

        self.action_space = Discrete(len(Action) + self.num_rocks)
        self.observation_space = Discrete(len(Obs))
        self._discount = .95
        self._reward_range = 20
        self._penalization = -100
        self._query = 0

    def seed(self, seed=None):
        np.random.seed(seed)

    def step(self, action):

        assert self.action_space.contains(action)
        assert self.done is False

        self.last_action = action
        self._query += 1

        reward = 0
        ob = Obs.NULL.value

        if action < Action.SAMPLE.value:
            if action == Action.EAST.value:
                if self.state.agent_pos.x + 1 < self.grid.x_size:
                    self.state.agent_pos += Moves.EAST.value
                else:
                    reward = 10
                    self.done = True
                    return ob, reward, self.done, {
                        "state": self._encode_state(self.state)
                    }
            elif action == Action.NORTH.value:
                if self.state.agent_pos.y + 1 < self.grid.y_size:
                    self.state.agent_pos += Moves.NORTH.value
                else:
                    reward = self._penalization
            elif action == Action.SOUTH.value:
                if self.state.agent_pos.y - 1 >= 0:
                    self.state.agent_pos += Moves.SOUTH.value
                else:
                    reward = self._penalization
            elif action == Action.WEST.value:
                if self.state.agent_pos.x - 1 >= 0:
                    self.state.agent_pos += Moves.WEST.value
                else:
                    reward = self._penalization
            else:
                raise NotImplementedError()

        if action == Action.SAMPLE.value:
            rock = self.grid[self.state.agent_pos]
            if rock >= 0 and not self.state.rocks[
                    rock].status == 0:  # collected
                if self.state.rocks[rock].status == 1:
                    reward = 10
                else:
                    reward = -10
                self.state.rocks[rock].status = 0
            else:
                reward = self._penalization

        if action > Action.SAMPLE.value:
            rock = action - Action.SAMPLE.value - 1
            assert rock < self.num_rocks

            ob = self._sample_ob(self.state.agent_pos, self.state.rocks[rock])

            self.state.rocks[rock].measured += 1

            eff = self._efficiency(self.state.agent_pos,
                                   self.state.rocks[rock].pos)

            if ob == Obs.GOOD.value:
                self.state.rocks[rock].count += 1
                self.state.rocks[rock].lkv *= eff
                self.state.rocks[rock].lkw *= (1 - eff)
            else:
                self.state.rocks[rock].count -= 1
                self.state.rocks[rock].lkw *= eff
                self.state.rocks[rock].lkv *= (1 - eff)

            denom = (.5 * self.state.rocks[rock].lkv) + (
                .5 * self.state.rocks[rock].lkw)
            self.state.rocks[rock].prob_valuable = (
                .5 * self.state.rocks[rock].lkv) / denom

        self.done = self._penalization == reward
        return ob, reward, self.done, {"state": self._encode_state(self.state)}

    def _decode_state(self, state, as_array=False):

        agent_pos = Coord(*state['agent_pos'])
        rock_state = RockState(agent_pos)
        for r in state['rocks']:
            rock = Rock(pos=0)
            rock.__dict__.update(r)
            rock_state.rocks.append(rock)

        if as_array:
            rocks = []
            for rock in rock_state.rocks:
                rocks.append(rock.status)

            return np.concatenate([[self.grid.get_index(agent_pos)], rocks])

        return rock_state

    def _encode_state(self, state):
        # use dictionary for state encoding

        return _encode_dict(state)
        # rocks can take 3 values: -1, 1, 0 if collected

    def render(self, mode='human', close=False):
        if close:
            return
        if mode == "human":
            if not hasattr(self, "gui"):
                start_pos = self.grid.get_index(self.state.agent_pos)
                obj_pos = [(self.grid.get_index(rock.pos), rock.status)
                           for rock in self.state.rocks]
                self.gui = RockGui((self.grid.x_size, self.grid.y_size),
                                   start_pos=start_pos,
                                   obj=obj_pos)

            if self.last_action > Action.SAMPLE.value:
                rock = self.last_action - Action.SAMPLE.value - 1
                print("Rock S: {} P:{}".format(self.state.rocks[rock].status,
                                               self.state.rocks[rock].pos))
            # msg = "Action : " + action_to_str(self.last_action) + " Step: " + str(self.t) + " Rw: " + str(self.total_rw)
            agent_pos = self.grid.get_index(self.state.agent_pos)
            self.gui.render(agent_pos)

    def reset(self):
        self.done = False
        self._query = 0
        self.last_action = Action.SAMPLE.value
        self.state = self._get_init_state(should_encode=False)
        return Obs.NULL.value

    def _set_state(self, state):
        self.done = False
        self.state = self._decode_state(state)

    def close(self):
        self.render(close=True)

    def _compute_prob(self, action, next_state, ob):

        next_state = self._decode_state(next_state)

        if action <= Action.SAMPLE.value:
            return int(ob == Obs.NULL.value)

        eff = self._efficiency(
            next_state.agent_pos,
            next_state.rocks[action - Action.SAMPLE.value - 1].pos)

        if ob == Obs.GOOD.value and next_state.rocks[action -
                                                     Action.SAMPLE.value -
                                                     1].status == 1:
            return eff
        elif ob == Obs.BAD.value and next_state.rocks[action -
                                                      Action.SAMPLE.value -
                                                      1].status == -1:
            return eff
        else:
            return 1 - eff

    def _get_init_state(self, should_encode=True):

        rock_state = RockState(self._agent_pos)
        for idx in range(self.num_rocks):
            rock_state.rocks.append(Rock(self._rock_pos[idx]))
        return self._encode_state(rock_state) if should_encode else rock_state

    def _generate_legal(self):
        legal = [Action.EAST.value]  # can always go east
        if self.state.agent_pos.y + 1 < self.grid.y_size:
            legal.append(Action.NORTH.value)

        if self.state.agent_pos.y - 1 >= 0:
            legal.append(Action.SOUTH.value)
        if self.state.agent_pos.x - 1 >= 0:
            legal.append(Action.WEST.value)

        rock = self.grid[self.state.agent_pos]
        if rock >= 0 and self.state.rocks[rock].status != 0:
            legal.append(Action.SAMPLE.value)

        for rock in self.state.rocks:
            assert self.grid[rock.pos] != -1
            if rock.status != 0:
                legal.append(self.grid[rock.pos] + 1 + Action.SAMPLE.value)
        return legal

    def _generate_preferred(self, history):
        if not self._use_heuristic:
            return self._generate_legal()

        actions = []

        # sample rocks with high likelihood of being good
        rock = self.grid[self.state.agent_pos]
        if rock >= 0 and self.state.rocks[rock].status != 0 and history.size:
            total = 0
            for transition in history:
                if transition.action == rock + 1 + Action.SAMPLE.value:
                    if transition.next_observation == Obs.GOOD.value:
                        total += 1
                    elif transition.next_observation == Obs.BAD.value:
                        total -= 1
            if total > 0:
                actions.append(Action.SAMPLE.value)
                return actions

        # process the rocks

        all_bad = True
        direction = {
            "north": False,
            "south": False,
            "west": False,
            "east": False
        }
        for idx in range(self.num_rocks):
            rock = self.state.rocks[idx]
            if rock.status != 0:
                total = 0
                for transition in history:
                    if transition.action == idx + 1 + Action.SAMPLE.value:
                        if transition.next_observation == Obs.GOOD.value:
                            total += 1
                        elif transition.next_observation == Obs.BAD.value:
                            total -= 1
                if total >= 0:
                    all_bad = False

                    if rock.pos.y > self.state.agent_pos.y:
                        direction['north'] = True
                    elif rock.pos.y < self.state.agent_pos.y:
                        direction['south'] = True
                    elif rock.pos.x < self.state.agent_pos.x:
                        direction['west'] = True
                    elif rock.pos.x > self.state.agent_pos.x:
                        direction['east'] = True

        if all_bad:
            actions.append(Action.EAST.value)
            return actions

        # generate a random legal move
        # do not measure a collected rock
        # do not measure a rock too often
        # do not measure clearly bad rocks
        # don't move in a direction that puts you closer to bad rocks
        # never sample a rock

        if self.state.agent_pos.y + 1 < self.grid.y_size and direction['north']:
            actions.append(Action.NORTH.value)

        if direction['east']:
            actions.append(Action.EAST.value)

        if self.state.agent_pos.y - 1 >= 0 and direction['south']:
            actions.append(Action.SOUTH.value)

        if self.state.agent_pos.x - 1 >= 0 and direction['west']:
            actions.append(Action.WEST.value)

        for idx, rock in enumerate(self.state.rocks):
            if not rock.status == 0 and rock.measured < 5 and abs(
                    rock.count) < 2 and 0 < rock.prob_valuable < 1:
                actions.append(idx + 1 + Action.SAMPLE.value)

        if len(actions) == 0:
            return self._generate_legal()

        return actions

    def __dict2np__(self, state):
        idx = self.grid.get_index(Coord(*state['agent_pos']))
        rocks = []
        for rock in state['rocks']:
            rocks.append(rock['status'])
        return np.concatenate([[idx], rocks])

    @staticmethod
    def _efficiency(agent_pos, rock_pos, hed=20):
        d = Grid.euclidean_distance(agent_pos, rock_pos)
        eff = (1 + pow(2, -d / hed)) * .5
        return eff

    @staticmethod
    def _select_target(rock_state, x_size):
        best_dist = x_size * 2
        best_rock = -1  # Coord(-1, -1)
        for idx, rock in enumerate(rock_state.rocks):
            if rock.status != 0 and rock.count >= 0:
                d = Grid.manhattan_distance(rock_state.agent_pos, rock.pos)
                if d < best_dist:
                    best_dist = d
                    best_rock = idx  # rock.pos
        return best_rock

    @staticmethod
    def _sample_ob(agent_pos, rock, hed=20):
        eff = RockEnv._efficiency(agent_pos, rock.pos, hed=hed)
        if np.random.binomial(1, eff):
            return Obs.GOOD.value if rock.status == 1 else Obs.BAD.value
        else:
            return Obs.BAD.value if rock.status == 1 else Obs.GOOD.value
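A quick worked check of the sensor model encoded by _efficiency() and _sample_ob() above: with the default hed=20, a reading is always correct on top of the rock, correct with probability 0.75 at distance 20, and tends toward 0.5 (pure noise) far away:

for d in (0, 10, 20, 60):
    eff = (1 + pow(2, -d / 20)) * .5   # same formula as _efficiency()
    print(f"distance={d:>2}  p(correct reading)={eff:.3f}")
# distance= 0  p(correct reading)=1.000
# distance=20  p(correct reading)=0.750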
Example #27
0
from collections import OrderedDict

import numpy as np
import pytest

from gym.spaces import Box, Dict, Discrete, MultiBinary, MultiDiscrete, Tuple, utils

spaces = [
    Discrete(3),
    Box(low=0.0, high=np.inf, shape=(2, 2)),
    Box(low=0.0, high=np.inf, shape=(2, 2), dtype=np.float16),
    Tuple([Discrete(5), Discrete(10)]),
    Tuple([
        Discrete(5),
        Box(low=np.array([0.0, 0.0]),
            high=np.array([1.0, 5.0]),
            dtype=np.float64),
    ]),
    Tuple((Discrete(5), Discrete(2), Discrete(2))),
    MultiDiscrete([2, 2, 10]),
    MultiBinary(10),
    Dict({
        "position":
        Discrete(5),
        "velocity":
        Box(low=np.array([0.0, 0.0]),
            high=np.array([1.0, 5.0]),
            dtype=np.float64),
    }),
    Discrete(3, start=2),
    Discrete(8, start=-5),
Example #28
0
 def __init__(self, config):
     self.end_pos = config["corridor_length"]
     self.cur_pos = 0
     self.action_space = Discrete(2)
     self.observation_space = Box(
         0.0, self.end_pos, shape=(1, ), dtype=np.float32)
Example #29
0
    def __init__(self, config: Config) -> None:
        spaces = {
            get_default_config().GOAL_SENSOR_UUID: Box(
                low=np.finfo(np.float32).min,
                high=np.finfo(np.float32).max,
                shape=(2,),
                dtype=np.float32,
            )
        }

        if config.INPUT_TYPE in ["depth", "rgbd"]:
            spaces["depth"] = Box(
                low=0,
                high=1,
                shape=(config.RESOLUTION, config.RESOLUTION, 1),
                dtype=np.float32,
            )

        if config.INPUT_TYPE in ["rgb", "rgbd"]:
            spaces["rgb"] = Box(
                low=0,
                high=255,
                shape=(config.RESOLUTION, config.RESOLUTION, 3),
                dtype=np.uint8,
            )
        observation_spaces = SpaceDict(spaces)

        action_spaces = Discrete(4)

        self.device = (
            torch.device("cuda:{}".format(config.PTH_GPU_ID))
            if torch.cuda.is_available()
            else torch.device("cpu")
        )
        self.hidden_size = config.HIDDEN_SIZE

        random.seed(config.RANDOM_SEED)
        torch.random.manual_seed(config.RANDOM_SEED)
        if torch.cuda.is_available():
            torch.backends.cudnn.deterministic = True  # type: ignore

        self.actor_critic = PointNavBaselinePolicy(
            observation_space=observation_spaces,
            action_space=action_spaces,
            hidden_size=self.hidden_size,
        )
        self.actor_critic.to(self.device)

        if config.MODEL_PATH:
            ckpt = torch.load(config.MODEL_PATH, map_location=self.device)
            #  Filter only actor_critic weights
            self.actor_critic.load_state_dict(
                {
                    k[len("actor_critic.") :]: v
                    for k, v in ckpt["state_dict"].items()
                    if "actor_critic" in k
                }
            )

        else:
            habitat.logger.error(
                "Model checkpoint wasn't loaded, evaluating " "a random model."
            )

        self.test_recurrent_hidden_states: Optional[torch.Tensor] = None
        self.not_done_masks: Optional[torch.Tensor] = None
        self.prev_actions: Optional[torch.Tensor] = None
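The dict comprehension above keeps only the actor_critic weights from the checkpoint and strips their "actor_critic." prefix before load_state_dict. A tiny illustrative check of that filtering logic (the key names are made up):

ckpt_keys = ["actor_critic.net.visual_encoder.weight", "optimizer.state.step"]
filtered = {k[len("actor_critic."):]: None for k in ckpt_keys if "actor_critic" in k}
assert list(filtered) == ["net.visual_encoder.weight"]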
Example #30
0
    def __init__(self,
                 seed,
                 game_config,
                 render=False,
                 use_depth=False,
                 use_rgb=True,
                 reward_scale=1,
                 frame_skip=4,
                 jitter_rgb=False,
                 noise_var=0.2,
                 drop_input_prob=0.0,
                 rotate_sensor=False,
                 rotate_range=30,
                 drop_input_freq=3,
                 flicker_freq=1):
        # assign observation space
        self.use_rgb = use_rgb
        self.use_depth = use_depth
        channel_num = 0
        if use_depth:
            channel_num = channel_num + 1
        if use_rgb:
            channel_num = channel_num + 3

        self.observation_shape = (channel_num, 84, 84)
        self.observation_space = Box(low=0,
                                     high=255,
                                     shape=self.observation_shape)

        self.reward_scale = reward_scale

        self.jitter_rgb = jitter_rgb
        self.noise_var = noise_var
        self.drop_input_prob = drop_input_prob
        self.drop_input_freq = drop_input_freq
        self.flicker_freq = flicker_freq
        self.prepare_drop_input()
        self.rotate_sensor = rotate_sensor
        self.rotate_range = rotate_range

        game = vzd.DoomGame()

        game.load_config(game_config)

        # game input setup
        game.set_screen_resolution(vzd.ScreenResolution.RES_160X120)
        game.set_screen_format(vzd.ScreenFormat.CRCGCB)
        if use_depth:
            game.set_depth_buffer_enabled(True)

        # Adds buttons that will be allowed.
        num_buttons = game.get_available_buttons_size()
        self.action_space = Discrete(num_buttons)
        actions = [([False] * num_buttons) for i in range(num_buttons)]
        for i in range(num_buttons):
            actions[i][i] = True
        self.actions = actions
        # set frame skip for taking action
        self.frame_skip = frame_skip

        game.set_seed(seed)
        random.seed(seed)
        game.set_window_visible(render)
        game.init()

        self.game = game
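A hedged sketch (not shown in the snippet) of how the pieces prepared in __init__ are typically consumed by a step() method: the Discrete action index selects one of the one-hot button lists, and frame_skip repeats it for several tics:

def step_sketch(self, action):
    # make_action presses the selected buttons for `frame_skip` tics and returns the summed reward.
    reward = self.game.make_action(self.actions[action], self.frame_skip)
    done = self.game.is_episode_finished()
    return reward * self.reward_scale, done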
Example #31
0
 def _configure(self):
     self._load_activities()
     self.action_space = Discrete(len(self.activities))
     self.observation_space = Box(0, 1, len(self.knowledges)) #
     self.simulator = StudentSimulator()
Example #32
0
 def test_action_space(self):
     """Test action spaces."""
     assert self.env.action_space == Discrete(2)
Example #33
0
 def test_observation_space(self):
     """Test observation spaces."""
     expected_size = len(WEATHER) * len(CAR_CONDITION) * len(ROAD_STATE)
     assert self.env.observation_space == Discrete(expected_size)
Example #34
0
if __name__ == "__main__":
    ray.init(local_mode=True)
    args = parser.parse_args()

    ModelCatalog.register_custom_model(
        "cc_model",
        TorchCentralizedCriticModel if args.torch else CentralizedCriticModel)

    config = {
        "env": TwoStepGame,
        "batch_mode": "complete_episodes",
        "num_workers": 0,
        "multiagent": {
            "policies": {
                "pol1": (None, Discrete(6), TwoStepGame.action_space, {
                    "framework": "torch" if args.torch else "tf",
                }),
                "pol2": (None, Discrete(6), TwoStepGame.action_space, {
                    "framework": "torch" if args.torch else "tf",
                }),
            },
            "policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
        },
        "model": {
            "custom_model": "cc_model",
        },
        "framework": "torch" if args.torch else "tf",
    }

    stop = {