Example #1
    def get_all_states(self):
        """Returns all states.

        """
        states = []
        for x in range(1, self.width + 1):
            for y in range(1, self.height + 1):
                state = NavigationWorldState(x, y)
                if self.is_goal(x, y) and self.is_goal_terminal:
                    state.set_terminal(True)
                states.append(state)
        return states
Example #2
    def get_all_states(self):
        """Returns all states.

        """
        return [
            NavigationWorldState(x, y) for x in range(1, self.width + 1)
            for y in range(1, self.height + 1)
        ]
Example #3
    def get_value_grid(self):
        """Returns value over states space grid.

        """
        value_iter = self.run_value_iteration()
        V = np.zeros((self.height, self.width), dtype=np.float32)
        for row in range(self.height):
            for col in range(self.width):
                x, y = self._rowcol_to_xy(row, col)
                V[row, col] = value_iter.value_func[NavigationWorldState(x, y)]
        return V
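The grid above is indexed by (row, col) rather than (x, y), which makes it convenient to plot directly as an image. A minimal usage sketch follows; it assumes NavigationWorldMDP is already in scope with its default constructor arguments and that matplotlib is installed (none of this is part of the source).

# Hypothetical usage: visualize the value function as a heatmap.
import matplotlib.pyplot as plt

mdp = NavigationWorldMDP()      # assumed default construction
V = mdp.get_value_grid()        # numpy array of shape (height, width)

plt.imshow(V, cmap="viridis")
plt.colorbar(label="V(s)")
plt.title("Value function over the navigation grid")
plt.show()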
Example #4
    def sample_traj_init_state(self, idx=None):
        """Samples trajectory init state (GridWorldState).

        The types of init cells to sample from can be specified using
        set_traj_init_cell_types().
        """
        if idx is None:
            rand_idx = np.random.randint(len(self.traj_init_cell_row_idxs))
        else:
            assert 0 <= idx < len(self.traj_init_cell_row_idxs)
            rand_idx = idx

        x, y = self._rowcol_to_xy(self.traj_init_cell_row_idxs[rand_idx],
                                  self.traj_init_cell_col_idxs[rand_idx])
        return NavigationWorldState(x, y)
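A brief usage sketch (the instance and cell types are assumptions, not from the source): restrict sampling to particular cell types first, then draw an init state.

# Hypothetical usage: sample a trajectory init state from "white" or "yellow" cells.
mdp.set_traj_init_cell_types(["white", "yellow"])   # assumes these types were registered
s0 = mdp.sample_traj_init_state()
print(s0.x, s0.y)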
Example #5
    def _transition_func(self, state, action):
        """
        Args:
            state (State)
            action (str)

        Returns:
            (State)
        """
        if state.is_terminal():
            return state

        r = random.random()
        if self.slip_prob > r:
            # Flip dir.
            if action == "up":
                action = random.choice(["left", "right"])
            elif action == "down":
                action = random.choice(["left", "right"])
            elif action == "left":
                action = random.choice(["up", "down"])
            elif action == "right":
                action = random.choice(["up", "down"])

        if action == "up" and state.y < self.height and not self.is_wall(
                state.x, state.y + 1):
            next_state = NavigationWorldState(state.x, state.y + 1)
        elif action == "down" and state.y > 1 and not self.is_wall(
                state.x, state.y - 1):
            next_state = NavigationWorldState(state.x, state.y - 1)
        elif action == "right" and state.x < self.width and not self.is_wall(
                state.x + 1, state.y):
            next_state = NavigationWorldState(state.x + 1, state.y)
        elif action == "left" and state.x > 1 and not self.is_wall(state.x - 1,
                                                                   state.y):
            next_state = NavigationWorldState(state.x - 1, state.y)
        else:
            next_state = NavigationWorldState(state.x, state.y)

        if self.is_goal(next_state.x, next_state.y) and self.is_goal_terminal:
            next_state.set_terminal(True)

        return next_state
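The slip branch above redirects the intended action to one of the two perpendicular directions with probability slip_prob. A quick empirical check of that behavior could look like the sketch below; the constructor arguments and the starting cell are illustrative assumptions, and NavigationWorldMDP / NavigationWorldState are assumed to be in scope.

# Hypothetical sanity check: estimate how often an "up" action slips sideways.
from collections import Counter

mdp = NavigationWorldMDP(slip_prob=0.2)   # illustrative slip probability
start = NavigationWorldState(5, 5)        # an open, non-goal cell under the defaults

outcomes = Counter()
for _ in range(10000):
    nxt = mdp._transition_func(start, "up")
    outcomes[(nxt.x - start.x, nxt.y - start.y)] += 1

# Roughly 80% of samples should land on (0, 1) ("up"); the remaining ~20%
# should split between (-1, 0) and (1, 0) due to slips.
print(outcomes)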
Example #6
    def __init__(self,
                 width=30,
                 height=30,
                 nav_cell_types=["white", "yellow", "red", "green", "purple"],
                 nav_cell_rewards=[0, 0, -10, -10, -10],
                 nav_cell_p_or_locs=[0.68, 0.17, 0.05, 0.05, 0.05],
                 wall_cell_types=[],
                 wall_cell_rewards=[],
                 wall_cell_locs=[],
                 goal_cell_types=["blue"],
                 goal_cell_rewards=[1.],
                 goal_cell_locs=[[(21, 21)]],
                 init_loc=(1, 1),
                 rand_init=False,
                 gamma=0.99, slip_prob=0.00, step_cost=0.5,
                 is_goal_terminal=True,
                 name="Navigation MDP"):
        """Navigation World MDP constructor.

        Args:
            height (int): Navigation grid height (in no. of cells).
            width (int): Navigation grid width (in no. of cells).
            nav_cell_types (list of <str / int>): Navigation cell types.
            nav_cell_rewards (list of float): Rewards associated with
                @nav_cell_types.
            nav_cell_p_or_locs (list of <float / list of tuples (int x, int y)>):
                Probability of each @nav_cell_types in the cell distribution,
                or a list of fixed/forced locations [(x, y), ...].
                (The default probabilities are chosen to be somewhat larger
                than the percolation threshold for a square lattice, as an
                approximation to the cell distribution in the paper.)
            goal_cell_types (list of <str / int>): Goal cell types.
            goal_cell_locs (list of list of tuples (int, int)): Goal cell
                locations [(x,y),...] associated with @goal_cell_types.
            goal_cell_rewards (list of float): Rewards associated with
                @goal_cell_types.
            init_loc (tuple (int x, int y)): Initial cell, used to compute
                state space reachability and run value iteration.
            gamma (float): MDP discount factor.
            slip_prob (float): Probability that the agent slips to the left
                or right of the intended cell.
            step_cost (float): Living penalty (cost incurred at each step).
            is_goal_terminal (bool): If True, goal states are terminal.
        Note:
            Locations are specified in (x,y) format, but (row, col)
            convention is used while storing in memory.
        """
        assert len(nav_cell_types) == len(nav_cell_rewards) \
            == len(nav_cell_p_or_locs)
        assert len(wall_cell_types) == len(wall_cell_rewards) \
            == len(wall_cell_locs)
        assert len(goal_cell_types) == len(goal_cell_rewards) \
            == len(goal_cell_locs)

        self.width = width
        self.height = height
        self.init_loc = init_loc
        self.rand_init = rand_init
        self.gamma = gamma
        self.slip_prob = slip_prob
        self.step_cost = step_cost
        self.is_goal_terminal = is_goal_terminal
        self.name = name
        self.goal_cell_locs = goal_cell_locs

        # Setup init location.
        if rand_init:
            init_loc = random.randint(1, width), random.randint(1, height)
            while self.is_wall(*init_loc) or self.is_goal(*init_loc):
                init_loc = random.randint(1, width), random.randint(1, height)
        self.init_loc = init_loc
        # Construct base class
        MDP.__init__(self, NavigationWorldMDP.ACTIONS, self._transition_func,
                     self._reward_func,
                     init_state=NavigationWorldState(*init_loc),
                     gamma=gamma)

        # Navigation MDP
        self.__reset_nav_mdp()
        self.__register_cell_types(nav_cell_types, wall_cell_types,
                                   goal_cell_types)
        self.__add_cells_by_locs(self.goal_cell_ids, goal_cell_locs,
                                 NavigationWorldMDP.CELL_KIND_GOAL)
        self.__add_cells_by_locs(self.wall_cell_ids, wall_cell_locs,
                                 NavigationWorldMDP.CELL_KIND_WALL)
        self.__add_nav_cells(self.nav_cell_ids, nav_cell_p_or_locs,
                             NavigationWorldMDP.CELL_KIND_NAV)
        self.__check_state_map()

        self.__register_cell_rewards(nav_cell_rewards,
                                     wall_cell_rewards, goal_cell_rewards)

        # Initialize value iteration object (computes reachable states)
        self.value_iter = ValueIteration(self, sample_rate=1)
        self._policy_invalidated = True
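Putting the constructor arguments together, a small instantiation might look like the sketch below. The grid size, cell types, rewards, and locations are illustrative assumptions, not values taken from the source.

# Hypothetical 10x10 world with one goal type and one penalized nav cell type.
mdp = NavigationWorldMDP(
    width=10, height=10,
    nav_cell_types=["white", "red"],
    nav_cell_rewards=[0, -5],
    nav_cell_p_or_locs=[0.9, 0.1],     # 90% white cells, 10% red cells
    goal_cell_types=["blue"],
    goal_cell_rewards=[1.0],
    goal_cell_locs=[[(10, 10)]],       # goal at (x=10, y=10)
    init_loc=(1, 1),
    gamma=0.95, slip_prob=0.05, step_cost=0.1,
)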
Example #7
    def sample_trajectories(self,
                            n_traj,
                            horizon,
                            init_states=None,
                            init_cell_types=None,
                            init_unique=False,
                            policy=None,
                            rand_init_to_match_n_traj=True):
        """Samples trajectories.

        Args:
            n_traj (int): Number of trajectories to sample.
            horizon (int): Planning horizon (max trajectory length).
            init_states:
                None - to use randomly sampled init states
                [GridWorldState(x,y),...] - to use specific init states
            init_cell_types (list): Cell types from which init states may be
                sampled (passed to set_traj_init_cell_types()).
            init_unique (bool): When set to True, this will sample every
                possible init state and try not to repeat an init state
                unless @n_traj > @self.num_traj_init_states.
            policy (fn): S->A
            rand_init_to_match_n_traj (bool): If True, this will always return
                @n_traj many trajectories. If the number of unique init states
                is less than @n_traj, this will override the effect of
                @init_unique and sample repeated init states.
        Returns:
            (traj_states_list, traj_actions_list) where
                traj_states: [s1, s2, ..., sT]
                traj_actions: [a1, a2, ..., aT]
        """
        assert len(init_cell_types) >= 1

        self.set_traj_init_cell_types(init_cell_types)
        traj_states_list = []
        traj_action_list = []
        traj_init_states = []

        if init_states is None:
            # If no init_states are provided, sample n_traj many init states.
            traj_init_states = self.sample_init_states(n_traj,
                                                       init_unique=init_unique)
        else:
            if type(init_states[0]) == tuple or type(init_states[0]) == list:
                init_states = [NavigationWorldState(*s) for s in init_states]
            traj_init_states = copy.deepcopy(init_states)

        if len(traj_init_states) >= n_traj:
            traj_init_states = traj_init_states[:n_traj]
        elif rand_init_to_match_n_traj:
            # If # init_states < n_traj, sample remaining ones
            if len(traj_init_states) < n_traj:
                traj_init_states += self.sample_init_states(
                    n_traj - len(traj_init_states),
                    init_unique=init_unique,
                    skip_states=traj_init_states)

            # If rand_init_to_match_n_traj is set to True, sample more
            # init_states if needed (may not be unique)
            if rand_init_to_match_n_traj and len(traj_init_states) < n_traj:
                traj_init_states += self.sample_init_states(n_traj,
                                                            init_unique=False)

        if policy is None:
            if len(self.goal_cell_locs) == 0:
                print("Running value iteration with no goals assigned..")
            policy = self.run_value_iteration().policy

        for init_state in traj_init_states:
            action_seq, state_seq = self.policy_propagate(init_state,
                                                          policy=policy,
                                                          horizon=horizon)
            traj_states_list.append(state_seq)
            traj_action_list.append(action_seq)

        return traj_states_list, traj_action_list
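A final usage sketch (parameter values are assumptions): when no policy is passed, the method falls back to the policy obtained from value iteration, so trajectories can be sampled directly from a constructed MDP.

# Hypothetical usage: sample 5 trajectories of at most 20 steps,
# starting only from "white" cells; assumes `mdp` is a NavigationWorldMDP.
states_list, actions_list = mdp.sample_trajectories(
    n_traj=5,
    horizon=20,
    init_cell_types=["white"],   # required; asserted non-empty by the method
    init_unique=True,            # avoid repeating init states when possible
)

for states, actions in zip(states_list, actions_list):
    print(len(states), "states,", len(actions), "actions")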