Example #1
class ViewController(object):
    def __init__(self, metadata):
        self.gridworldwindow = GridWorldWindow(metadata=metadata)
        self.mdp = GridMDP(metadata=metadata)

        self.gridworldwindow.btn_value_iteration_1_step.configure(command=self._value_iteration_1_step)
        self.gridworldwindow.btn_value_iteration_100_steps.configure(command=self._value_iteration_100_steps)
        self.gridworldwindow.btn_value_iteration_slow.configure(command=self._value_iteration_slow)
        self.gridworldwindow.btn_policy_iteration_1_step.configure(command=self._policy_iteration_1_step)
        self.gridworldwindow.btn_policy_iteration_100_steps.configure(command=self._policy_iteration_100_steps)
        self.gridworldwindow.btn_policy_iteration_slow.configure(command=self._policy_iteration_slow)

        self.gridworldwindow.btn_reset.configure(command=self._reset_grid)
Example #2
def setup_mdp():
    """
    Set up an MDP with diagonal grid actions.
    :return: MDP object
    """
    grid_actions = GridMDP.DIAGONAL_ACTIONS
    reward_function = setup_reward_function(grid_actions)
    grid_mdp = GridMDP(grid_actions, reward_function)

    return grid_mdp
Example #3
def setup_mdp(grid_actions, reward_function):
    """
    Setup MDP with diagonal grid actions.
    :param grid_actions: MDP actions
    :param reward_function: mdp reward function
    :return: MDP object
    """

    grid_mdp = GridMDP(grid_actions, reward_function)

    return grid_mdp
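
For context, the arguments this helper expects match what Examples #4 and #7 later assemble by hand. A minimal usage sketch, assuming the RewardFunction / ObstacleReward / StepReward API shown in those examples (the component name "step" is a made-up placeholder):

# Sketch only: assumes the reward API shown in Examples #4 and #7.
reward_function = RewardFunction()
reward_function.add_component(ObstacleReward())
reward_function.add_component(StepReward(-1.0, [float('-inf'), 0], "step"))

grid_mdp = setup_mdp(GridMDP.DIAGONAL_ACTIONS, reward_function)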
Example #4
class PathDeviationReward(RewardComponent):
    def __init__(self,
                 reward_shape,
                 grid_actions,
                 name="path_dev_reward",
                 viz=True):
        """
        Initialize path deviation reward component
        :param reward_shape: distance reward shape for path deviation reward
        :param grid_actions: MDP actions
        :param name: reward component name, for debugging
        :param viz: whether the reward component should be included for visualization
        """

        RewardComponent.__init__(self, name, viz)

        # MDP for calculating direct path policy
        reward_direct_path = RewardFunction()
        reward_direct_path.add_component(ObstacleReward())
        reward_direct_path.add_component(
            StepReward(-1.0, [float('-inf'), 0], "direct_path_step"))
        self.direct_path_client = GridMDP(grid_actions, reward_direct_path)

        # MDP for calculating path distance based on direct path policy
        reward_path_dist = RewardFunction()
        self.step_weight = -1.0
        reward_path_dist.add_component(
            StepReward(self.step_weight, [float('-inf'), 0],
                       "path_dist_step"))
        self.path_dist_client = GridMDP(GridMDP.DIAGONAL_ACTIONS,
                                        reward_path_dist)

        # reward shape for distance to path
        self.dist_reward = reward_shape
        self.params = self.dist_reward.params
        self.valid_params_ranges = self.dist_reward.valid_params_ranges

        self.direct_path_pub = rospy.Publisher("direct_path_cells",
                                               OccupancyGrid,
                                               queue_size=1)
Example #5
class ViewController(object):
    def __init__(self, metadata):
        self.gridworld = GridWorldWindow(metadata=metadata)
        self.mdp = GridMDP(metadata=metadata)

        # bind buttons
        self.gridworld.btn_value_iteration_1_step.configure(
            command=self._value_iteration_1_step)
        self.gridworld.btn_value_iteration_100_steps.configure(
            command=self._value_iteration_100_steps)
        self.gridworld.btn_value_iteration_slow.configure(
            command=self._value_iteration_slow)
        self.gridworld.btn_policy_iteration_1_step.configure(
            command=self._policy_iteration_1_step)
        self.gridworld.btn_policy_iteration_100_steps.configure(
            command=self._policy_iteration_100_steps)
        self.gridworld.btn_policy_iteration_slow.configure(
            command=self._policy_iteration_slow)

        self.gridworld.btn_reset.configure(command=self._reset_grid)

    def _value_iteration_1_step(self):
        values = value_iteration(self.mdp.values, self.mdp, num_iter=1)
        policy = policy_extraction(values, self.mdp)
        self.gridworld.update_grid(values, policy)
        self.mdp.update_values(values)
        self.mdp.update_policy(policy)

    def _value_iteration_100_steps(self):
        values = value_iteration(self.mdp.values, self.mdp, num_iter=100)
        policy = policy_extraction(values, self.mdp)
        self.gridworld.update_grid(values, policy)
        self.mdp.update_values(values)
        self.mdp.update_policy(policy)

    def _value_iteration_slow(self):
        # run one iteration of value iteration at a time
        old_values = dict(self.mdp.values)
        for i in range(100):
            values = value_iteration(self.mdp.values, self.mdp, num_iter=1)
            policy = policy_extraction(values, self.mdp)
            self.gridworld.update_grid(values, policy)
            self.mdp.update_values(values)
            self.mdp.update_policy(policy)

            self.gridworld.window.update()
            time.sleep(0.25)
            self.gridworld.window.update()

            new_values = dict(values)
            if values_converged(new_values, old_values):
                break

            old_values = new_values
        self.gridworld.show_dialog(
            'Value Iteration has converged in {} steps!'.format(i + 1))

    def _policy_iteration_1_step(self):
        policy, values = policy_iteration(self.mdp.policy,
                                          self.mdp,
                                          num_iter=1)
        self.gridworld.update_grid(values, policy)
        self.mdp.update_values(values)
        self.mdp.update_policy(policy)

    def _policy_iteration_100_steps(self):
        policy_iteration(self.mdp, num_iter=100)
        self.gridworld.update_grid(self.mdp.values, self.mdp.policy)

    def _policy_iteration_slow(self):
        # run one iteration of policy iteration at a time
        old_policy = dict(self.mdp.policy)
        for i in range(100):
            policy_iteration(self.mdp, num_iter=1)
            self.gridworld.update_grid(self.mdp.values, self.mdp.policy)
            self.gridworld.window.update()
            time.sleep(0.25)
            self.gridworld.window.update()

            new_policy = dict(self.mdp.policy)
            if policy_converged(new_policy, old_policy):
                break

            old_policy = new_policy
        self.gridworld.show_dialog(
            'Policy Iteration has converged in {} steps!'.format(i + 1))

    def _reset_grid(self):
        self.mdp.clear()
        self.gridworld.clear()

    def run(self):
        # main UI loop
        self.gridworld.run()
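
A hypothetical usage sketch for this controller; the metadata value below is an assumption, since the examples never show what GridWorldWindow and GridMDP expect it to contain:

# Hypothetical usage; the metadata contents are an assumption.
metadata = {'grid_size': (4, 3)}
app = ViewController(metadata)
app.run()  # enters the main UI loop via self.gridworld.run()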
Example #6
# NOTE: the original snippet begins mid-way through an AIMA-style print_table
# helper (Python 2 syntax); the signature, docstring and the `justs` line are
# reconstructed assumptions, and isnumber is assumed to come from the
# accompanying utils module.
def print_table(table, header=None, sep='   ', numfmt='%g'):
  """Print a list of lists as a table, so that columns line up nicely."""
  justs = ['rjust' if isnumber(x) else 'ljust' for x in table[0]]
  if header:
    table = [header] + table
  table = [[(numfmt % x if isnumber(x) else x) for x in row]
           for row in table]
  maxlen = lambda seq: max(map(len, seq))
  sizes = map(maxlen, zip(*[map(str, row) for row in table]))
  for row in table:
    print sep.join(getattr(str(x), j)(size)
                   for (j, size, x) in zip(justs, sizes, row))

prize = 1
trap = -1
neg = -0.4

mdp1 = GridMDP([[neg, trap, prize],
                [neg, None, neg],
                [neg,  neg, neg]],
                terminals=[(1, 2), (2, 2)],
                error=.8)

print "GRID"
print
print "Value iteration"
pi = best_policy(mdp1, value_iteration(mdp1, .01))
print_table(mdp1.to_arrows(pi))

print "Policy iteration"
print_table(mdp1.to_arrows(policy_iteration(mdp1)))

print "Q Learning"
pi = best_policyQ(mdp1, qlearn(mdp1, (0, 0), 5000))
print_table(mdp1.to_arrows(pi))
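
The value_iteration call above performs the standard Bellman backup. A minimal sketch of that update, assuming the usual AIMA-style MDP interface (mdp.states, mdp.actions(s), mdp.T(s, a) returning (probability, next_state) pairs, mdp.R(s), mdp.gamma); it illustrates the idea and is not the exact function used above:

def value_iteration_sketch(mdp, epsilon=0.001):
    # Repeat Bellman backups until the value change is within the usual bound.
    U = dict((s, 0.0) for s in mdp.states)
    while True:
        U_new, delta = {}, 0.0
        for s in mdp.states:
            U_new[s] = mdp.R(s) + mdp.gamma * max(
                sum(p * U[s1] for (p, s1) in mdp.T(s, a))
                for a in mdp.actions(s))
            delta = max(delta, abs(U_new[s] - U[s]))
        U = U_new
        if delta <= epsilon * (1 - mdp.gamma) / mdp.gamma:
            return U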
Example #7
class PathDeviationReward(RewardComponent):
    """
    Calculate the reward for deviating from the direct path between a start
    and a goal cell, where the direct path takes only static obstacles into
    account. Since multiple paths in a grid can be equally short, the path
    deviation reward is based on the minimum distance to any of them, as
    described below in compute_path_dist.
    """
    def __init__(self,
                 reward_shape,
                 grid_actions,
                 name="path_dev_reward",
                 viz=True):
        """
        Initialize path deviation reward component
        :param reward_shape: distance reward shape for path deviation reward
        :param grid_actions: MDP actions
        :param name: reward component name, for debugging
        :param viz: whether the reward component should be included for visualization
        """

        RewardComponent.__init__(self, name, viz)

        # MDP for calculating direct path policy
        reward_direct_path = RewardFunction()
        reward_direct_path.add_component(ObstacleReward())
        reward_direct_path.add_component(
            StepReward(-1.0, [float('-inf'), 0], "direct_path_step"))
        self.direct_path_client = GridMDP(grid_actions, reward_direct_path)

        # MDP for calculating path distance based on direct path policy
        reward_path_dist = RewardFunction()
        self.step_weight = -1.0
        reward_path_dist.add_component(
            StepReward(self.step_weight, [float('-inf'), 0],
                       "path_dist_step"))
        self.path_dist_client = GridMDP(GridMDP.DIAGONAL_ACTIONS,
                                        reward_path_dist)

        # reward shape for distance to path
        self.dist_reward = reward_shape
        self.params = self.dist_reward.params
        self.valid_params_ranges = self.dist_reward.valid_params_ranges

        self.direct_path_pub = rospy.Publisher("direct_path_cells",
                                               OccupancyGrid,
                                               queue_size=1)

    def compute_path_dist(self, nav_map, start, goal):
        """
        For every state, calculate the minimum distance to the direct path
        between start and goal, taking only static obstacles into account.
        :param nav_map: map of environment
        :param start: start cell
        :param goal: goal cell
        :return: distance map

        Multiple paths between start and goal can be of minimal length, so the
        direct path is ambiguous. To resolve this, we first compute the optimal
        policy of an MDP that takes only static obstacles into account, using
        the direct_path_client mdp. Propagating that policy from the start to
        the goal yields the states along all shortest paths, direct_path_cells.
        We then perform a wavefront expansion outwards from the
        direct_path_cells to determine each state's distance to the nearest
        direct path: value iteration on the path_dist_client mdp with a step
        weight of -1.0, treating the direct_path_cells as goal cells. Each
        state then carries the minimum step reward to the direct path, which
        the step weight converts into a path distance.
        """

        # solve direct_path_client mdp for optimal policy
        values, q_values = \
            self.direct_path_client.value_iteration(nav_map, [], start, goal,
                                                    softmax=False)
        direct_path_policy = \
            self.direct_path_client.compute_policy(values, q_values, stochastic=False)

        # propagate policy for states along direct paths
        direct_path_cells = self.direct_path_client.propagate_policy(
            nav_map, start, goal, direct_path_policy, viz=False)
        direct_path_cells[goal] = 1.0

        # visualize direct path cells
        visualize_grid((direct_path_cells > 0) * 100, nav_map,
                       rospy.Time.now(), self.direct_path_pub)

        # perform wavefront expansion from direct_path_cells outwards by solving
        # the path_dist_client mdp
        wavefront_start = direct_path_cells > 0
        step_values, _ = \
            self.path_dist_client.value_iteration(nav_map, [], None, wavefront_start,
                                                  softmax=False)

        # convert step values to path distances
        path_dists = torch.from_numpy(self.step_weight *
                                      np.expand_dims(step_values, axis=2))

        return path_dists

    def compute_state_reward(self, nav_map, people_pos, start, goal):
        """
        Compute path deviation state reward.
        :param nav_map: map of environment
        :param people_pos: cell coordinates of people (unused)
        :param start: start cell
        :param goal: goal cell
        :return: path deviation reward
        """

        direct_path_dist = self.compute_path_dist(nav_map, start, goal)
        deviation_reward = self.dist_reward.compute_reward(direct_path_dist)

        return deviation_reward
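
The wavefront expansion described in compute_path_dist can also be pictured as a plain breadth-first search over the grid. A minimal standalone sketch, assuming boolean NumPy occupancy arrays instead of the GridMDP API used above and 4-connected steps only, so it illustrates the idea rather than reproducing the diagonal-action MDP version:

import numpy as np
from collections import deque

def wavefront_distance(free, sources):
    # free:    boolean HxW array, True where a cell is traversable
    # sources: boolean HxW array, True for the cells the wave expands from
    #          (the direct_path_cells in the example above)
    dist = np.full(free.shape, np.inf)
    queue = deque()
    for cell in zip(*np.nonzero(sources & free)):
        dist[cell] = 0.0
        queue.append(cell)
    while queue:
        r, c = queue.popleft()
        for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1)):
            nr, nc = r + dr, c + dc
            if (0 <= nr < free.shape[0] and 0 <= nc < free.shape[1]
                    and free[nr, nc] and dist[nr, nc] > dist[r, c] + 1):
                dist[nr, nc] = dist[r, c] + 1
                queue.append((nr, nc))
    return dist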