def __init__(self, metadata):
    self.gridworldwindow = GridWorldWindow(metadata=metadata)
    self.mdp = GridMDP(metadata=metadata)

    self.gridworldwindow.btn_value_iteration_1_step.configure(command=self._value_iteration_1_step)
    self.gridworldwindow.btn_value_iteration_100_steps.configure(command=self._value_iteration_100_steps)
    self.gridworldwindow.btn_value_iteration_slow.configure(command=self._value_iteration_slow)
    self.gridworldwindow.btn_policy_iteration_1_step.configure(command=self._policy_iteration_1_step)
    self.gridworldwindow.btn_policy_iteration_100_steps.configure(command=self._policy_iteration_100_steps)
    self.gridworldwindow.btn_policy_iteration_slow.configure(command=self._policy_iteration_slow)
    self.gridworldwindow.btn_reset.configure(command=self._reset_grid)
def setup_mdp():
    """
    Setup MDP with diagonal grid actions.
    :return: MDP object
    """
    grid_actions = GridMDP.DIAGONAL_ACTIONS
    reward_function = setup_reward_function(grid_actions)
    grid_mdp = GridMDP(grid_actions, reward_function)
    return grid_mdp
def setup_mdp(grid_actions, reward_function):
    """
    Setup MDP from the given grid actions and reward function.
    :param grid_actions: MDP actions
    :param reward_function: MDP reward function
    :return: MDP object
    """
    grid_mdp = GridMDP(grid_actions, reward_function)
    return grid_mdp
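# Hypothetical usage sketch (not part of the original source): it assumes that
# setup_reward_function, as used in the zero-argument setup_mdp variant above,
# returns a reward function accepted by GridMDP.
actions = GridMDP.DIAGONAL_ACTIONS
reward_function = setup_reward_function(actions)
grid_mdp = setup_mdp(actions, reward_function)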
import time

# value_iteration, policy_extraction, policy_iteration, values_converged,
# policy_converged, GridWorldWindow and GridMDP are assumed to come from the
# surrounding project.


class ViewController(object):

    def __init__(self, metadata):
        self.gridworld = GridWorldWindow(metadata=metadata)
        self.mdp = GridMDP(metadata=metadata)

        # bind buttons
        self.gridworld.btn_value_iteration_1_step.configure(
            command=self._value_iteration_1_step)
        self.gridworld.btn_value_iteration_100_steps.configure(
            command=self._value_iteration_100_steps)
        self.gridworld.btn_value_iteration_slow.configure(
            command=self._value_iteration_slow)
        self.gridworld.btn_policy_iteration_1_step.configure(
            command=self._policy_iteration_1_step)
        self.gridworld.btn_policy_iteration_100_steps.configure(
            command=self._policy_iteration_100_steps)
        self.gridworld.btn_policy_iteration_slow.configure(
            command=self._policy_iteration_slow)
        self.gridworld.btn_reset.configure(command=self._reset_grid)

    def _value_iteration_1_step(self):
        values = value_iteration(self.mdp.values, self.mdp, num_iter=1)
        policy = policy_extraction(values, self.mdp)
        self.gridworld.update_grid(values, policy)
        self.mdp.update_values(values)
        self.mdp.update_policy(policy)

    def _value_iteration_100_steps(self):
        values = value_iteration(self.mdp.values, self.mdp, num_iter=100)
        policy = policy_extraction(values, self.mdp)
        self.gridworld.update_grid(values, policy)
        self.mdp.update_values(values)
        self.mdp.update_policy(policy)

    def _value_iteration_slow(self):
        # run one iteration of value iteration at a time until convergence
        old_values = dict(self.mdp.values)
        for i in range(100):
            values = value_iteration(self.mdp.values, self.mdp, num_iter=1)
            policy = policy_extraction(values, self.mdp)
            self.gridworld.update_grid(values, policy)
            self.mdp.update_values(values)
            self.mdp.update_policy(policy)
            self.gridworld.window.update()
            time.sleep(0.25)
            self.gridworld.window.update()

            new_values = dict(values)
            if values_converged(new_values, old_values):
                break
            old_values = new_values

        self.gridworld.show_dialog(
            'Value Iteration has converged in {} steps!'.format(i + 1))

    def _policy_iteration_1_step(self):
        policy, values = policy_iteration(self.mdp.policy, self.mdp, num_iter=1)
        self.gridworld.update_grid(values, policy)
        self.mdp.update_values(values)
        self.mdp.update_policy(policy)

    def _policy_iteration_100_steps(self):
        policy, values = policy_iteration(self.mdp.policy, self.mdp, num_iter=100)
        self.gridworld.update_grid(values, policy)
        self.mdp.update_values(values)
        self.mdp.update_policy(policy)

    def _policy_iteration_slow(self):
        # run one iteration of policy iteration at a time until convergence
        old_policy = dict(self.mdp.policy)
        for i in range(100):
            policy, values = policy_iteration(self.mdp.policy, self.mdp, num_iter=1)
            self.gridworld.update_grid(values, policy)
            self.mdp.update_values(values)
            self.mdp.update_policy(policy)
            self.gridworld.window.update()
            time.sleep(0.25)
            self.gridworld.window.update()

            new_policy = dict(self.mdp.policy)
            if policy_converged(new_policy, old_policy):
                break
            old_policy = new_policy

        self.gridworld.show_dialog(
            'Policy Iteration has converged in {} steps!'.format(i + 1))

    def _reset_grid(self):
        self.mdp.clear()
        self.gridworld.clear()

    def run(self):
        # main UI loop
        self.gridworld.run()
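# Hypothetical usage sketch (not part of the original source): the structure of
# `metadata` (grid size, rewards, terminals, ...) is an assumption here;
# ViewController only forwards it to GridWorldWindow and GridMDP.
if __name__ == '__main__':
    metadata = {'width': 4, 'height': 3}  # illustrative placeholder contents
    controller = ViewController(metadata)
    controller.run()  # blocks in the GUI main loop until the window is closed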
# GridMDP, value_iteration, policy_iteration, best_policy, best_policyQ, qlearn
# and isnumber are assumed to be defined or imported elsewhere in this module
# (AIMA-style MDP code).

def print_table(table, header=None, sep='   ', numfmt='%g'):
    """Print a list of lists as a table, right-justifying numeric columns.
    If header is given, it is printed as the first row."""
    # signature, header guard and `justs` line follow the standard AIMA
    # utils.print_table helper
    justs = ['rjust' if isnumber(x) else 'ljust' for x in table[0]]
    if header:
        table = [header] + table
    table = [[(numfmt % x if isnumber(x) else x) for x in row] for row in table]
    maxlen = lambda seq: max(map(len, seq))
    sizes = list(map(maxlen, zip(*[list(map(str, row)) for row in table])))
    for row in table:
        print(sep.join(getattr(str(x), j)(size)
                       for (j, size, x) in zip(justs, sizes, row)))


# 3x3 grid: a +1 prize and a -1 trap as terminals, one blocked cell (None),
# and a -0.4 living cost in every other cell
prize = 1
trap = -1
neg = -0.4
mdp1 = GridMDP([[neg, trap, prize],
                [neg, None, neg],
                [neg, neg, neg]],
               terminals=[(1, 2), (2, 2)], error=.8)

print("GRID")
print()
print("Value iteration")
pi = best_policy(mdp1, value_iteration(mdp1, .01))
print_table(mdp1.to_arrows(pi))

print("Policy iteration")
print_table(mdp1.to_arrows(policy_iteration(mdp1)))

print("Q Learning")
pi = best_policyQ(mdp1, qlearn(mdp1, (0, 0), 5000))
print_table(mdp1.to_arrows(pi))
import numpy as np
import rospy
import torch
from nav_msgs.msg import OccupancyGrid

# RewardComponent, RewardFunction, StepReward, ObstacleReward, GridMDP and
# visualize_grid are project-local and assumed to be importable from the
# surrounding package.


class PathDeviationReward(RewardComponent):
    """
    Calculate the reward for deviating from the direct path between a start
    and a goal cell, where the direct path takes only static obstacles into
    account. Since multiple paths can be optimal in a grid, the path deviation
    reward is based on the minimum distance to any of them, as described in
    compute_path_dist.
    """

    def __init__(self, reward_shape, grid_actions, name="path_dev_reward", viz=True):
        """
        Initialize path deviation reward component.
        :param reward_shape: distance reward shape for path deviation reward
        :param grid_actions: MDP actions
        :param name: reward component name, for debugging
        :param viz: whether the reward component should be included for visualization
        """
        RewardComponent.__init__(self, name, viz)

        # MDP for calculating the direct path policy
        reward_direct_path = RewardFunction()
        reward_direct_path.add_component(ObstacleReward())
        reward_direct_path.add_component(
            StepReward(-1.0, [float('-inf'), 0], "direct_path_step"))
        self.direct_path_client = GridMDP(grid_actions, reward_direct_path)

        # MDP for calculating path distance based on the direct path policy
        reward_path_dist = RewardFunction()
        self.step_weight = -1.0
        reward_path_dist.add_component(
            StepReward(self.step_weight, [float('-inf'), 0], "path_dist_step"))
        self.path_dist_client = GridMDP(GridMDP.DIAGONAL_ACTIONS, reward_path_dist)

        # reward shape for distance to the path
        self.dist_reward = reward_shape
        self.params = self.dist_reward.params
        self.valid_params_ranges = self.dist_reward.valid_params_ranges

        self.direct_path_pub = rospy.Publisher("direct_path_cells", OccupancyGrid,
                                               queue_size=1)

    def compute_path_dist(self, nav_map, start, goal):
        """
        For every state, calculate the minimum distance to the direct path
        between start and goal, taking only static obstacles into account.
        :param nav_map: map of environment
        :param start: start cell
        :param goal: goal cell
        :return: distance map

        Multiple paths between start and goal can have minimal length. To
        resolve this ambiguity, we first compute the optimal policy of the
        static-obstacle MDP (direct_path_client). Propagating the optimal
        policy from the start to the goal yields the states along all shortest
        paths, direct_path_cells. Afterwards, we perform a wavefront expansion
        outwards from the direct_path_cells to determine the distance of each
        state to any direct path: value iteration on the path_dist_client MDP
        with a step weight of -1.0, treating the direct_path_cells as goal
        cells. Each state then holds the minimum step reward to the direct
        path, from which the path distance follows via the step weight.
        """
        # solve direct_path_client MDP for the optimal policy
        values, q_values = \
            self.direct_path_client.value_iteration(nav_map, [], start, goal,
                                                    softmax=False)
        direct_path_policy = \
            self.direct_path_client.compute_policy(values, q_values,
                                                   stochastic=False)

        # propagate the policy to find the states along all direct paths
        direct_path_cells = self.direct_path_client.propagate_policy(
            nav_map, start, goal, direct_path_policy, viz=False)
        direct_path_cells[goal] = 1.0

        # visualize direct path cells
        visualize_grid((direct_path_cells > 0) * 100, nav_map, rospy.Time.now(),
                       self.direct_path_pub)

        # perform wavefront expansion outwards from direct_path_cells by
        # solving the path_dist_client MDP
        wavefront_start = direct_path_cells > 0
        step_values, _ = \
            self.path_dist_client.value_iteration(nav_map, [], None,
                                                  wavefront_start, softmax=False)

        # convert step values to path distances
        path_dists = torch.from_numpy(self.step_weight *
                                      np.expand_dims(step_values, axis=2))
        return path_dists

    def compute_state_reward(self, nav_map, people_pos, start, goal):
        """
        Compute path deviation state reward.
        :param nav_map: map of environment
        :param people_pos: cell coordinates of people (unused)
        :param start: start cell
        :param goal: goal cell
        :return: path deviation reward
        """
        direct_path_dist = self.compute_path_dist(nav_map, start, goal)
        deviation_reward = self.dist_reward.compute_reward(direct_path_dist)
        return deviation_reward