def test_uncalibrated_agents(self):
    grid = [['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
            ['X',  -9, ' ', 'X', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', 'A', ' ', ' ', ' ', ' ', ' ', ' ', ' ',   3, 'X'],
            ['X', ' ', ' ', 'X',  -9,  -9,  -9,  -9,  -9, ' ', 'X'],
            ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X']]
    n, s, e, w, stay = self.all_actions
    mdp = GridworldMdp(grid, living_reward=-0.1, noise=0.2)
    env = Mdp(mdp)

    agent1 = agents.OptimalAgent(gamma=0.9, num_iters=50)
    agent1.set_mdp(mdp)
    actions, _ = self.run_on_env(agent1, env, gamma=0.9, episode_length=13)
    self.assertEqual(actions, [e, e, e, n, e, e, e, e, e, s, stay, stay, stay])

    agent2 = agents.UncalibratedAgent(gamma=0.9, num_iters=20,
                                      calibration_factor=5)
    agent2.set_mdp(mdp)
    actions, _ = self.run_on_env(agent2, env, gamma=0.9, episode_length=13)
    self.assertEqual(actions,
                     [e, e, e, e, e, e, e, e, stay, stay, stay, stay, stay])

    agent3 = agents.UncalibratedAgent(gamma=0.9, num_iters=20,
                                      calibration_factor=0.5)
    agent3.set_mdp(mdp)
    actions, _ = self.run_on_env(agent3, env, gamma=0.9, episode_length=13)
    self.assertEqual(actions, [s, e, n, e, e, n, e, e, e, e, e, s, stay])
def test_myopic_agent(self):
    grid = ['XXXXXXXX',
            'XA     X',
            'X XXXX9X',
            'X      X',
            'X X2   X',
            'XXXXXXXX']
    n, s, e, w, stay = self.all_actions
    mdp = GridworldMdp(grid, living_reward=-0.1)
    env = Mdp(mdp)

    optimal_agent = agents.OptimalAgent(gamma=0.9, num_iters=20)
    optimal_agent.set_mdp(mdp)
    actions, _ = self.run_on_env(optimal_agent, env, gamma=0.9, episode_length=10)
    self.assertEqual(actions, [e, e, e, e, e, s, stay, stay, stay, stay])

    myopic_agent = agents.MyopicAgent(6, gamma=0.9, num_iters=20)
    myopic_agent.set_mdp(mdp)
    actions, _ = self.run_on_env(myopic_agent, env, gamma=0.9, episode_length=10)
    self.assertEqual(actions, [s, s, e, e, e, e, e, n, stay, stay])
def get_policy(agent, grid):
    """Returns the policy of the given agent on the given grid."""
    from gridworld.gridworld import GridworldMdp, Direction
    from utils import Distribution

    num_actions = len(Direction.ALL_DIRECTIONS)
    mdp = GridworldMdp(grid=grid)
    agent.set_mdp(mdp)

    def dist_to_numpy(dist):
        return dist.as_numpy_array(Direction.get_number_from_direction, num_actions)

    def action(state):
        # Walls are invalid states and the MDP will refuse to give an action for
        # them. However, the VIN's architecture requires it to provide an action
        # distribution for walls too, so hardcode it to always be STAY.
        x, y = state
        if mdp.walls[y][x]:
            return dist_to_numpy(Distribution({Direction.STAY: 1}))
        return dist_to_numpy(agent.get_action_distribution(state))

    imsize = len(grid)
    # Possible bug: this indexing may be transposed; building the array with
    # (y, x) states gives the expected SVF vector downstream.
    action_dists = [[action((x, y)) for y in range(imsize)] for x in range(imsize)]
    action_dists = np.array(action_dists)
    return action_dists
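# Illustrative usage sketch for get_policy (not from the original file). The
# grid and the OptimalAgent hyperparameters below are made-up assumptions; any
# agent exposing set_mdp / get_action_distribution should work the same way.
# Note that get_policy assumes a square grid, since it uses len(grid) for both axes.
def _example_get_policy_usage():
    import agents  # assumed importable here, as in the tests above
    simple_grid = ['XXXXX',
                   'XA  X',
                   'X   X',
                   'X  3X',
                   'XXXXX']
    policy = get_policy(agents.OptimalAgent(gamma=0.9, num_iters=20), simple_grid)
    # policy[x][y] is a length-num_actions distribution over directions at (x, y)
    return policy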
def show_agents(grids, agent_list, agent_names, grid_names,
                filename='AgentComparison', figtitle=''):
    """Shows how agents perform on a set of gridworlds.

    grids - list of gridworlds (see examples in the earlier part of this file)
    agent_list - list of agent objects
    agent_names - list of agent names (strings)
    grid_names - list of gridworld names (strings)
    """
    num_ex = len(agent_list)
    num_grids = len(grids)
    fig, axes_grid = plt.subplots(num_grids, num_ex, figsize=(14.0, 4.5))
    if num_grids == 1:
        axes_grid = [axes_grid]

    for i, axes in enumerate(axes_grid):
        # Give each gridworld a name (uncomment to do so)
        # axes[0].set_ylabel(grid_names[i])

        # Generate the MDP
        grid = grids[i]
        mdp = GridworldMdp(grid, noise=0.2)
        walls, reward, start = mdp.convert_to_numpy_input()

        for idx, agent in enumerate(agent_list):
            ax = axes[idx]
            ax.set_aspect('equal')
            plot_reward(reward, walls, '', fig=fig, ax=ax)
            plot_trajectory(walls, reward, start, agent, arrow_width=0.35,
                            fig=fig, ax=ax)
            # Only write agent names above the first row
            if i == 0:
                ax.set_title(agent_names[idx], fontname='Times New Roman',
                             fontsize=16)
                print('Agent {} is {}'.format(agent_names[idx], agent))

    # Increase vertical space between subplots
    # fig.subplots_adjust(hspace=0.2)
    # fig.suptitle(figtitle)
    fig.savefig(filename, bbox_inches='tight', dpi=500)
    print("Saved figure to {}.png".format(filename))
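# Illustrative sketch of calling show_agents (not from the original file). The
# grid, agent choices, and hyperparameters are assumptions made for the example.
def _example_show_agents():
    import agents  # assumed importable here
    demo_grid = ['XXXXXX',
                 'XA   X',
                 'X  X3X',
                 'X    X',
                 'XXXXXX']
    agent_list = [agents.OptimalAgent(gamma=0.9, num_iters=20),
                  agents.MyopicAgent(6, gamma=0.9, num_iters=20)]
    show_agents([demo_grid], agent_list,
                agent_names=['Optimal', 'Myopic'],
                grid_names=['Demo'],
                filename='DemoComparison')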
def test_visitations(grid, agent):
    """Tests the expected_counts calculation; the suspected culprit is an einsum error."""
    # print("Testing expected_counts")
    from gridworld.gridworld import GridworldMdp, Direction
    from utils import Distribution

    num_actions = len(Direction.ALL_DIRECTIONS)
    mdp = GridworldMdp(grid=grid)
    agent.set_mdp(mdp)

    def dist_to_numpy(dist):
        return dist.as_numpy_array(Direction.get_number_from_direction, num_actions)

    def action(state):
        # Walls are invalid states and the MDP will refuse to give an action for
        # them. However, the VIN's architecture requires it to provide an action
        # distribution for walls too, so hardcode it to always be STAY.
        x, y = state
        if mdp.walls[y][x]:
            return dist_to_numpy(Distribution({Direction.STAY: 1}))
        return dist_to_numpy(agent.get_action_distribution(state))

    imsize = len(grid)
    action_dists = [[action((x, y)) for y in range(imsize)] for x in range(imsize)]
    action_dists = np.array(action_dists)

    walls, rewards, start_state = mdp.convert_to_numpy_input()
    # print("Start state for given mdp:", start_state)
    start = start_state

    trans = mdp.get_transition_matrix()
    initial_states = np.zeros((len(grid), len(grid)))
    initial_states[start[1]][start[0]] = 1
    initial_states = initial_states.reshape(-1)
    policy = flatten_policy(action_dists)

    demo_counts = expected_counts(policy, trans, initial_states, 20, 0.9)

    import matplotlib.pyplot as plt
    # imsave needs a file extension (or explicit format) to know how to encode the image
    plt.imsave("democounts.png", demo_counts.reshape((len(grid), len(grid))))
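# For reference, a minimal sketch of the discounted state-visitation recurrence
# that expected_counts is assumed to implement (the real function is defined
# elsewhere; its signature, flattening order, and discounting convention are
# assumptions here, which is exactly what test_visitations is probing).
def _expected_counts_reference(policy, trans, initial_states, horizon, gamma):
    """policy: (num_states, num_actions), trans: (num_states, num_actions,
    num_states), initial_states: (num_states,). Returns discounted visitation
    counts of shape (num_states,)."""
    state_probs = np.array(initial_states, dtype=float)
    counts = state_probs.copy()
    for t in range(1, horizon):
        # D_t[k] = sum over states i and actions j of D_{t-1}[i] * pi(j|i) * T(k|i,j)
        state_probs = np.einsum("i,ij,ijk->k", state_probs, policy, trans)
        counts += (gamma ** t) * state_probs
    return counts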
def main():
    grid = [['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
            ['X', ' ', -90, -90, -90, -90, '8', ' ', 'X'],
            ['X', 'A', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', ' ', ' ', -99, '2', ' ', ' ', ' ', 'X'],
            ['X', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', ' ', '1', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X']]
    mdp = GridworldMdp(grid, living_reward=-0.01, noise=0.2)
    env = Mdp(mdp)

    opt = fast_agents.FastOptimalAgent(gamma=0.95, num_iters=20)
    naive = NaiveTimeDiscountingAgent(10, 1, gamma=0.95, num_iters=20)
    soph = fast_agents.FastSophisticatedTimeDiscountingAgent(10, 1, gamma=0.95, num_iters=20)
    myopic = fast_agents.FastMyopicAgent(6, gamma=0.95, num_iters=20)
    over = fast_agents.FastUncalibratedAgent(gamma=0.95, num_iters=20,
                                             calibration_factor=5)
    under = fast_agents.FastUncalibratedAgent(gamma=0.95, num_iters=20,
                                              calibration_factor=0.5)
    agents = [opt, naive, soph, myopic, over, under]
    names = ['Optimal', 'Naive', 'Sophisticated', 'Myopic',
             'Overconfident', 'Underconfident']

    for name, agent in zip(names, agents):
        print('{} agent'.format(name))
        agent.set_mdp(mdp)
        trajectory = run_agent(agent, env, episode_length=50, determinism=True)
        if agent == naive:
            print([a for _, a, _, _ in trajectory])
        print_training_example(mdp, trajectory)

    print(opt.values.T)
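# Assuming this script is meant to be run directly:
if __name__ == '__main__':
    main()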
def test_irl(grid, agent):
    """Runs IRL on the given agent's policy; returns the walls, start state,
    inferred rewards, and true rewards."""
    from gridworld.gridworld import GridworldMdp, Direction
    from utils import Distribution

    num_actions = len(Direction.ALL_DIRECTIONS)
    mdp = GridworldMdp(grid=grid)
    agent.set_mdp(mdp)

    def dist_to_numpy(dist):
        return dist.as_numpy_array(Direction.get_number_from_direction, num_actions)

    def action(state):
        # Walls are invalid states and the MDP will refuse to give an action for
        # them. However, the VIN's architecture requires it to provide an action
        # distribution for walls too, so hardcode it to always be STAY.
        x, y = state
        if mdp.walls[y][x]:
            return dist_to_numpy(Distribution({Direction.STAY: 1}))
        return dist_to_numpy(agent.get_action_distribution(state))

    imsize = len(grid)
    # Possible bug: this indexing may be transposed; building the array with
    # (y, x) states gives the expected SVF vector downstream.
    action_dists = [[action((x, y)) for y in range(imsize)] for x in range(imsize)]
    action_dists = np.array(action_dists)

    walls, rewards, start_state = mdp.convert_to_numpy_input()
    # print("Start state for given mdp:", start_state)

    inferred = irl_wrapper(walls, action_dists, start_state, 20, 0.9)
    # print("---true below---")
    # print(rewards)
    return walls, start_state, inferred, rewards
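# Illustrative sketch of consuming test_irl's outputs (not from the original
# file). It assumes the inferred rewards come back with one value per grid
# cell; IRL recovers reward only up to scale, so compare normalized values.
def _example_compare_irl(grid, agent):
    walls, start_state, inferred, true_rewards = test_irl(grid, agent)
    inferred = np.asarray(inferred).reshape(walls.shape)  # shape is an assumption
    mask = (walls == 0)
    inferred_norm = inferred / (np.abs(inferred[mask]).max() + 1e-8)
    true_norm = true_rewards / (np.abs(true_rewards[mask]).max() + 1e-8)
    # Mean absolute error on non-wall cells
    return np.abs(inferred_norm - true_norm)[mask].mean()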
def test_coherence(grid, agent):
    """Test that these arrays behave as expected under np.einsum."""
    from gridworld.gridworld import GridworldMdp, Direction
    from utils import Distribution

    num_actions = len(Direction.ALL_DIRECTIONS)
    mdp = GridworldMdp(grid=grid)
    agent.set_mdp(mdp)

    def dist_to_numpy(dist):
        return dist.as_numpy_array(Direction.get_number_from_direction, num_actions)

    def action(state):
        # Walls are invalid states and the MDP will refuse to give an action for
        # them. However, the VIN's architecture requires it to provide an action
        # distribution for walls too, so hardcode it to always be STAY.
        x, y = state
        if mdp.walls[y][x]:
            return dist_to_numpy(Distribution({Direction.STAY: 1}))
        return dist_to_numpy(agent.get_action_distribution(state))

    imsize = len(grid)
    action_dists = [[action((x, y)) for y in range(imsize)] for x in range(imsize)]
    action_dists = np.array(action_dists)

    walls, rewards, start_state = mdp.convert_to_numpy_input()
    print("Start state for given mdp:", start_state)
    # inferred = _irl_wrapper(walls, action_dists, start_state, 20, 1.0)
    start = start_state

    trans = mdp.get_transition_matrix()
    initial_states = np.zeros((len(grid), len(grid)))
    initial_states[start[1]][start[0]] = 1
    initial_states = initial_states.reshape(-1)
    policy = flatten_policy(action_dists)

    gshape = (len(grid), len(grid))
    print("initial states")
    print('-' * 20)
    print(initial_states.reshape(gshape))

    # Each einsum step propagates state probabilities one timestep:
    # next[k] = sum over states i and actions j of prob[i] * pi(j|i) * T(k|i,j)
    next_states = np.einsum("i,ij,ijk -> k", initial_states, policy, trans)
    # next_states = (next_states.reshape(gshape).T).reshape(-1)
    print("first expected counts")
    print('-' * 20)
    print(next_states.reshape(gshape))

    next_states = np.einsum("i,ij,ijk -> k", next_states, policy, trans)
    print("second expected counts")
    print('-' * 20)
    print(next_states.reshape(gshape))

    next_states = np.einsum("i,ij,ijk -> k", next_states, policy, trans)
    # next_states = (next_states.reshape(gshape).T).reshape(-1)
    print("third expected counts")
    print('-' * 20)
    print(next_states.reshape(gshape))

    # for i in range(5):
    #     next_states = np.einsum("i,ij,ijk -> k", next_states, policy, trans)
    #     # next_states = (next_states.reshape(gshape).T).reshape(-1)
    #     print("{}th expected counts".format(4 + i))
    #     print('-' * 20)
    #     print(next_states.reshape(gshape))

    return next_states.reshape((len(grid), len(grid)))
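# For context, a minimal sketch of what flatten_policy is assumed to do:
# collapse the (imsize, imsize, num_actions) array of per-cell action
# distributions into the (num_states, num_actions) matrix the einsums above
# expect. The row-major flattening order is an assumption, and it is exactly
# the (x, y) vs (y, x) question raised in the comments in this file.
def _flatten_policy_sketch(action_dists):
    imsize, _, num_actions = action_dists.shape
    return action_dists.reshape(imsize * imsize, num_actions)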
def optimal_agent_test(self, agent):
    grid = ['XXXXXXXXX',
            'X9X6XA  X',
            'X X X   X',
            'X      2X',
            'XXXXXXXXX']
    n, s, e, w, stay = self.all_actions
    mdp = GridworldMdp(grid, living_reward=-0.1)
    env = Mdp(mdp)
    agent.set_mdp(mdp)
    start_state = mdp.get_start_state()

    # Action distribution
    action_dist = agent.get_action_distribution(start_state)
    self.assertEqual(action_dist, Distribution({s: 1}))

    # Trajectory
    actions, _ = self.run_on_env(agent, env, gamma=0.95, episode_length=10)
    self.assertEqual(actions, [s, s, w, w, w, w, n, n, stay, stay])

    # Same thing, but with heavier discounting (gamma = 0.5)
    mdp = GridworldMdp(grid, living_reward=-0.001)
    env = Mdp(mdp)
    agent = agents.OptimalAgent(gamma=0.5, num_iters=20)
    agent.set_mdp(mdp)
    start_state = mdp.get_start_state()

    # Values
    # Inaccurate because we ignore the living reward and only run 20
    # iterations of value iteration, so only check to 2 places
    self.assertAlmostEqual(agent.value(start_state), 0.25, places=2)

    # Action distribution
    action_dist = agent.get_action_distribution(start_state)
    self.assertEqual(action_dist, Distribution({s: 1}))

    # Trajectory
    actions, reward = self.run_on_env(agent, env, gamma=0.5, episode_length=10)
    # Again an approximate comparison, since we don't consider living rewards
    self.assertAlmostEqual(reward, (4 - 0.0625) / 16, places=2)
    self.assertEqual(actions, [s, s, e, e, stay, stay, stay, stay, stay, stay])

    # Same thing, but with Boltzmann rationality
    agent = agents.OptimalAgent(beta=1, gamma=0.5, num_iters=20)
    agent.set_mdp(mdp)

    # Action distribution
    dist = agent.get_action_distribution(start_state).get_dict()
    nprob, sprob, eprob, wprob = dist[n], dist[s], dist[e], dist[w]
    for p in [nprob, sprob, eprob, wprob]:
        self.assertTrue(0 < p < 1)
    self.assertEqual(nprob, wprob)
    self.assertTrue(sprob > nprob)
    self.assertTrue(nprob > eprob)

    middle_state = (2, 3)
    dist = agent.get_action_distribution(middle_state).get_dict()
    nprob, sprob, eprob, wprob = dist[n], dist[s], dist[e], dist[w]
    for p in [nprob, sprob, eprob, wprob]:
        self.assertTrue(0 < p < 1)
    self.assertEqual(nprob, sprob)
    self.assertTrue(wprob > eprob)
    self.assertTrue(eprob > nprob)