Example #1
    def __init__(self):
        self.actions = ["up", "down", "left", "right"]
        self.num_actions = len(self.actions)
        self.grid_world = GridWorld()

        # initial state reward
        self.state_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.state_values[(i, j)] = 0  # set initial value to 0
        self.state_indices = {}
        k = 0
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.state_indices[(i, j)] = k  # assign a linear index to each state
                k += 1

        self.num_states = len(self.state_values)
        self.state_values_vec = np.zeros((self.num_states))
        self.rewards = np.zeros((self.num_states))

        self.state_transition_prob = np.zeros((self.num_states, self.num_actions, self.num_states))
        #self.get_observation_by_random(20000)

        self.discount = 0.99
Example #2
class LevelGrid(Level):
    """
  level with grid
  """
    def __init__(self, screen, screen_size, mic=None):

        from grid_world import GridWorld

        # parent class init
        super().__init__(screen, screen_size)

        # new vars
        self.mic = mic

        # create gridworld
        self.grid_world = GridWorld(self.screen_size, self.color_bag, self.mic)

        # setup
        self.setup_level()

        # append interactable
        #self.interactables.append(self.grid_world)
        self.interactable_dict.update({'grid_world': self.grid_world})

        # sprites
        self.all_sprites.add(self.grid_world.wall_sprites,
                             self.grid_world.move_wall_sprites)

    def setup_level(self):
        """
    setup level
    """

        # set walls
        self.setup_wall_edge()

        # create walls
        self.grid_world.create_walls()

    def setup_wall_edge(self):
        """
    limit edges
    """

        # set walls
        self.grid_world.wall_grid[:, 0] = 1
        self.grid_world.wall_grid[:, -1] = 1
        self.grid_world.wall_grid[0, :] = 1
        self.grid_world.wall_grid[-1, :] = 1
Example #3
    def __init__(self):
        self.states = []  # record position and action taken at the position
        self.actions = ["up", "down", "left", "right"]
        self.grid_world = GridWorld()
        self.isEnd = self.grid_world.isEnd
        self.lr = 0.2
        self.exp_rate = 0.3
        self.decay_gamma = 0.9

        # initial Q values
        self.Q_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.Q_values[(i, j)] = {}
                for a in self.actions:
                    self.Q_values[(i, j)][a] = 0  # Q_values is a dict of dicts
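For reference, a table initialized like this is what a standard tabular Q-learning update operates on. The short sketch below is the textbook update rule, shown only for comparison; it is not the backward reward propagation used by the full agent in Example #33, and the learning-rate and gamma defaults are illustrative assumptions.

def q_update(Q_values, state, action, reward, next_state, lr=0.2, gamma=0.9):
    """One tabular Q-learning step on a dict-of-dicts table like the one above."""
    best_next = max(Q_values[next_state].values())            # greedy bootstrap value
    td_target = reward + gamma * best_next
    Q_values[state][action] += lr * (td_target - Q_values[state][action])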
Example #4
def run_grid_world():
    world = GridWorld('simple_grid.txt', -0.01, include_treasure=True)
    print('# of states: {}'.format(len(world.all_states)))

    # uncomment this after the transition matrix has been saved
    #transitions = GridWorld.read_transition_matrix_file('simple_grid_t_matrix.csv')
    transitions = world.get_transition_matrix(
        save_to='simple_grid_t_matrix.csv')
    reward = world.get_reward_matrix()

    run_value_iteration_grid_world(world, transitions, reward)
    run_policy_iteration_grid_world(world, transitions, reward)
    compare_different_gamma_policies(world, transitions, reward)
    get_graphs_and_time_stats_grid_world_mdp(world, transitions)
    find_converged_policy(world, transitions)

    run_q_learning_grid_world()
    get_graph_q_learning()
Example #5
	def objectDist(self, start, obj):
		"""
			Return cost of going to some object
		"""

		# Generate a grid that only cares about 
		# getting to the input 'obj'
		objectGrid = copy.deepcopy(self.grid)
		objValue = objectGrid.objects[obj] 
		objectGrid.objects.clear()
		objectGrid.objects[obj] = objValue

		# Simulate GridWorld where only goal is obj
		objectWorld = GridWorld(objectGrid, [10])
		startCoord = self.grid.objects[start]

		# Count num. of steps to get to obj from start, that is the distance
		dist = objectWorld.simulate(objectWorld.coordToScalar(startCoord))

		return dist 
Example #6
    def objectDist(self, start, obj):
        """
			Return cost of going to some object
		"""

        # Generate a grid that only cares about
        # getting to the input 'obj'
        objectGrid = copy.deepcopy(self.grid)
        objValue = objectGrid.objects[obj]
        objectGrid.objects.clear()
        objectGrid.objects[obj] = objValue

        # Simulate GridWorld where only goal is obj
        objectWorld = GridWorld(objectGrid, [10])
        startCoord = self.grid.objects[start]

        # Count num. of steps to get to obj from start, that is the distance
        dist = objectWorld.simulate(objectWorld.coordToScalar(startCoord))

        return dist
Example #7
    def __init__(self, screen, screen_size, mic=None):

        from grid_world import GridWorld

        # parent class init
        super().__init__(screen, screen_size)

        # new vars
        self.mic = mic

        # create gridworld
        self.grid_world = GridWorld(self.screen_size, self.color_bag, self.mic)

        # setup
        self.setup_level()

        # append interactable
        self.interactables.append(self.grid_world)

        # sprites
        self.all_sprites.add(self.grid_world.wall_sprites,
                             self.grid_world.move_wall_sprites)
Example #8
class IterativePolicyEvaluation(object):
    def __init__(self):
        self.grid = GridWorld()
        self.rewards = rewards
        self.actions = actions

    def initialize_V(self):
        V = {}
        S = []
        for i in range(self.grid.rows):
            for j in range(self.grid.cols):
                V[(i, j)] = 0
                if (i, j) not in [death, goal]:
                    S.append((i, j))
        self.V = V
        self.S = S
        self.dynamic_p = 1.0 / len(S)

    def value_step(self):
        diff = 0
        old_V = self.V
        for s in self.S:
            new_v = 0
            old_v = old_V[s]
            for a in self.actions:
                self.grid.set_state(s)
                self.grid.move(a)
                s_new = self.grid.current_state()
                r = self.rewards.get(s_new, 0)
                if self.grid.game_over(s_new):
                    new_v += self.dynamic_p * r
                    break
                else:
                    new_v += self.dynamic_p * (r + gamma * self.V[s_new])
            self.V[s] = new_v
            diff = max(diff, np.abs(old_v - new_v))
        return diff

    def policy_evaluation(self):
        self.initialize_V()
        while True:
            diff = self.value_step()
            if diff < delta:
                self.print_values()
                return None

    def print_values(self):
        for i in range(self.grid.rows):
            print("------------------------")
            for j in range(self.grid.cols):
                v = self.V.get((i, j), 0)
                if v >= 0:
                    print(" %.2f|" % v, end="")
                else:
                    print("%.2f|" % v, end="")
            print("")
        print("------------------------")
Example #9
    def buildBiasEngine(self):
        """ 
			Simulates the GridWorlds necessary to conduct inference.
		"""

        # Builds/solves gridworld for each objectGrid, generating policies for
        # each object in grid. One for going only to A, another just for B, etc.
        for i in range(len(self.objectGrids)):

            simsBuffer = list()
            for j in range(len(self.objectGrids[0])):
                simsBuffer.append(
                    GridWorld(self.objectGrids[i][j], [10], self.discount,
                              self.tau, self.epsilon))

            self.sims.append(simsBuffer)
Example #10
def test_gridworld_q_learning():
    np.random.seed(0)

    N = 5
    goal_pos = np.array([[N-1, N-1]])
    human_pos = np.array([[N-1, 0]])
    human_radius = 2

    grid = np.ones((N, N), dtype=float) * -1
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_human_radius_reward(grid, human_pos, human_radius, -10)

    env = GridWorld(
        dimensions=(N, N),
        init_pos=(0, 0),
        goal_pos=goal_pos,
        reward_grid=grid,
        human_pos=human_pos,
        action_success_rate=0.8,
        render=True,
    )

    mdp_algo = q_learning(env.transition, env.reward, gamma=0.99)
    mdp_algo.run()
    policy = StochasticGreedyPolicy(
        env.action_space(), mdp_algo, env.transition)

    # plot results
    R = env.reward.reshape((N, N)).T
    V = np.asarray(mdp_algo.V).reshape((N, N)).T

    plot_grid_map(R, "Reward", cmap=plt.cm.Reds)
    plot_grid_map(V, "Value Function", cmap=plt.cm.Blues)
    plt.show()

    obs, rew, done, info = env.reset()
    while not done:
        act = policy.get_action(obs)
        obs, rew, done, info = env.step(act)
        time.sleep(0.2)

    env.close()
Example #11
def test_gridworld_value_iteration():
    np.random.seed(0)

    N = 10
    goal_pos = np.array([[N-1, N-1], [N-1, N-2]])
    human_pos = np.array([[N//2, N//2], [N-1, 0]])
    human_radius = 3

    grid = np.zeros((N, N), dtype=float)
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_human_radius_reward(grid, human_pos, human_radius, -10)

    env = GridWorld(
        dimensions=(N, N),
        init_pos=(0, 0),
        goal_pos=goal_pos,
        reward_grid=grid,
        human_pos=human_pos,
        action_success_rate=1,
        render=True,
    )

    mdp_algo = value_iteration(env.transition, env.reward, gamma=0.99)
    policy = EpsGreedyPolicy(env.action_space(), mdp_algo)

    # plot results
    R = env.reward.reshape((N, N)).T
    V = np.asarray(mdp_algo.V).reshape((N, N)).T

    plot_grid_map(R, "Reward", cmap=plt.cm.Reds)
    plot_grid_map(V, "Value Function", cmap=plt.cm.Blues)
    plot_policy(policy, (N, N), "Policy", values=V, cmap=plt.cm.Blues)
    plt.show()

    obs, rew, done, info = env.reset()
    while not done:
        act = policy.get_action(obs)
        obs, rew, done, info = env.step(act)
        time.sleep(0.2)

    env.close()
Example #12
    def __init__(self):
        self.actions = ["up", "down", "left", "right"]
        self.num_actions = len(self.actions)
        self.grid_world = GridWorld()

        # initial state reward
        self.state_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.state_values[(i, j)] = 0  # set initial value to 0
        self.state_indices = {}
        k = 0
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.state_indices[(i, j)] = k  # assign a linear index to each state
                k += 1
        
        self.num_states = len(self.state_values)
        self.state_values_vec = np.zeros((self.num_states))
        self.rewards = np.zeros((self.num_states))

        self.state_transition_prob = np.zeros((self.num_states, self.num_actions, self.num_states))
        for state in self.state_values.keys():
            self.rewards[self.state_indices[state]] = self.giveReward(state)
            for action in self.actions:
                if action == "up":
                    action_probs = zip(["up", "left", "right"], [0.8, 0.1, 0.1])
                if action == "down":
                    action_probs = zip(["down", "left", "right"], [0.8, 0.1, 0.1])
                if action == "left":
                    action_probs = zip(["left", "up", "down"], [0.8, 0.1, 0.1])
                if action == "right":
                    action_probs = zip(["right", "up", "down"], [0.8, 0.1, 0.1])
                for a, p in action_probs:
                    nxtState = self.nxtPosition(state, a)
                    self.state_transition_prob[self.state_indices[state], self.actions.index(a), self.state_indices[nxtState]] += p
        
        self.discount = 0.99
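Given the state_transition_prob tensor (num_states x num_actions x num_states) and the per-state rewards vector built above, planning can be done in a fully vectorized way. Below is a hedged sketch of value iteration against that layout; the function name and the commented usage line are illustrative assumptions, and the sketch treats the reward as a function of the current state, matching how self.rewards is filled above.

import numpy as np

def run_value_iteration(P, R, discount=0.99, tol=1e-6):
    """Vectorized value iteration for P of shape (S, A, S) and state rewards R of shape (S,)."""
    num_states, num_actions, _ = P.shape
    V = np.zeros(num_states)
    while True:
        # Q[s, a] = R[s] + discount * sum_s' P[s, a, s'] * V[s']
        Q = R[:, None] + discount * P.dot(V)      # shape (S, A)
        V_new = Q.max(axis=1)
        if np.max(np.abs(V_new - V)) < tol:
            return V_new, Q.argmax(axis=1)        # state values and greedy action indices
        V = V_new

# Hypothetical usage with an agent constructed as above:
# values, greedy_actions = run_value_iteration(
#     agent.state_transition_prob, agent.rewards, agent.discount)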
Example #13
def main():
    logging.basicConfig(level=logging.INFO)
    action_probs, special_nodes, grid_dims, start_pos, default_r = configure_world_options(
        option=WORLD_OPTION)

    grid_world = GridWorld(start_pos=start_pos,
                           action_probs=action_probs,
                           grid_dims=grid_dims,
                           default_reward=default_r,
                           special_nodes=special_nodes,
                           heuristic=True)
    # grid_world.print_info()
    grid_world.visualize_heuristic(store_path=STORE_PATH)

    # UNINFORMED SEARCH

    # Breadth-First Search
    breadth_first_agent = BreadthlyCooper(world=grid_world, debug=True)
    breadth_first_agent.solve()
    breadth_first_agent.visualize(store_path=STORE_PATH)

    # Branch-And-Bound Search
    branch_and_bound_agent = BreadthlyCooper(world=grid_world,
                                             debug=True,
                                             b_bound=True)
    branch_and_bound_agent.solve()
    branch_and_bound_agent.visualize(store_path=STORE_PATH)

    # Depth-First Search
    depth_first_agent = JohnnyDeppth(world=grid_world, debug=True)
    depth_first_agent.solve()
    depth_first_agent.visualize(store_path=STORE_PATH)

    # INFORMED SEARCH

    # Greedy-Best-First Search
    best_first_agent = AStarIsClimbing(world=grid_world, debug=True, alg=1)
    best_first_agent.solve()
    best_first_agent.visualize(store_path=STORE_PATH)

    # Hill-Climbing Search
    hill_climbing_agent = AStarIsClimbing(world=grid_world, debug=True, alg=2)
    hill_climbing_agent.solve()
    hill_climbing_agent.visualize(store_path=STORE_PATH)

    # A* Search
    a_star_agent = AStarIsClimbing(world=grid_world, debug=True)
    a_star_agent.solve()
    a_star_agent.visualize(store_path=STORE_PATH)

    # ITERATIVE PLANNING ALGORITHMS

    q_iter_agent = QIteration(world=grid_world,
                              debug=True,
                              gamma=GAMMA,
                              error=ERROR)
    q_iter_agent.solve()
    q_iter_agent.visualize(store_path=STORE_PATH)

    v_iter_agent = ValueIteration(world=grid_world,
                                  debug=True,
                                  gamma=GAMMA,
                                  error=ERROR)
    v_iter_agent.solve()
    v_iter_agent.visualize(store_path=STORE_PATH)

    p_iter_agent = PolicyIteration(world=grid_world,
                                   debug=True,
                                   gamma=GAMMA,
                                   error=ERROR,
                                   policy_shift_max=10)
    p_iter_agent.solve()
    p_iter_agent.visualize(store_path=STORE_PATH)

    # LEARNING ALGORITHMS

    exp_start_mc = OnMonteCarlo(world=grid_world,
                                debug=False,
                                gamma=0.9,
                                epsilon=0.3,
                                num_episodes=1000,
                                explore_starts=False,
                                max_steps=100)
    exp_start_mc.solve()
    exp_start_mc.visualize(store_path=STORE_PATH)

    exp_start_mc = OffMonteCarlo(world=grid_world,
                                 debug=False,
                                 gamma=0.9,
                                 epsilon=0.3,
                                 num_episodes=1000,
                                 max_steps=100)
    exp_start_mc.solve()
    exp_start_mc.visualize(store_path=STORE_PATH)

    qq7 = QLearning(world=grid_world,
                    debug=True,
                    gamma=0.9,
                    epsilon=0.3,
                    num_episodes=1000,
                    step=ALPHA)
    qq7.solve()
    qq7.visualize(store_path=STORE_PATH)

    sarsa = SARSA(world=grid_world,
                  debug=False,
                  gamma=0.9,
                  epsilon=0.3,
                  num_episodes=1000,
                  expected=False,
                  step=ALPHA)
    sarsa.solve()
    sarsa.visualize(store_path=STORE_PATH)

    e_sarsa = SARSA(world=grid_world,
                    debug=False,
                    gamma=0.9,
                    epsilon=0.3,
                    num_episodes=1000,
                    expected=True,
                    step=ALPHA)
    e_sarsa.solve()
    e_sarsa.visualize(store_path=STORE_PATH)
Example #14
import warnings
warnings.filterwarnings("ignore")

ITER = 30
EPISODES = 2000

REWARD_MATRIX = []
FIDELITY_MATRIX = []
ENV_LIST = []
AGENT_LIST = []

for i in range(ITER):
    reward_list = []
    fidelity_list = []

    env = GridWorld(shape=(3, 5), reward=(100, 0, 200), num_of_w=1, num_of_p=1, num_of_r=1, num_of_steps=10)

    Agent = DoubleQNetworkAgent(
        n_obs=env.state_space - 1,
        n_action=env.action_space,
        units_layer=(8, 8),
        learning_rate=0.0025,
        name='dqn_' + str(i),
        gamma=0.9,
        buffer_size=500, batch_size=10, target_change_step=10, min_buffer_size=100,
        load_path=None,
        save_path=None
    )

    for episode in range(EPISODES):
Example #15
from grid_world import GridWorld
game = GridWorld(size=4, mode='static')

print(game.display())
game.makeMove('d')
game.makeMove('d')
game.makeMove('l')
print(game.display())
print(game.reward())
Example #16
        print(']')


CORRECT_ACTION_PROB = 1.0  # probability of correctly executing the chosen action
GAMMA = 1.0  # discount factor
#CORRECT_ACTION_PROB = 0.8  # probability of correctly executing the chosen action
#GAMMA = 0.98  # discount factor

np.random.seed(0)

dimensions = (6, 6)
num_obstacles = 6
goal_state = (5, 5)

# Instantiating the grid world
grid_world = GridWorld(dimensions, num_obstacles, goal_state,
                       CORRECT_ACTION_PROB, GAMMA)

# Testing policy evaluation
print(
    'Evaluating random policy, except for the goal state, where policy always executes stop:'
)
policy = random_policy(grid_world)
policy[goal_state[0], goal_state[1], STOP] = 1.0
policy[goal_state[0], goal_state[1],
       UP:NUM_ACTIONS] = np.zeros(NUM_ACTIONS - 1)
initial_value = np.zeros(dimensions)
value = policy_evaluation(grid_world, initial_value, policy)
print_value(value)
print_policy(policy)
print('----------------------------------------------------------------\n')
Example #17
class PolicyIteration(object):
    def __init__(self):
        self.grid = GridWorld()
        self.rewards = rewards
        self.actions = actions

    def initialize_V(self):
        V = {}
        S = []
        for i in range(self.grid.rows):
            for j in range(self.grid.cols):
                V[(i, j)] = 0
                if (i, j) not in [death, goal]:
                    S.append((i, j))
        self.V = V
        self.S = S
        self.dynamic_p = 1.0 / len(S)

    def initialize_P(self):
        P = {}
        for i in range(self.grid.rows):
            for j in range(self.grid.cols):
                if (i, j) not in [death, goal]:
                    P[(i, j)] = np.random.choice(self.actions)
        self.P = P

    def value_step(self):
        diff = 0
        old_V = self.V
        for s in self.S:
            new_v = 0
            old_v = old_V[s]
            for a in self.actions:
                self.grid.set_state(s)
                self.grid.move(a)
                s_new = self.grid.current_state()
                r = self.rewards.get(s_new, 0)
                new_v += self.dynamic_p * (r + gamma * self.V[s_new])
            self.V[s] = new_v
            diff = max(diff, np.abs(old_v - new_v))
        return diff

    def policy_evaluation(self):
        while True:
            diff = self.value_step()
            if diff < delta:
                return None

    def improvement_step(self):
        policy_stable = True
        for s in self.S:
            old_action = self.P[s]
            action_values = []
            for a in self.actions:
                self.grid.set_state(s)
                self.grid.move(a)
                s_new = self.grid.current_state()
                r = self.rewards.get(s_new, 0)
                new_v = r + gamma * self.V[s_new]
                action_values.append(new_v)

            idx = np.argmax(action_values)
            new_action = self.actions[idx]
            self.P[s] = new_action
            if old_action != new_action:
                policy_stable = False

        return policy_stable

    def policy_iteration(self):
        self.initialize_V()
        self.initialize_P()
        print("Initial Policy:")
        self.print_policy()
        while True:
            self.policy_evaluation()
            if self.improvement_step():

                print("Final Policy:")
                self.print_policy()

                print("Final Values:")
                self.print_values()
                return None

    def print_values(self):
        for i in range(self.grid.rows):
            print("------------------------")
            for j in range(self.grid.cols):
                v = self.V.get((i, j), 0)
                if v >= 0:
                    print(" %.2f|" % v, end="")
                else:
                    print("%.2f|" % v, end="")
            print("")
        print("------------------------")

    def print_policy(self):
        for i in range(self.grid.rows):
            print("---------------------------")
            for j in range(self.grid.cols):
                a = self.P.get((i, j), '#')
                print("  %s  |" % a, end="")
            print("")
        print("------------------------")
Example #18
def test_feature_gridworld_maxent_irl():
    np.random.seed(0)

    # env
    N = 15

    init_pos = np.zeros((N, N), dtype=float)
    for i, j in product(range(N // 2 + 2, N), range(N // 2 + 2, N)):
        init_pos[i, j] = i**2 + j**2
    init_pos /= np.sum(init_pos)
    goal_pos = np.array([[n, 0] for n in range(N)])

    grid = np.zeros((N, N), dtype=float)
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_feature_boundary_reward(
        grid,
        boundary_axis=0,
        boundary_value=N // 2,
        reward=-10,
        exp_constant=0.2,
    )

    plot_grid_map(init_pos.T,
                  "Initial Position Distribution",
                  cmap=plt.cm.Blues)
    plot_grid_map(grid.T, "Reward (Ground Truth)", cmap=plt.cm.Reds)

    env = GridWorld(
        dimensions=(N, N),
        init_pos=init_pos,
        goal_pos=goal_pos,
        reward_grid=grid,
        action_success_rate=1,
        render=False,
    )

    # learn a policy
    mdp_algo = value_iteration(env.transition, env.reward, gamma=0.99)
    policy = StochasticGreedyPolicy(env.action_space(), mdp_algo,
                                    env.transition)

    # roll out trajectories
    dataset = collect_trajectories(policy=policy,
                                   env=env,
                                   num_trajectories=20,
                                   maxlen=N * 2)
    plot_dataset_distribution(dataset, (N, N), "Dataset State Distribution")

    # IRL feature map
    feature_map = [
        env._feature_map(s) for s in range(env.observation_space().n)
    ]
    feature_map = np.array(feature_map)

    # IRL
    me_irl = MaxEntIRL(observation_space=env.observation_space(),
                       action_space=env.action_space(),
                       transition=env.transition,
                       goal_states=env.goal_states,
                       dataset=dataset,
                       feature_map=feature_map,
                       max_iter=10,
                       lr=0.1,
                       anneal_rate=0.9)
    Rprime = me_irl.train()
    Rprime = Rprime.reshape((N, N)).T

    # plot results
    plot_grid_map(Rprime, "Reward (IRL)", cmap=plt.cm.Blues)
    plt.show()
Example #19
def run_q_learning_grid_world():
    world = GridWorld('simple_grid.txt', -0.01, include_treasure=False)
    n_episodes = 500000
    how_often = n_episodes / 500

    stats = IterationStats('stats/ql_simple_grid.csv', dims=5)

    def on_update(state, action, next_state, q_learner):
        #print('[{},{}] - {} -> [{},{}]'.format(state.x, state.y, action[0], next_state.x, next_state.y))
        pass

    def on_episode(episode, time, q_learner, q):
        world.print_policy(print, q_learner.get_policy())
        stats.save_iteration(episode, time,
                             numpy.nanmean(numpy.nanmax(q, axis=0)), q)
        #time.sleep(1)

    for state in world.get_states():
        if state.tile_type == GridWorldTile.GOAL:
            goal_state = state
            break

    def initialize_toward_goal(state: GridWorldTile):
        actions = state.get_actions()
        if len(actions) == 0:
            return []
        diff_x = goal_state.x - state.x
        diff_y = goal_state.y - state.y
        best_value = 0.1
        if len(actions) == 5 and actions[4][0].startswith('get treasure'):
            best_action = actions[4][0]
        elif abs(diff_x) >= abs(diff_y):
            if diff_x > 0:
                best_action = 'move east'
            else:
                best_action = 'move west'
        else:
            if diff_y < 0:
                best_action = 'move north'
            else:
                best_action = 'move south'
        values = [-0.1] * len(actions)
        for i, action in enumerate(actions):
            if action[0] == best_action:
                values[i] = best_value
        return values

    gamma = 0.99
    q_l = QLearning(world,
                    0.5,
                    0.05,
                    gamma,
                    on_update=on_update,
                    on_episode=on_episode,
                    initializer=initialize_toward_goal,
                    start_at_0=True,
                    alpha=0.1,
                    every_n_episode=how_often)
    stats.start_writing()
    q_l.run(n_episodes)
    stats.done_writing()
    world.print_policy(print, q_l.get_policy())
Example #20
def costly_walk_builder():
	rewards = {(i, j): -0.5 for i in range(4) for j in range(3)}
	rewards[(3, 2)] = 1.
	rewards[(3, 1)] = -1.
	env = GridWorld(rewards)
	return env
Example #21
from grid_world import GridWorld

gw = GridWorld(num_rows=4, num_columns=4)
gw.current_state = 14
print(gw.step('RIGHT'))
Example #22
def main():
    # Test world
    WORLD_OPTION = 2
    USE_DYNAMIC_DEFAULT = True

    STORE_PATH = '/Users/djordje/ML/personal/RL/rl_projects/grid_world_playground/experimentation/test/'

    # Planning ground truth params
    ERROR = 0.00001

    # Learning Agent Params
    GAMMA = 0.9
    EPSILON = 0.3
    ALPHA = 0.01
    DECAY = False

    NUM_EP = 10000
    MAX_STEPS = 1000

    # Debug Params
    DEBUG_FREQ = 100

    STATES_OF_INTEREST = {(5, 0), (5, 2), (3, 2), (3, 3), (3, 4), (3, 5),
                          (4, 5), (0, 2), (1, 3)}

    # Agent tags: ['Q', 'SARSA', 'ESARSA', 'MC-ES', 'MC-Soft', 'MC-ES-Soft']
    AGENTS_TO_TEST = ['Q', 'ESARSA', 'MC-ES', 'MC-ES-Soft']

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    action_probs, special_nodes, grid_dims, start_pos, default_r = configure_world_options(
        option=WORLD_OPTION)
    if USE_DYNAMIC_DEFAULT is True:
        default_r = -1 / MAX_STEPS

    grid_world = GridWorld(start_pos=start_pos,
                           action_probs=action_probs,
                           grid_dims=grid_dims,
                           default_reward=default_r,
                           special_nodes=special_nodes,
                           heuristic=True)

    ground_truth_u, ground_truth_ret = obtain_optimal_values(
        world=grid_world,
        error=ERROR,
        gamma=GAMMA,
        store_path=STORE_PATH,
        states_of_interest=STATES_OF_INTEREST)

    ground_truth_u_plt = plot_ready_utils(optimal_utilities=ground_truth_u,
                                          debug_freq=DEBUG_FREQ,
                                          num_episodes=NUM_EP)

    ground_truth_ret_plt = plot_ready_returns(optimal_returns=ground_truth_ret,
                                              debug_freq=DEBUG_FREQ,
                                              num_episodes=NUM_EP)

    comparison_utilities = {}
    comparison_returns = {}

    for agent_tag in AGENTS_TO_TEST:
        utility_arg = {}

        agent = get_agent(tag=agent_tag,
                          world=grid_world,
                          gamma=GAMMA,
                          max_steps=MAX_STEPS,
                          num_ep=NUM_EP,
                          epsilon=EPSILON,
                          alpha=ALPHA,
                          decay=DECAY,
                          debug_freq=DEBUG_FREQ)

        logger.info('%s: Running.\n', agent_tag)

        agent.solve()
        if STORE_PATH is not None:
            agent.visualize(store_path=STORE_PATH)

        utilities = agent.export_utility_history(
            state_subset=STATES_OF_INTEREST)
        returns = agent.export_returns_history()

        detailed_tag = agent.get_model_name()
        comparison_utilities[agent_tag] = utilities
        comparison_returns[agent_tag] = returns

        utility_arg[detailed_tag] = utilities

        logger.info('%s Visualizing utility convergence.\n', agent_tag)
        visualize_utilities(ground_truth_dict=ground_truth_u_plt,
                            utility_history_dicts=utility_arg,
                            store_path=STORE_PATH,
                            tag=detailed_tag + '_state_plt')

    logger.info('Visualizing utility convergence comparison.\n')
    visualize_utilities(ground_truth_dict=ground_truth_u_plt,
                        utility_history_dicts=comparison_utilities,
                        store_path=STORE_PATH,
                        tag='agent_utility_comparison')

    logger.info('Visualizing returns convergence comparison.\n')
    visualize_returns(ground_truth=ground_truth_ret_plt,
                      returns_dict=comparison_returns,
                      store_path=STORE_PATH,
                      tag='agent_returns_comparison')
Example #23
        print()
    print(line)


def show_policy(V):
    line = '-' * 10
    print(line)
    for i in range(3):
        for j in range(4):
            val = V.get((i, j), " ")
            print(val, ' ', end='')
        print()
    print(line)


grid_world = GridWorld.default_game()

states = grid_world.all_states()

# All possible actions
ALL_ACTIONS = ['U', 'D', 'L', 'R']

# creates the initial Value function
V = {
    state: random.random() if not grid_world.is_terminal(state) else 0
    for state in states
}

# creates the initial policy
policy = {state: random.choice(ALL_ACTIONS) for state in grid_world._actions}
Example #24
def test_gridworld_maxent_irl():
    np.random.seed(0)

    # env
    N = 10
    goal_pos = np.array([[N - 1, N - 1]])
    human_pos = np.array([[3, 3]])
    human_radius = 2

    grid = np.zeros((N, N), dtype=float)
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_human_radius_reward(grid, human_pos, human_radius, -10)

    env = GridWorld(
        dimensions=(N, N),
        init_pos=(0, 0),
        goal_pos=goal_pos,
        reward_grid=grid,
        human_pos=human_pos,
        action_success_rate=1,
        render=False,
    )

    # learn a policy
    mdp_algo = value_iteration(env.transition, env.reward, gamma=0.99)
    # mdp_algo = q_learning(env.transition, env.reward, gamma=0.99)
    # policy = GreedyPolicy(env.action_space(), mdp_algo)
    # policy = EpsGreedyPolicy(env.action_space(), mdp_algo, epsilon=0.1)
    policy = StochasticGreedyPolicy(env.action_space(), mdp_algo,
                                    env.transition)

    V = np.asarray(mdp_algo.V).reshape((N, N)).T
    R = env.reward.reshape((N, N)).T
    plot_grid_map(R, "Reward (Ground Truth)", cmap=plt.cm.Reds)
    plot_grid_map(V, "Value Function", cmap=plt.cm.Blues)

    # roll out trajectories
    dataset = collect_trajectories(policy=policy,
                                   env=env,
                                   num_trajectories=200,
                                   maxlen=N * 2)
    plot_dataset_distribution(dataset, (N, N), "Dataset State Distribution")

    # feature map
    feature_map = [
        env._feature_map(s) for s in range(env.observation_space().n)
    ]
    feature_map = np.array(feature_map)

    # IRL
    me_irl = MaxEntIRL(observation_space=env.observation_space(),
                       action_space=env.action_space(),
                       transition=env.transition,
                       goal_states=env.goal_states,
                       dataset=dataset,
                       feature_map=feature_map,
                       max_iter=10,
                       lr=0.1,
                       anneal_rate=0.9)
    Rprime = me_irl.train()
    Rprime = Rprime.reshape((N, N)).T

    # plot results
    plot_grid_map(Rprime, "Reward (IRL)", print_values=True, cmap=plt.cm.Blues)
    plt.show()
Example #25
class TestGridWorld(unittest.TestCase):

    def setUp(self):
        height = 6
        width = 5
        self.height = height
        self.width = width
        self.grid_world = GridWorld(width, height, (0,0), (width-1 , height-1))

    def test_grid(self):
        a = [[False for x in range(self.width)] for x in range(self.height)]
        self.assertEqual(self.grid_world.grid, a)
    
    def test_get_neighbors(self):
        a = [(1,0), (0,1)]
        self.assertEqual(self.grid_world.get_neighbors(0,0), a)
        
        a = [(1,0),(2,1),(1,2),(0,1)]
        self.assertEqual(self.grid_world.get_neighbors(1,1), a)
        
    def test_build_on_endpoint(self):
        endpoint = self.grid_world.endpoint
        self.assertFalse(self.grid_world.build_tower(endpoint[0], endpoint[1]))

    def test_build_out_of_bounds(self):
        self.assertFalse(self.grid_world.build_tower(0, self.height+1))
        self.assertFalse(self.grid_world.build_tower(self.width+1, 0))
    
    def test_build_on_spawnpoint(self):
        spawnpoint = self.grid_world.spawnpoint
        self.assertFalse(self.grid_world.build_tower(spawnpoint[0], spawnpoint[1]))

    def test_build_on_tower(self):
        self.assertTrue(self.grid_world.build_tower(2,1))
        self.assertFalse(self.grid_world.build_tower(2,1))

    def test_block_path1(self):#try to build around the endpoint
        endpoint = self.grid_world.endpoint
        self.assertTrue(self.grid_world.build_tower(endpoint[0]-1, endpoint[1]))
        self.assertTrue(self.grid_world.build_tower(endpoint[0]-1, endpoint[1]-1))    
        self.assertFalse(self.grid_world.build_tower(endpoint[0], endpoint[1]-1))

    def test_block_path2(self):#try to build through the center
        self.assertTrue(self.grid_world.build_tower(0, 1))
        self.assertTrue(self.grid_world.build_tower(1, 1))
        self.assertTrue(self.grid_world.build_tower(2, 1))
        self.assertTrue(self.grid_world.build_tower(3, 1))
        self.assertFalse(self.grid_world.build_tower(4, 1))

    def test_best_path(self):
        self.grid_world.build_tower(0, 1)
        self.grid_world.build_tower(1, 1)
        self.grid_world.build_tower(2, 1)
        self.grid_world.build_tower(3, 1)

        self.grid_world.build_tower(4, 3)
        self.grid_world.build_tower(3, 3)
        self.grid_world.build_tower(2, 3)
        self.grid_world.build_tower(1, 3)

        self.grid_world.build_tower(0, 5)
        self.grid_world.build_tower(1, 5)
        self.grid_world.build_tower(2, 5)
        self.grid_world.build_tower(3, 5)

        bestPath = {(1, 2): (0, 2), (3, 2): (2, 2), (0, 0): (1, 0), (2, 0): (3, 0), (4, 5): None, (4, 1): (4, 2), (4, 4): (4, 5), (2, 2): (1, 2), (1, 4): (2, 4), (0, 2): (0, 3), (3, 0): (4, 0), (0, 4): (1, 4), (1, 0): (2, 0), (4, 2): (3, 2), (0, 3): (0, 4), (3, 4): (4, 4), (2, 4): (3, 4), (4, 0): (4, 1)}


        self.assertEqual(self.grid_world.get_path(self.grid_world.grid), bestPath)

    def test_diagonal_movement(self):
        self.assertTrue(self.grid_world.build_tower(0, 1))
        self.assertFalse(self.grid_world.build_tower(1, 0))
Example #26
def main(cfg):
    pygame.init()

    # create the font
    sysfont = pygame.font.SysFont(None, 40)
    screen = pygame.display.set_mode(WINDOW_SIZE)
    pygame.display.set_caption("Grid World")

    done = False

    clock = pygame.time.Clock()

    # initialize the grid world
    grid_env = GridWorld()  # initialize the grid world environment
    ini_state = grid_env.start_pos  # initial state (the agent's start position)
    agent = QLearningAgent(
        epsilon=cfg["agent"]["epsilon"],
        epsilon_decay_rate=cfg["agent"]["epsilon_decay_rate"],
        actions=np.arange(4),
        observation=ini_state)  # Q-learning agent

    nb_episode = cfg["nb_episode"]  # number of episodes
    save_interval = cfg["save_interval"]
    result_dir = cfg["result_dir"]
    max_step = 1
    rewards = []  # store rewards for evaluation
    is_end_episode = False  # whether the agent has reached the goal

    step = 0
    # time.sleep(30)

    for episode in range(nb_episode):
        print("episode:", episode)
        episode_reward = []  # cumulative reward for this episode
        step = 0
        while (is_end_episode is False and step < max_step):  # continue until the goal is reached
            action = agent.act()  # select an action
            state, reward, is_end_episode = grid_env.step(action)
            agent.observe(state, reward)  # observe the state and reward
            episode_reward.append(reward)

            screen.fill(BLACK)
            # draw the grid world
            draw_grid_world(grid_env.map, screen)
            # create a Surface with the rendered text
            step_str = sysfont.render("step:{}".format(step), False, WHITE)
            # draw the text
            screen.blit(step_str, (500, 50))
            clock.tick(1)
            step += 1

            # redraw the display
            pygame.display.flip()

        rewards.append(np.sum(episode_reward))  # store this episode's total reward
        state = grid_env.reset()  # reset the environment
        agent.observe(state)  # put the agent back at its initial position
        is_end_episode = False
        print("step:", step)
        agents = [agent]

        if episode % save_interval == 0:
            save_result(agents, episode, result_dir)

    pygame.quit()
Example #27
    def takeAction(self, action):
        position = self.grid_world.nxtPosition(action)
        # update GridWorld
        return GridWorld(state=position)
Example #28
    def __init__(self):
        self.grid = GridWorld()
        self.rewards = rewards
        self.actions = actions
Example #29
    def reset(self):
        self.states = []
        self.grid_world = GridWorld()
        self.isEnd = self.grid_world.isEnd
Example #30
def basic_env_builder():
	rewards = {(i, j): 0.0 for i in range(4) for j in range(3)}
	rewards[(3, 2)] = 1.
	rewards[(3, 1)] = -1.
	env = GridWorld(rewards)
	return env
Example #31
    def setUp(self):
        height = 6
        width = 5
        self.height = height
        self.width = width
        self.grid_world = GridWorld(width, height, (0, 0), (width - 1, height - 1))
Example #32
from grid_world import GridWorld
from numpy import Infinity
from utils import *

import random

grid_world = GridWorld.painful_game()

states = grid_world.all_states()

# creates the initial Q function
# state: [q value, number of visits]
Q = {}
for state in states:
    if not grid_world.is_terminal(state):
        for action in grid_world._actions[state]:
            Q.update({(state, action): [0, 0]})

policy = {}

for state in grid_world._actions:
    action = random.choice(ALL_ACTIONS)
    while action not in grid_world._actions[state]:
        action = random.choice(ALL_ACTIONS)
    policy.update({state: action})

gamma = 0.9

show_policy(policy)
'''
    The main loops of this process; they're stopped later
Example #33
class Agent:

    def __init__(self):
        self.states = []  # record position and action taken at the position
        self.actions = ["up", "down", "left", "right"]
        self.grid_world = GridWorld()
        self.isEnd = self.grid_world.isEnd
        self.lr = 0.2
        self.exp_rate = 0.3
        self.decay_gamma = 0.9

        # initial Q values
        self.Q_values = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.Q_values[(i, j)] = {}
                for a in self.actions:
                    self.Q_values[(i, j)][a] = 0  # Q_values is a dict of dicts

    def chooseAction(self):
        # choose action with most expected value
        mx_nxt_reward = 0
        action = ""

        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        else:
            # greedy action
            for a in self.actions:
                current_position = self.grid_world.state
                nxt_reward = self.Q_values[current_position][a]
                if nxt_reward >= mx_nxt_reward:
                    action = a
                    mx_nxt_reward = nxt_reward
            # print("current pos: {}, greedy aciton: {}".format(self.grid_world.state, action))
        return action

    def takeAction(self, action):
        position = self.grid_world.nxtPosition(action)
        # update GridWorld
        return GridWorld(state=position)

    def reset(self):
        self.states = []
        self.grid_world = GridWorld()
        self.isEnd = self.grid_world.isEnd

    def play(self, rounds=10):
        i = 0
        while i < rounds:
            # to the end of game back propagate reward
            if self.grid_world.isEnd:
                # back propagate
                reward = self.grid_world.giveReward()
                for a in self.actions:
                    self.Q_values[self.grid_world.state][a] = reward
                print("Game End Reward", reward)
                for s in reversed(self.states):
                    current_q_value = self.Q_values[s[0]][s[1]]
                    reward = current_q_value + self.lr * (self.decay_gamma * reward - current_q_value)
                    self.Q_values[s[0]][s[1]] = round(reward, 3)
                self.reset()
                i += 1
            else:
                action = self.chooseAction()
                # append trace
                self.states.append([(self.grid_world.state), action])
                print("current position {} action {}".format(self.grid_world.state, action))
                # by taking the action, it reaches the next state
                self.grid_world = self.takeAction(action)
                # mark is end
                self.grid_world.isEndFunc()
                print("nxt state", self.grid_world.state)
                print("---------------------")
                self.isEnd = self.grid_world.isEnd