Example #1
def make_mdp(mdp_class="grid", grid_dim=7):
    '''
    Returns:
        (MDP)
    '''
    # Grid/Hallway stuff.
    width, height = grid_dim, grid_dim
    upworld_goal_locs = [(i, width) for i in range(1, height+1)]

    four_room_goal_locs = [(width, height)] #, (width, 1), (1, height)] # (1, height - 2), (width - 2, height - 2), (width - 1, height - 1), (width - 2, 1)]
    four_room_goal_loc = four_room_goal_locs[0]

    # Taxi stuff.
    agent = {"x":1, "y":1, "has_passenger":0}
    passengers = [{"x":grid_dim / 2, "y":grid_dim / 2, "dest_x":grid_dim-2, "dest_y":2, "in_taxi":0}]
    walls = []

    # Trench stuff
    tr_agent = {"x": 1, "y": 1, "dx": 1, "dy": 0, "dest_x": grid_dim, "dest_y": grid_dim, "has_block": 0}
    blocks = [{"x": grid_dim, "y": 1}]
    lavas = [{"x": x, "y": y} for x, y in map(lambda z: (z + 1, (grid_dim + 1) / 2), range(grid_dim))]

    # Do grids separately to avoid making error-prone domains.
    if mdp_class == "four_room":
        mdp = FourRoomMDP(width=width, height=height, goal_locs=[four_room_goal_loc])
    else:
        mdp = {"upworld":GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=upworld_goal_locs),
            "chain":ChainMDP(num_states=grid_dim),
            "random":RandomMDP(num_states=50, num_rand_trans=2),
            "hanoi":HanoiMDP(num_pegs=grid_dim, num_discs=3),
            "taxi":TaxiOOMDP(width=grid_dim, height=grid_dim, agent=agent, walls=walls, passengers=passengers),
            "trench":TrenchOOMDP(width=grid_dim, height=3, agent=tr_agent, blocks=blocks, lavas=lavas)}[mdp_class]

    return mdp
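A brief usage sketch for make_mdp (my addition; it assumes the simple_rl imports this function relies on, e.g. GridWorldMDP and friends from simple_rl.tasks, are already in scope):

from simple_rl.agents import QLearningAgent, RandomAgent
from simple_rl.run_experiments import run_agents_on_mdp

# Build one of the supported MDPs and compare a learner against a random baseline.
mdp = make_mdp(mdp_class="four_room", grid_dim=7)
ql_agent = QLearningAgent(actions=mdp.get_actions())
rand_agent = RandomAgent(actions=mdp.get_actions())
run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=5, episodes=100, steps=100)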
Example #2
def choose_mdp(mdp_name, env_name="Asteroids-v0"):
    '''
    Args:
        mdp_name (str): one of {gym, grid, chain, taxi, ...}
        env_name (str): gym environment name, like 'CartPole-v0'

    Returns:
        (MDP)
    '''

    # Other imports
    from simple_rl.tasks import ChainMDP, GridWorldMDP, FourRoomMDP, TaxiOOMDP, RandomMDP, PrisonersDilemmaMDP, RockPaperScissorsMDP, GridGameMDP

    # Taxi MDP.
    agent = {"x":1, "y":1, "has_passenger":0}
    passengers = [{"x":4, "y":3, "dest_x":2, "dest_y":2, "in_taxi":0}]
    walls = []
    if mdp_name == "gym":
        # OpenAI Gym MDP.
        try:
            from simple_rl.tasks.gym.GymMDPClass import GymMDP
        except ImportError:
            raise ValueError("(simple_rl) Error: OpenAI gym not installed.")
        return GymMDP(env_name, render=True)
    else:
        return {"grid":GridWorldMDP(5, 5, (1, 1), goal_locs=[(5, 3), (4,1)]),
                "four_room":FourRoomMDP(),
                "chain":ChainMDP(5),
                "taxi":TaxiOOMDP(10, 10, slip_prob=0.0, agent=agent, walls=walls, passengers=passengers),
                "random":RandomMDP(num_states=40, num_rand_trans=20),
                "prison":PrisonersDilemmaMDP(),
                "rps":RockPaperScissorsMDP(),
                "grid_game":GridGameMDP(),
                "multi":{0.5:RandomMDP(num_states=40, num_rand_trans=20), 0.5:RandomMDP(num_states=40, num_rand_trans=5)}}[mdp_name]
Example #3
def make_mdp(mdp_class="grid", grid_dim=7):
    '''
    Returns:
        (MDP)
    '''
    # Grid/Hallway stuff.
    width, height = grid_dim, grid_dim
    hall_goal_locs = [(i, width) for i in range(1, height + 1)]

    four_room_goal_locs = [(width, height), (width, 1), (1, height),
                           (1, height - 2), (width - 2, height - 2),
                           (width - 2, 1)]
    four_room_goal_loc = four_room_goal_locs[5]

    # Taxi stuff.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    passengers = [{
        "x": grid_dim / 2,
        "y": grid_dim / 2,
        "dest_x": grid_dim - 2,
        "dest_y": 2,
        "in_taxi": 0
    }]
    walls = []

    mdp = {
        "hall":
        GridWorldMDP(width=width,
                     height=height,
                     init_loc=(1, 1),
                     goal_locs=hall_goal_locs),
        "pblocks_grid":
        make_grid_world_from_file("pblocks_grid.txt", randomize=True),
        "grid":
        GridWorldMDP(width=width,
                     height=height,
                     init_loc=(1, 1),
                     goal_locs=[(grid_dim, grid_dim)]),
        "four_room":
        FourRoomMDP(width=width, height=height,
                    goal_locs=[four_room_goal_loc]),
        "chain":
        ChainMDP(num_states=grid_dim),
        "random":
        RandomMDP(num_states=50, num_rand_trans=2),
        "hanoi":
        HanoiMDP(num_pegs=grid_dim, num_discs=3),
        "taxi":
        TaxiOOMDP(width=grid_dim,
                  height=grid_dim,
                  slip_prob=0.0,
                  agent=agent,
                  walls=walls,
                  passengers=passengers)
    }[mdp_class]

    return mdp
Example #4
def main():

    # Make MDP.
    grid_dim = 11
    mdp = FourRoomMDP(width=grid_dim, height=grid_dim, init_loc=(1, 1), slip_prob=0.05, goal_locs=[(grid_dim, grid_dim)], gamma=0.99)

    # Experiment Type.
    exp_type = "learn_w_abstr"

    # For comparing policies and visualizing.
    beta = 1
    is_deterministic_ib = True
    is_agent_in_control = True

    # For main plotting experiment.
    beta_range = list(chart_utils.drange(0.0, 4.0, 1.0))
    instances = 1

    # Get demo policy.
    vi = ValueIteration(mdp)
    _, val = vi.run_vi()

    # Epsilon greedy policy
    demo_policy = get_lambda_policy(make_det_policy_eps_greedy(vi.policy, vi.get_states(), mdp.get_actions(), epsilon=0.1))

    if exp_type == "plot_info_sa_val_and_num_states":
        # Makes the main two plots.
        make_info_sa_val_and_size_plots(mdp, demo_policy, beta_range, instances=instances, is_agent_in_control=is_agent_in_control)
    elif exp_type == "compare_policies":
        # Makes a plot comparing value of pi-phi combo from info_sa with \pi_d.
        info_sa_compare_policies(mdp, demo_policy, beta=beta, is_deterministic_ib=is_deterministic_ib, is_agent_in_control=is_agent_in_control)
    elif exp_type == "visualize_info_sa_abstr":
        # Visualize the state abstraction found by info_sa.
        info_sa_visualize_abstr(mdp, demo_policy, beta=beta, is_deterministic_ib=is_deterministic_ib, is_agent_in_control=is_agent_in_control)
    elif exp_type == "learn_w_abstr":
        # Run learning experiments for different settings of \beta.
        learn_w_abstr(mdp, demo_policy, is_deterministic_ib=is_deterministic_ib)
    elif exp_type == "planning":
        info_sa_planning_experiment()
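For reference, make_det_policy_eps_greedy (defined elsewhere in this project) wraps a deterministic policy so that it acts randomly with probability epsilon. A minimal sketch of that idea with hypothetical names, not the project's actual helper:

import random

def eps_greedy_from_det_policy(det_policy, actions, epsilon=0.1):
    # With probability epsilon take a random action; otherwise follow det_policy.
    def policy(state):
        if random.random() < epsilon:
            return random.choice(actions)
        return det_policy(state)
    return policy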
Example #5
def main(open_plot=True):
    # Setup MDP, Agents.
    # mdp = GridWorldMDP(width=4, height=3, init_loc=(1,1), goal_locs=[(4,3)], gamma=0.95, walls=[(2,2)])
    mdp = FourRoomMDP(width=11,
                      height=11,
                      init_loc=(1, 1),
                      goal_locs=[(9, 3)],
                      is_goal_terminal=True,
                      slip_prob=0.2)
    # mdp = ComboLockMDP(combo=[3,1,2], num_actions=3, num_states=3)

    dq_agent = DoubleQAgent(actions=mdp.get_actions())
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent],
                      mdp,
                      instances=5,
                      episodes=1,
                      steps=10000,
                      open_plot=open_plot)
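Note that dq_agent is constructed above but never run; to include Double Q-learning in the comparison, it can simply be added to the agent list:

    run_agents_on_mdp([dq_agent, ql_agent, rand_agent],
                      mdp,
                      instances=5,
                      episodes=1,
                      steps=10000,
                      open_plot=open_plot)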
Example #6
def make_mdp(mdp_class="grid", state_size=7):
    '''
    Returns:
        (MDP)
    '''
    # Grid/Hallway stuff.
    width, height = state_size, state_size
    hall_goal_locs = [(i, width) for i in range(1, height + 1)]

    # Taxi stuff.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    passengers = [{
        "x": state_size / 2,
        "y": state_size / 2,
        "dest_x": state_size - 2,
        "dest_y": 2,
        "in_taxi": 0
    }]
    walls = []

    mdp = {
        "hall":
        GridWorldMDP(width=width,
                     height=height,
                     init_loc=(1, 1),
                     goal_locs=hall_goal_locs),
        "pblocks_grid":
        make_grid_world_from_file("pblocks_grid.txt", randomize=True),
        "grid":
        GridWorldMDP(width=width,
                     height=height,
                     init_loc=(1, 1),
                     goal_locs=[(state_size, state_size)]),
        "four_room":
        FourRoomMDP(width=width, height=height, goal_locs=[(width, height)]),
        "chain":
        ChainMDP(num_states=state_size),
        "random":
        RandomMDP(num_states=50, num_rand_trans=2),
        "taxi":
        TaxiOOMDP(width=state_size,
                  height=state_size,
                  slip_prob=0.0,
                  agent=agent,
                  walls=walls,
                  passengers=passengers)
    }[mdp_class]

    return mdp
Example #7
def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(9, 9, goal_locs=[(9, 9)], gamma=0.95)
    ql_agent = QLearnerAgent(mdp.get_actions())

    viz = parse_args()

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        vi = ValueIteration(mdp)
        vi.run_vi()
        policy = vi.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print "\n", str(ql_agent), "interacting with", str(mdp)
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
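parse_args here (and in the later visualization examples) is a small project helper that reads the visualization mode from the command line. A hypothetical argparse-based sketch, not the project's exact implementation:

import argparse

def parse_args():
    # Pick which visual to display: value function, policy, agent interaction, etc.
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", type=str, default="value",
                        choices=["value", "policy", "agent", "learning", "interactive"],
                        help="Visualization type.")
    return parser.parse_args().v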
Example #8
def make_mdp_distr(mdp_class="grid", num_mdps=15, gamma=0.99):
    '''
    Args:
        mdp_class (str): one of {"grid", "random"}
        num_mdps (int)

    Returns:
        (MDPDistribution)
    '''
    mdp_dist_dict = {}
    mdp_prob = 1.0 / num_mdps
    height, width = 10, 10

    # Make @num_mdps MDPs.
    for i in range(num_mdps):
        next_goals = rnd.sample([(1, 7), (7, 1), (7, 7), (6, 6), (6, 1),
                                 (1, 6)], 2)
        new_mdp = {
            "grid":
            GridWorldMDP(width=width,
                         height=height,
                         init_loc=(1, 1),
                         goal_locs=rnd.sample(
                             list(zip(range(1, width + 1), [height] * width)), 1),
                         is_goal_terminal=True,
                         gamma=gamma),
            "four_room":
            FourRoomMDP(width=8, height=8, goal_locs=next_goals, gamma=gamma),
            "chain":
            ChainMDP(num_states=10,
                     reset_val=rnd.choice([0, 0.01, 0.05, 0.1]),
                     gamma=gamma),
            "random":
            RandomMDP(num_states=40,
                      num_rand_trans=rnd.randint(1, 10),
                      gamma=gamma)
        }[mdp_class]

        mdp_dist_dict[new_mdp] = mdp_prob

    return MDPDistribution(mdp_dist_dict)
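A hedged usage sketch for the distribution built above; run_agents_lifelong and its parameter names are my assumption about the simple_rl API, not something shown in this file:

from simple_rl.agents import QLearningAgent
from simple_rl.run_experiments import run_agents_lifelong

# Sample tasks from the distribution and learn across them.
mdp_distr = make_mdp_distr(mdp_class="four_room", num_mdps=10)
ql_agent = QLearningAgent(actions=mdp_distr.get_actions())
run_agents_lifelong([ql_agent], mdp_distr, samples=10, episodes=50, steps=100)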
Example #9
def make_mdp_distr(mdp_class="grid", grid_dim=9, horizon=0, step_cost=0, gamma=0.99):
    '''
    Args:
        mdp_class (str): one of {"grid", "four_room", "upworld", "corridor", "octo", ...}
        horizon (int)
        step_cost (float)
        gamma (float)

    Returns:
        (MDPDistribution)
    '''
    mdp_dist_dict = {}
    height, width = grid_dim, grid_dim

    # Define goal locations.
        
    # Corridor.
    corr_width = 20
    corr_goal_magnitude = 1 #random.randint(1, 5)
    corr_goal_cols = [i for i in range(1, corr_goal_magnitude + 1)] + [j for j in range(corr_width - corr_goal_magnitude + 1, corr_width + 1)]
    corr_goal_locs  = list(itertools.product(corr_goal_cols, [1]))

    # Grid World
    tl_grid_world_rows, tl_grid_world_cols = [i for i in range(width - 4, width)], [j for j in range(height - 4, height)]
    tl_grid_goal_locs = list(itertools.product(tl_grid_world_rows, tl_grid_world_cols))
    tr_grid_world_rows, tr_grid_world_cols = [i for i in range(1, 4)], [j for j in range(height - 4, height)]
    tr_grid_goal_locs = list(itertools.product(tr_grid_world_rows, tr_grid_world_cols))
    grid_goal_locs = tl_grid_goal_locs + tr_grid_goal_locs

    # Hallway.
    upworld_goal_locs = [(i, height) for i in range(1, 30)]

    # Four room.
    four_room_goal_locs = [(width, height), (width, 1), (1, height), (1, height - 2), (width - 2, height - 2), (width - 2, 1)]

    print(four_room_goal_locs)
                            
    tight_four_room_goal_locs = [(width, height), (width, height-1), (width-1, height), (width, height - 2), (width - 2, height), (width-1, height-1)]

    # Taxi.
    agent = {"x":1, "y":1, "has_passenger":0}
    walls = []

    goal_loc_dict = {"four_room":four_room_goal_locs,
                    "color":four_room_goal_locs,
                    "upworld":upworld_goal_locs,
                    "grid":grid_goal_locs,
                    "corridor":corr_goal_locs,
                    "tight_four_room":tight_four_room_goal_locs,
                    }

    # MDP Probability.
    num_mdps = 10 if mdp_class not in goal_loc_dict.keys() else len(goal_loc_dict[mdp_class])
    if mdp_class == "octo":
        num_mdps = 12
    mdp_prob = 1.0 / num_mdps

    for i in range(num_mdps):

        new_mdp = {"hrooms":make_grid_world_from_file("hierarch_rooms.txt", num_goals=7, randomize=False),
                    "octo":make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False, goal_num=i),
                    "upworld":GridWorldMDP(width=30, height=height, rand_init=False, goal_locs=goal_loc_dict["upworld"], name="upworld", is_goal_terminal=True),
                    "corridor":GridWorldMDP(width=20, height=1, init_loc=(10, 1), goal_locs=[goal_loc_dict["corridor"][i % len(goal_loc_dict["corridor"])]], is_goal_terminal=True, name="corridor"),
                    "grid":GridWorldMDP(width=width, height=height, rand_init=True, goal_locs=[goal_loc_dict["grid"][i % len(goal_loc_dict["grid"])]], is_goal_terminal=True),
                    "four_room":FourRoomMDP(width=width, height=height, goal_locs=[goal_loc_dict["four_room"][i % len(goal_loc_dict["four_room"])]], is_goal_terminal=True),
                    "color":ColorMDP(width=width, height=height, num_colors=4, goal_locs=[goal_loc_dict["four_room"][i % len(goal_loc_dict["four_room"])]], is_goal_terminal=True),
                    "tight_four_room":FourRoomMDP(width=width, height=height, goal_locs=[goal_loc_dict["tight_four_room"][i % len(goal_loc_dict["tight_four_room"])]], is_goal_terminal=True, name="tight_four_room")}[mdp_class]

        new_mdp.set_step_cost(step_cost)
        new_mdp.set_gamma(gamma)
        
        mdp_dist_dict[new_mdp] = mdp_prob

    return MDPDistribution(mdp_dist_dict, horizon=horizon)
Example #10
def make_mdp_distr(mdp_class, is_goal_terminal, mdp_size=11, horizon=0, gamma=0.99):
    '''
    Args:
        mdp_class (str): one of {"four_room", "corridor", "combo_lock", "spread", "tight", "chain"}
        is_goal_terminal (bool)
        mdp_size (int)
        horizon (int)
        gamma (float)

    Returns:
        (MDPDistribution)
    '''
    mdp_dist_dict = {}

    height, width = mdp_size, mdp_size

    # Corridor.
    corr_width = 20
    corr_goal_magnitude = 1 #random.randint(1, 5)
    corr_goal_cols = [i for i in range(1, corr_goal_magnitude + 1)] + [j for j in range(corr_width-corr_goal_magnitude + 1, corr_width + 1)]
    corr_goal_locs  = list(itertools.product(corr_goal_cols, [1]))

    # Grid World
    tl_grid_world_rows, tl_grid_world_cols = [i for i in range(width - 4, width)], [j for j in range(height - 4, height)]
    tl_grid_goal_locs = list(itertools.product(tl_grid_world_rows, tl_grid_world_cols))
    tr_grid_world_rows, tr_grid_world_cols = [i for i in range(1, 4)], [j for j in range(height - 4, height)]
    tr_grid_goal_locs = list(itertools.product(tr_grid_world_rows, tr_grid_world_cols))
    grid_goal_locs = tl_grid_goal_locs + tr_grid_goal_locs

    # Four room.
    four_room_goal_locs = [(width, height), (width, 1), (1, height), (1, height - 2), (width - 2, height - 2), (width - 2, 1)]

    # SPREAD vs. TIGHT
    spread_goal_locs = [(width, height), (width, 1), (1, height), (1, height - 2), (width - 2, height - 2), (width - 2, 1), (2,2)]
    tight_goal_locs = [(width, height), (width-1, height), (width, height-1), (width, height - 2), (width - 2, height), (width - 1, height-1), (width-2,height-2)]

    changing_entities = {"four_room":four_room_goal_locs,
                    "grid":grid_goal_locs,
                    "corridor":corr_goal_locs,
                    "spread":spread_goal_locs,
                    "tight":tight_goal_locs,
                    "chain":[0.0, 0.01, 0.1, 0.5, 1.0],
                    "combo_lock":[[3,1,2],[3,2,1],[2,3,1],[3,3,1]],
                    "walls":make_wall_permutations(mdp_size),
                    "lava":make_lava_permutations(mdp_size)
                    }

    # MDP Probability.
    num_mdps = 10 if mdp_class not in changing_entities.keys() else len(changing_entities[mdp_class])
    if mdp_class == "octo":
        num_mdps = 12
    mdp_prob = 1.0 / num_mdps

    for i in range(num_mdps):

        new_mdp = {"chain":ChainMDP(reset_val=changing_entities["chain"][i%len(changing_entities["chain"])]),
                   # "lava":GridWorldMDP(width=width, height=height, rand_init=False, step_cost=-0.001, lava_cost=0.0, lava_locs=changing_entities["lava"][i%len(changing_entities["lava"])], goal_locs=[(mdp_size-3, mdp_size-3)], is_goal_terminal=is_goal_terminal, name="lava_world", slip_prob=0.1),
                    "four_room":FourRoomMDP(width=width, height=height, goal_locs=[changing_entities["four_room"][i % len(changing_entities["four_room"])]], is_goal_terminal=is_goal_terminal),
                   # "octo":make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False, goal_num=i),
                    "corridor":GridWorldMDP(width=20, height=1, init_loc=(10, 1), goal_locs=[changing_entities["corridor"][i % len(changing_entities["corridor"])]], is_goal_terminal=is_goal_terminal, name="corridor"),
                    "combo_lock":ComboLockMDP(combo=changing_entities["combo_lock"][i%len(changing_entities["combo_lock"])]),
                    "spread":GridWorldMDP(width=width, height=height, rand_init=False, goal_locs=[changing_entities["spread"][i % len(changing_entities["spread"])]], is_goal_terminal=is_goal_terminal, name="spread_grid"),
                    "tight":GridWorldMDP(width=width, height=height, rand_init=False, goal_locs=[changing_entities["tight"][i % len(changing_entities["tight"])]], is_goal_terminal=is_goal_terminal, name="tight_grid"),
                    }[mdp_class]

        new_mdp.set_gamma(gamma)
        
        mdp_dist_dict[new_mdp] = mdp_prob

    return MDPDistribution(mdp_dist_dict, horizon=horizon)
Example #11
def branching_factor_experiment(min_options=0,
                                max_options=20,
                                increment=2,
                                instances=5,
                                epsilon=0.05):
    '''
    Args:
        min_options (int)
        max_options (int)
        increment (int)

    Summary:
        Runs an experiment contrasting learning performance for different # options.
    '''
    # Define MDP.
    grid_size = 7
    mdp = FourRoomMDP(width=grid_size,
                      height=grid_size,
                      goal_locs=[(grid_size, grid_size)])

    # Make State Abstraction.
    states, _ = ah.compute_reachable_state_space(mdp, sample_rate=50)
    state_abstr = core.compute_phi_given_m(mdp,
                                           four_rooms_predicate_9x9,
                                           level=1,
                                           states=states)

    x_axis = range(min_options, max_options + 1, increment)
    y_axis = defaultdict(list)  #[] #[0] * len(x_axis)
    conf_intervals = defaultdict(list)
    num_options_performance = defaultdict(lambda: defaultdict(list))

    # Choose dependent variable (either #steps per episode or #episodes).
    d_var_range = [(20, 5), (40, 250), (400, 2500)]

    for steps, episodes in d_var_range:
        print "steps, episodes", steps, episodes

        # Evaluate.
        for i, instance in enumerate(range(instances)):
            print "\tInstance", instance + 1, "of", str(instances) + "."

            # Make initial Options.
            for num_options in x_axis:

                options, _ = make_near_optimal_phi_relative_options(
                    mdp,
                    state_abstr,
                    'eps-greedy',
                    num_rand_opts=num_options - 1,
                    eps=epsilon)
                action_abstr = ActionAbstraction(
                    options=options, prim_actions=mdp.get_actions())

                # Make agent.
                AgentClass = RMaxAgent  # DoubleQAgent, QLearningAgent, SarsaAgent
                sa_aa_agent = AbstractionWrapper(
                    AgentClass,
                    agent_params={"actions": mdp.get_actions()},
                    state_abstr=state_abstr,
                    action_abstr=action_abstr,
                    name_ext="-$\\phi,O$")

                _, _, value_per_episode = run_single_agent_on_mdp(
                    sa_aa_agent, mdp, episodes=episodes, steps=steps)
                mdp.reset()

                num_options_performance[(steps, episodes)][num_options].append(
                    value_per_episode[-1])

    ############
    # Other types

    # Just state abstraction.
    steps, episodes = d_var_range[-1][0], d_var_range[-1][1]
    sa_agent = AbstractionWrapper(AgentClass,
                                  agent_params={"actions": mdp.get_actions()},
                                  state_abstr=state_abstr,
                                  action_abstr=None,
                                  name_ext="-$\\phi$")
    _, _, value_per_episode = run_single_agent_on_mdp(sa_agent,
                                                      mdp,
                                                      episodes=episodes,
                                                      steps=steps)
    num_options_performance[(d_var_range[-1][0],
                             d_var_range[-1][1])]["phi"].append(
                                 value_per_episode[-1])
    y_axis["phi"] = [value_per_episode[-1]]

    # Run random options.
    options = make_fixed_random_options(mdp, state_abstr)
    action_abstr = ActionAbstraction(options=options,
                                     prim_actions=mdp.get_actions())
    AgentClass = QLearningAgent
    rand_opt_agent = AbstractionWrapper(
        AgentClass,
        agent_params={"actions": mdp.get_actions()},
        state_abstr=state_abstr,
        action_abstr=action_abstr,
        name_ext="-$\\phi,O_{\text{random}}$")
    _, _, value_per_episode = run_single_agent_on_mdp(rand_opt_agent,
                                                      mdp,
                                                      episodes=episodes,
                                                      steps=steps)
    num_options_performance[(d_var_range[-1][0],
                             d_var_range[-1][1])]["random"].append(
                                 value_per_episode[-1])
    y_axis["random"] = [value_per_episode[-1]]

    # Make optimal agent.
    value_iter = ValueIteration(mdp)
    value_iter.run_vi()
    optimal_agent = FixedPolicyAgent(value_iter.policy)
    _, _, value_per_episode = run_single_agent_on_mdp(optimal_agent,
                                                      mdp,
                                                      episodes=episodes,
                                                      steps=steps)
    y_axis["optimal"] = [value_per_episode[-1]]
    total_steps = d_var_range[0][0] * d_var_range[0][1]

    # Confidence intervals.
    for dependent_var in d_var_range:
        for num_options in x_axis:
            # Compute mean and standard error.
            avg_for_n = float(
                sum(num_options_performance[dependent_var]
                    [num_options])) / instances
            std_deviation = np.std(
                num_options_performance[dependent_var][num_options])
            std_error = 1.96 * (std_deviation / math.sqrt(
                len(num_options_performance[dependent_var][num_options])))
            y_axis[dependent_var].append(avg_for_n)
            conf_intervals[dependent_var].append(std_error)

    plt.xlabel("$|O_\\phi|$")
    plt.xlim([1, len(x_axis)])
    plt.ylabel("$V^{\hat{\pi}_{O_\\phi}}(s_0)$")
    plt.tight_layout()  # Keeps the spacing nice.

    # Add just state abstraction.
    ep_val_del_q_phi = y_axis["phi"]
    label = "$O_{\\phi}$"  #" N=1e" + str(str(total_steps).count("0")) + "$"
    plt.plot(x_axis, [ep_val_del_q_phi] * len(x_axis),
             marker="+",
             linestyle="--",
             linewidth=1.0,
             color=PLOT_COLORS[-1],
             label=label)

    # Add random options.
    ep_val_del_q = y_axis["random"]
    label = "$O_{random}$"  #" N=1e" + str(str(total_steps).count("0")) + "$"
    plt.plot(x_axis, [ep_val_del_q] * len(x_axis),
             marker="x",
             linestyle="--",
             linewidth=1.0,
             color=PLOT_COLORS[0])  #, label=label)

    # Add optimal.
    ep_val_optimal = y_axis["optimal"]
    plt.plot(x_axis, [ep_val_optimal] * len(x_axis),
             linestyle="-",
             linewidth=1.0,
             color=PLOT_COLORS[1])  #, label="$\\pi^*$")

    for i, dependent_var in enumerate(d_var_range):
        total_steps = dependent_var[0] * dependent_var[1]
        label = "$O_{\\phi,Q_\\varepsilon^*}, N=1e" + str(
            str(total_steps).count("0")) + "$"
        plt.plot(x_axis,
                 y_axis[dependent_var],
                 marker="x",
                 color=PLOT_COLORS[i + 2],
                 linewidth=1.5,
                 label=label)

        # Confidence intervals.
        top = np.add(y_axis[dependent_var], conf_intervals[dependent_var])
        bot = np.subtract(y_axis[dependent_var], conf_intervals[dependent_var])
        plt.fill_between(x_axis,
                         top,
                         bot,
                         alpha=0.25,
                         color=PLOT_COLORS[i + 2])

    plt.legend()
    plt.savefig("branching_factor_results.pdf", format="pdf")
    plt.cla()
    plt.close()
Example #12
def info_sa_planning_experiment(min_grid_size=5, max_grid_size=11, beta=10.0):
    '''
    Args:
        min_grid_size (int)
        max_grid_size (int)
        beta (float): Hyperparameter for InfoSA.

    Summary:
        Writes num iterations and time (seconds) for planning with and without abstractions.
    '''
    vanilla_file = "vi.csv"
    sa_file = "vi-$\\phi$.csv"
    file_prefix = os.path.join("results", "planning-four_room")
    
    clear_files(dir_name=file_prefix)

    for grid_dim in range(min_grid_size, max_grid_size + 1):
        # ======================
        # == Make Environment ==
        # ======================
        mdp = FourRoomMDP(width=grid_dim, height=grid_dim, init_loc=(1, 1), goal_locs=[(grid_dim, grid_dim)], gamma=0.9)
        
        # Get demo policy.
        vi = ValueIteration(mdp)
        vi.run_vi()
        demo_policy = get_lambda_policy(make_det_policy_eps_greedy(vi.policy, vi.get_states(), mdp.get_actions(), epsilon=0.2))

        # =======================
        # == Make Abstractions ==
        # =======================
        pmf_s_phi, phi_pmf, abstr_policy = run_info_sa(mdp, demo_policy, iters=500, beta=beta, convergence_threshold=0.00001)
        lambda_abstr_policy = get_lambda_policy(abstr_policy)
        prob_s_phi = ProbStateAbstraction(phi_pmf)
        crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)

        # ============
        # == Run VI ==
        # ============
        vanilla_vi = ValueIteration(mdp, delta=0.0001, sample_rate=25)
        sa_vi = AbstractValueIteration(ground_mdp=mdp, state_abstr=crisp_s_phi, delta=0.0001, vi_sample_rate=25, amdp_sample_rate=25)

        # ==========
        # == Plan ==
        # ==========
        print "Running VIs."
        start_time = time.clock()
        vanilla_iters, vanilla_val = vanilla_vi.run_vi()
        vanilla_time = round(time.clock() - start_time, 2)

        mdp.reset()
        start_time = time.time()
        sa_iters, sa_abs_val = sa_vi.run_vi()
        sa_time = round(time.time() - start_time, 2)
        sa_val = evaluate_agent(FixedPolicyAgent(sa_vi.policy), mdp, instances=25)

        print "\n" + "*"*20
        print "Vanilla", "\n\t Iters:", vanilla_iters, "\n\t Value:", round(vanilla_val, 4), "\n\t Time:", vanilla_time
        print 
        print "Phi:", "\n\t Iters:", sa_iters, "\n\t Value:", round(sa_val, 4), "\n\t Time:", sa_time
        print "*"*20 + "\n\n"

        write_datum(os.path.join(file_prefix, "iters", vanilla_file), vanilla_iters)
        write_datum(os.path.join(file_prefix, "iters", sa_file), sa_iters)

        write_datum(os.path.join(file_prefix, "times", vanilla_file), vanilla_time)
        write_datum(os.path.join(file_prefix, "times", sa_file), sa_time)
Example #13
def main():

    # Setup MDP, Agents.
    mdp = FourRoomMDP(11, 11, goal_locs=[(11, 11)], gamma=0.9, step_cost=0.0)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.4)
    viz = parse_args()

    # Choose viz type.
    viz = "learning"

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
    elif viz == "interactive":
        mdp.visualize_interaction()
Example #14
def run_learning_experiment():
    """
    Summary:
        Builds different sets of options and contrasts how RL algorithms
        perform when learning with them.
    """
    # Define MDP.
    width, height = 11, 11
    mdp = FourRoomMDP(width=width,
                      height=height,
                      goal_locs=[(width, height)],
                      slip_prob=0.05)
    actions = mdp.get_actions()

    # Make State Abstraction.
    states, _ = ah.compute_reachable_state_space(mdp, sample_rate=50)
    if isinstance(mdp, FourRoomMDP):
        predicate = four_rooms_predicate_11x11
    else:
        predicate = reachable_in_n_steps_predicate

    state_abstr = core.compute_phi_given_m(mdp,
                                           predicate,
                                           level=1,
                                           states=states)

    # Make initial Options.
    num_rand_opts_to_add = 2
    options, _ = make_near_optimal_phi_relative_options(
        mdp,
        state_abstr,
        'eps-greedy',
        num_rand_opts=num_rand_opts_to_add,
        eps=0.05)
    action_abstr = ActionAbstraction(options=options, prim_actions=actions)
    action_abstr_w_prims = ActionAbstraction(options=options,
                                             prim_actions=actions,
                                             incl_primitives=True)

    # Find eigen options.
    # num_eigen_options = max(1, num_rand_opts_to_add - 1)
    # eigen_options_init_all = find_eigenoptions(mdp, num_options=num_eigen_options, init_everywhere=True)
    # eigen_options_w_prims = find_eigenoptions(mdp, num_options=num_eigen_options, init_everywhere=False)
    # eigen_aa_init_all = ActionAbstraction(options=eigen_options_init_all, prim_actions=actions, incl_primitives=False)
    # eigen_aa_w_prims = ActionAbstraction(options=eigen_options_w_prims, prim_actions=actions, incl_primitives=True)

    # Make agent.
    AgentClass = QLearningAgent  #QLearningAgent #DoubleQAgent #DelayedQAgent
    ql_agent = AgentClass(mdp.get_actions())
    sa_aa_agent = AbstractionWrapper(AgentClass,
                                     agent_params={"actions": actions},
                                     state_abstr=state_abstr,
                                     action_abstr=action_abstr_w_prims,
                                     name_ext="-$\\phi,O$")
    aa_agent = AbstractionWrapper(AgentClass,
                                  agent_params={"actions": actions},
                                  state_abstr=None,
                                  action_abstr=action_abstr_w_prims,
                                  name_ext="-$O$")
    # aa_agent = AbstractionWrapper(AgentClass, agent_params={"actions": actions}, state_abstr=None, action_abstr=action_abstr_w_prims, name_ext="-$\\phi$")
    # Eigen agents.
    # eigen_agent_init_all = AbstractionWrapper(AgentClass, agent_params={"actions": actions}, state_abstr=None, action_abstr=eigen_aa_init_all, name_ext="-eigen_all")
    # eigen_agent_w_prims = AbstractionWrapper(AgentClass, agent_params={"actions": actions}, state_abstr=None, action_abstr=eigen_aa_w_prims, name_ext="-eigen_w_prims")
    agents = [ql_agent, aa_agent,
              sa_aa_agent]  #, eigen_agent_init_all, eigen_agent_w_prims]

    # Run.
    if isinstance(mdp, FourRoomMDP):
        run_agents_on_mdp(agents, mdp, instances=10, episodes=500, steps=50)
    else:
        run_agents_on_mdp(agents, mdp, instances=10, episodes=100, steps=10)
Example #15
def make_mdp_distr(mdp_class="grid", grid_dim=7, horizon=0):
    '''
    Args:
        mdp_class (str): one of {"grid", "random"}
        horizon (int)

    Returns:
        (MDPDistribution)
    '''
    mdp_dist_dict = {}
    height, width = grid_dim, grid_dim

    # Define goal locations.

    # Corridor.
    corr_width = 20
    corr_goal_magnitude = random.randint(1, 5)
    corr_goal_cols = [i for i in range(1, corr_goal_magnitude)] + [
        j for j in range(corr_width - corr_goal_magnitude, corr_width + 1)
    ]
    corr_goal_locs = list(itertools.product(corr_goal_cols, [1]))

    # Grid World
    grid_world_rows, grid_world_cols = [i for i in range(width - 4, width)], [
        j for j in range(height - 4, height)
    ]
    grid_goal_locs = list(itertools.product(grid_world_rows, grid_world_cols))

    # Hallway.
    hall_goal_locs = [(i, width) for i in range(1, height + 1)]

    # Four room.
    four_room_goal_locs = [(2, 2), (width, height), (width, 1), (1, height)]

    # Taxi.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    walls = []

    goal_loc_dict = {
        "four_room": four_room_goal_locs,
        "hall": hall_goal_locs,
        "grid": grid_goal_locs,
        "corridor": corr_goal_locs
    }

    # MDP Probability.
    num_mdps = 10 if mdp_class not in goal_loc_dict.keys() else len(
        goal_loc_dict[mdp_class])
    mdp_prob = 1.0 / num_mdps

    for i in range(num_mdps):

        new_mdp = {"hall":GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=[goal_loc_dict["hall"][i % len(goal_loc_dict["hall"])]]),
                    "corridor":GridWorldMDP(width=20, height=1, init_loc=(10, 1), goal_locs=[goal_loc_dict["corridor"][i % len(goal_loc_dict["corridor"])]], is_goal_terminal=True),
                    "grid":GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=[goal_loc_dict["grid"][i % len(goal_loc_dict["grid"])]], is_goal_terminal=True),
                    "four_room":FourRoomMDP(width=width, height=height, goal_locs=[goal_loc_dict["four_room"][i % len(goal_loc_dict["four_room"])]]),
                    # THESE GOALS ARE SPECIFIED IMPLICITLY:
                    "pblocks_grid":make_grid_world_from_file("pblocks_grid.txt", randomize=True),
                    "chain":ChainMDP(num_states=10, reset_val=random.choice([0, 0.01, 0.05, 0.1, 0.2, 0.5])),
                    "random":RandomMDP(num_states=40, num_rand_trans=random.randint(1,10)),
                    "taxi":TaxiOOMDP(4, 4, slip_prob=0.0, agent=agent, walls=walls, \
                                    passengers=[{"x":2, "y":2, "dest_x":random.randint(1,4), "dest_y":random.randint(1,4), "in_taxi":0}])}[mdp_class]

        mdp_dist_dict[new_mdp] = mdp_prob

    return MDPDistribution(mdp_dist_dict, horizon=horizon)
Example #16
                s = torch.tensor((x, y)).float()
                action = torch.argmax(
                    alg.action_distr(s, torch.tensor(skill)).logits).item()
                action_word = dict(enumerate(alg.env.action_map)).get(
                    action, 'term')
                policy[x, y] = action_word
    return policy


plt.figure()

#%%
reset_seeds(0)
from simple_rl.tasks import FourRoomMDP
from notebooks.simple_rl_env import SimpleGymEnv
env = SimpleGymEnv(FourRoomMDP(12, 12, goal_locs=[(12, 12)]))
env.render()
s0 = torch.as_tensor(env.reset(), dtype=torch.float32)
ndim_s = len(env.observation_space)
n_actions = env.action_space.n
n_skills = 40
gamma = 0.99
max_steps_per_skill = 10
n_units = 32
lr = 1e-3
alg = VIC
# alg = DIAYN
alg = alg(env, ndim_s, n_actions, n_skills, gamma, max_steps_per_skill,
          n_units, lr)
start_policies = [get_policy(alg, env.mdp, s) for s in range(n_skills)]
#%%
Example #17
def make_mdp_distr(mdp_class="grid",
                   grid_dim=9,
                   horizon=0,
                   step_cost=0,
                   gamma=0.99):
    '''
    Args:
        mdp_class (str): one of {"grid", "random"}
        horizon (int)
        step_cost (float)
        gamma (float)

    Returns:
        (MDPDistribution)
    '''
    mdp_dist_dict = {}
    height, width = grid_dim, grid_dim

    # Define goal locations.

    # Corridor.
    corr_width = 20
    corr_goal_magnitude = 1  #random.randint(1, 5)
    corr_goal_cols = [i for i in range(1, corr_goal_magnitude + 1)] + [
        j for j in range(corr_width - corr_goal_magnitude + 1, corr_width + 1)
    ]
    corr_goal_locs = list(itertools.product(corr_goal_cols, [1]))

    # Grid World
    tl_grid_world_rows, tl_grid_world_cols = [
        i for i in range(width - 4, width)
    ], [j for j in range(height - 4, height)]
    tl_grid_goal_locs = list(
        itertools.product(tl_grid_world_rows, tl_grid_world_cols))
    tr_grid_world_rows, tr_grid_world_cols = [i for i in range(1, 4)], [
        j for j in range(height - 4, height)
    ]
    tr_grid_goal_locs = list(
        itertools.product(tr_grid_world_rows, tr_grid_world_cols))
    grid_goal_locs = tl_grid_goal_locs + tr_grid_goal_locs

    # Hallway.
    hall_goal_locs = [(i, height) for i in range(1, 30)]

    # Four room.
    four_room_goal_locs = [(width, height), (width, 1), (1, height),
                           (1, height - 2),
                           (width - 2, height - 2)]  #, (width - 2, 1)]

    # Taxi.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    walls = []

    goal_loc_dict = {
        "four_room": four_room_goal_locs,
        "hall": hall_goal_locs,
        "grid": grid_goal_locs,
        "corridor": corr_goal_locs,
    }

    # MDP Probability.
    num_mdps = 10 if mdp_class not in goal_loc_dict.keys() else len(
        goal_loc_dict[mdp_class])
    if mdp_class == "octo":
        num_mdps = 12
    mdp_prob = 1.0 / num_mdps

    for i in range(num_mdps):

        new_mdp = {"hrooms":make_grid_world_from_file("hierarch_rooms.txt", num_goals=7, randomize=False),
                    "octo":make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False, goal_num=i),
                    "hall":GridWorldMDP(width=30, height=height, rand_init=False, goal_locs=goal_loc_dict["hall"], name="hallway", is_goal_terminal=True),
                    "corridor":GridWorldMDP(width=20, height=1, init_loc=(10, 1), goal_locs=[goal_loc_dict["corridor"][i % len(goal_loc_dict["corridor"])]], is_goal_terminal=True, name="corridor"),
                    "grid":GridWorldMDP(width=width, height=height, rand_init=True, goal_locs=[goal_loc_dict["grid"][i % len(goal_loc_dict["grid"])]], is_goal_terminal=True),
                    "four_room":FourRoomMDP(width=width, height=height, goal_locs=[goal_loc_dict["four_room"][i % len(goal_loc_dict["four_room"])]], is_goal_terminal=True),
                    # THESE GOALS ARE SPECIFIED IMPLICITLY:
                    "pblocks_grid":make_grid_world_from_file("pblocks_grid.txt", randomize=True, slip_prob=0.1),
                    "chain":ChainMDP(num_states=10, reset_val=random.choice([0, 0.01, 0.05, 0.1, 0.2, 0.5])),
                    "random":RandomMDP(num_states=40, num_rand_trans=random.randint(1,10)),
                    "taxi":TaxiOOMDP(3, 4, slip_prob=0.0, agent=agent, walls=walls, \
                                    passengers=[{"x":2, "y":1, "dest_x":random.choice([2,3]), "dest_y":random.choice([2,3]), "in_taxi":0},
                                                {"x":1, "y":2, "dest_x":random.choice([1,2]), "dest_y":random.choice([1,4]), "in_taxi":0}])}[mdp_class]

        new_mdp.set_step_cost(step_cost)
        new_mdp.set_gamma(gamma)

        mdp_dist_dict[new_mdp] = mdp_prob

    return MDPDistribution(mdp_dist_dict, horizon=horizon)
Example #18
def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(5, 5, goal_locs=[(5, 5)], gamma=0.99, step_cost=0.01)
    # mdp = make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.5) 
    rm_agent = RMaxAgent(mdp.get_actions())
    viz = parse_args()
    viz = "learning"

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
Example #19
    pass

parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, parent_dir)

from simple_rl.agents import QLearningAgent, RandomAgent, RMaxAgent
from simple_rl.planning import ValueIteration
from simple_rl.tasks import GridWorldMDP
from simple_rl.run_experiments import run_agents_on_mdp

# # Setup MDP.
# mdp = GridWorldMDP(width=6, height=6, init_loc=(1, 1), goal_locs=[(6, 6)])
#
# # Setup Agents.
# ql_agent = QLearningAgent(actions=mdp.get_actions())
# rand_agent = RandomAgent(actions=mdp.get_actions())
# rmax_agent = RMaxAgent(actions=mdp.get_actions(), horizon=3, s_a_threshold=1)
#
# # Run experiment and make plot.
# run_agents_on_mdp([ql_agent, rand_agent, rmax_agent], mdp, instances=5, episodes=100, steps=40, reset_at_terminal=True,
#                   verbose=False)

from simple_rl.tasks import FourRoomMDP
from simple_rl.tasks.grid_world import grid_visualizer

four_room_mdp = FourRoomMDP(9, 9, goal_locs=[(9, 9), (5, 4)], gamma=0.95)

# Run experiment and make plot.
# four_room_mdp.visualize_value()
four_room_mdp.visualize_interaction()
# four_room_mdp.visualize_policy()