def test_tree_debugger_tiger(debug_tree=False):
    tiger_problem = TigerProblem.create("tiger-left", 0.5, 0.15)
    pouct = pomdp_py.POUCT(max_depth=4, discount_factor=0.95,
                           num_sims=4096, exploration_const=200,
                           rollout_policy=tiger_problem.agent.policy_model)
    pouct.plan(tiger_problem.agent)

    dd = TreeDebugger(tiger_problem.agent.tree)

    # The number of VNodes equals the sum of VNodes per layer
    assert dd.nv == sum([len(dd.l(i)) for i in range(dd.nl)])
    # The total number of nodes equals the number of VNodes plus QNodes
    assert dd.nn == dd.nv + dd.nq

    # Test example usage
    dd.mark(dd.path(dd.layer(2)[0]))
    print("Printing tree up to depth 1")
    dd.p(1)

    # There exists a path from the root to nodes in the tree
    for i in range(dd.nl):
        n = dd.l(i)[0]
        path = dd.path_to(n)
        assert path is not None

    test_planner(tiger_problem, pouct, nsteps=3, debug_tree=debug_tree)
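# Minimal runner sketch (an assumption, not part of the original test file):
# exercises the TreeDebugger checks above directly from the command line.
if __name__ == "__main__":
    test_tree_debugger_tiger(debug_tree=False)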
def main():
    init_true_state = random.choice([State("tiger-left"),
                                     State("tiger-right")])
    init_belief = pomdp_py.Histogram({State("tiger-left"): 0.5,
                                      State("tiger-right"): 0.5})
    tiger_problem = TigerProblem(0.15,  # observation noise
                                 init_true_state, init_belief)

    print("** Testing value iteration **")
    vi = pomdp_py.ValueIteration(horizon=3, discount_factor=0.95)
    test_planner(tiger_problem, vi, nsteps=3)

    # Reset agent belief
    tiger_problem.agent.set_belief(init_belief, prior=True)

    print("\n** Testing POUCT **")
    pouct = pomdp_py.POUCT(max_depth=3, discount_factor=0.95,
                           num_sims=4096, exploration_const=200,
                           rollout_policy=tiger_problem.agent.policy_model)
    test_planner(tiger_problem, pouct, nsteps=10)
    pomdp_py.visual.visualize_pouct_search_tree(tiger_problem.agent.tree,
                                                max_depth=5, anonymize=False)

    # Reset agent belief
    tiger_problem.agent.set_belief(init_belief, prior=True)
    tiger_problem.agent.tree = None

    print("** Testing POMCP **")
    tiger_problem.agent.set_belief(pomdp_py.Particles.from_histogram(init_belief,
                                                                     num_particles=100),
                                   prior=True)
    pomcp = pomdp_py.POMCP(max_depth=3, discount_factor=0.95,
                           num_sims=1000, exploration_const=200,
                           rollout_policy=tiger_problem.agent.policy_model)
    test_planner(tiger_problem, pomcp, nsteps=10)
    pomdp_py.visual.visualize_pouct_search_tree(tiger_problem.agent.tree,
                                                max_depth=5, anonymize=False)
def main():
    init_true_state = random.choice([TigerState("tiger-left"),
                                     TigerState("tiger-right")])
    init_belief = pomdp_py.Histogram({TigerState("tiger-left"): 0.5,
                                      TigerState("tiger-right"): 0.5})
    tiger_problem = TigerProblem(0.15,  # observation noise
                                 init_true_state, init_belief)

    print("** Testing value iteration **")
    vi = pomdp_py.ValueIteration(horizon=3, discount_factor=0.95)
    test_planner(tiger_problem, vi, nsteps=3)

    # Reset agent belief
    tiger_problem.agent.set_belief(init_belief, prior=True)

    print("\n** Testing POUCT **")
    pouct = pomdp_py.POUCT(max_depth=3, discount_factor=0.95,
                           num_sims=4096, exploration_const=50,
                           rollout_policy=tiger_problem.agent.policy_model,
                           show_progress=True)
    test_planner(tiger_problem, pouct, nsteps=10)
    TreeDebugger(tiger_problem.agent.tree).pp

    # Reset agent belief
    tiger_problem.agent.set_belief(init_belief, prior=True)
    tiger_problem.agent.tree = None

    print("** Testing POMCP **")
    tiger_problem.agent.set_belief(pomdp_py.Particles.from_histogram(init_belief,
                                                                     num_particles=100),
                                   prior=True)
    pomcp = pomdp_py.POMCP(max_depth=3, discount_factor=0.95,
                           num_sims=1000, exploration_const=50,
                           rollout_policy=tiger_problem.agent.policy_model,
                           show_progress=True, pbar_update_interval=500)
    test_planner(tiger_problem, pomcp, nsteps=10)
    TreeDebugger(tiger_problem.agent.tree).pp
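# Minimal runner sketch (an assumption, not shown in the original listing):
if __name__ == "__main__":
    main()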
def solve(problem,
          max_depth=10,            # planning horizon
          discount_factor=0.99,
          planning_time=1.,        # amount of time (s) to plan each step
          exploration_const=1000,  # exploration constant
          visualize=True,
          max_time=120,    # maximum amount of time allowed to solve the problem
          max_steps=500):  # maximum number of planning steps the agent can take
    """
    This function terminates when:
    - the maximum time (max_time) is reached; this time includes planning and updates
    - the agent has planned `max_steps` number of steps
    - the agent has taken n FindAction(s), where n = number of target objects.

    Args:
        visualize (bool): if True, show the pygame visualization.
    """
    random_objid = random.sample(problem.env.target_objects, 1)[0]
    random_object_belief = problem.agent.belief.object_beliefs[random_objid]
    if isinstance(random_object_belief, pomdp_py.Histogram):
        # Use POUCT
        planner = pomdp_py.POUCT(max_depth=max_depth,
                                 discount_factor=discount_factor,
                                 planning_time=planning_time,
                                 exploration_const=exploration_const,
                                 rollout_policy=problem.agent.policy_model)  # Random by default
    elif isinstance(random_object_belief, pomdp_py.Particles):
        # Use POMCP
        planner = pomdp_py.POMCP(max_depth=max_depth,
                                 discount_factor=discount_factor,
                                 planning_time=planning_time,
                                 exploration_const=exploration_const,
                                 rollout_policy=problem.agent.policy_model)  # Random by default
    else:
        raise ValueError("Unsupported object belief type %s"
                         % str(type(random_object_belief)))
    robot_id = problem.agent.robot_id

    if visualize:
        # controllable=False means no keyboard control.
        viz = MosViz(problem.env, controllable=False)
        if viz.on_init() == False:
            raise Exception("Environment failed to initialize")
        viz.update(robot_id, None, None, None, problem.agent.cur_belief)
        viz.on_render()

    _time_used = 0
    _find_actions_count = 0
    _total_reward = 0  # total, undiscounted reward
    for i in range(max_steps):
        # Plan action
        _start = time.time()
        real_action = planner.plan(problem.agent)
        _time_used += time.time() - _start
        if _time_used > max_time:
            break  # no more time to update.

        # Execute action
        reward = problem.env.state_transition(real_action, execute=True,
                                              robot_id=robot_id)

        # Receive observation
        _start = time.time()
        real_observation = \
            problem.env.provide_observation(problem.agent.observation_model,
                                            real_action)

        # Updates
        problem.agent.clear_history()  # truncate history
        problem.agent.update_history(real_action, real_observation)
        belief_update(problem.agent, real_action, real_observation,
                      problem.env.state.object_states[robot_id],
                      planner)
        _time_used += time.time() - _start

        # Info and render
        _total_reward += reward
        if isinstance(real_action, FindAction):
            _find_actions_count += 1
        print("==== Step %d ====" % (i + 1))
        print("Action: %s" % str(real_action))
        print("Observation: %s" % str(real_observation))
        print("Reward: %s" % str(reward))
        print("Reward (Cumulative): %s" % str(_total_reward))
        print("Find Actions Count: %d" % _find_actions_count)
        if isinstance(planner, pomdp_py.POUCT):
            print("__num_sims__: %d" % planner.last_num_sims)

        if visualize:
            # This is used to show the sensing range; not sampled
            # according to the observation model.
            robot_pose = problem.env.state.object_states[robot_id].pose
            viz_observation = MosOOObservation({})
            if isinstance(real_action, LookAction) or isinstance(real_action, FindAction):
                viz_observation = \
                    problem.env.sensors[robot_id].observe(robot_pose,
                                                          problem.env.state)
            viz.update(robot_id, real_action, real_observation,
                       viz_observation, problem.agent.cur_belief)
            viz.on_loop()
            viz.on_render()

        # Termination check
        if set(problem.env.state.object_states[robot_id].objects_found)\
           == problem.env.target_objects:
            print("Done!")
            break
        if _find_actions_count >= len(problem.env.target_objects):
            print("FindAction limit reached.")
            break
        if _time_used > max_time:
            print("Maximum time reached.")
            break
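# Usage sketch (an assumption, not part of the original file): `problem` must be
# an already-constructed multi-object search problem exposing the `agent`/`env`
# attributes used in solve() above; its constructor is not shown in this listing.
# The function name below is hypothetical.
def run_mos_example(problem):
    # Histogram object beliefs select POUCT; Particles beliefs select POMCP.
    solve(problem,
          max_depth=10,
          discount_factor=0.99,
          planning_time=1.0,
          exploration_const=1000,
          visualize=False,  # set True to open the MosViz pygame window
          max_time=120,
          max_steps=500)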
def solve(problem,
          planner_type="pouct",
          max_depth=10,            # planning horizon
          discount_factor=0.99,
          planning_time=1.,        # amount of time (s) to plan each step
          exploration_const=1000,  # exploration constant
          visualize=True,
          max_time=120,    # maximum amount of time allowed to solve the problem
          max_steps=500):  # maximum number of planning steps the agent can take
    if planner_type == "pouct":
        planner = pomdp_py.POUCT(max_depth=max_depth,
                                 discount_factor=discount_factor,
                                 planning_time=planning_time,
                                 exploration_const=exploration_const,
                                 rollout_policy=problem.agent.policy_model)
    else:
        planner = pomdp_py.POMCP(max_depth=max_depth,
                                 discount_factor=discount_factor,
                                 planning_time=planning_time,
                                 exploration_const=exploration_const,
                                 rollout_policy=problem.agent.policy_model)

    if visualize:
        viz = TagViz(problem.env, controllable=False)
        if viz.on_init() == False:
            raise Exception("Environment failed to initialize")
        viz.update(None, None, problem.agent.cur_belief)
        viz.on_render()

    _discount = 1.0
    _time_used = 0
    _find_actions_count = 0
    _total_reward = 0  # total, undiscounted reward
    _total_discounted_reward = 0
    for i in range(max_steps):
        # Plan action
        _start = time.time()
        real_action = planner.plan(problem.agent)
        _time_used += time.time() - _start
        if _time_used > max_time:
            break  # no more time to update.

        # Execute action
        reward = problem.env.state_transition(real_action, execute=True)

        # Receive observation
        _start = time.time()
        real_observation = \
            problem.env.provide_observation(problem.agent.observation_model,
                                            real_action)

        # Updates
        problem.agent.clear_history()  # truncate history
        problem.agent.update_history(real_action, real_observation)
        planner.update(problem.agent, real_action, real_observation)
        if planner_type == "pouct":
            belief_update(problem.agent, real_action, real_observation)
        _time_used += time.time() - _start

        # Info and render
        _total_reward += reward
        _total_discounted_reward += reward * _discount
        _discount = _discount * discount_factor
        print("==== Step %d ====" % (i + 1))
        print("Action: %s" % str(real_action))
        print("Observation: %s" % str(real_observation))
        print("Reward: %s" % str(reward))
        print("Reward (Cumulative): %s" % str(_total_reward))
        print("Reward (Discounted): %s" % str(_total_discounted_reward))
        print("Find Actions Count: %d" % _find_actions_count)
        if isinstance(planner, pomdp_py.POUCT):
            print("__num_sims__: %d" % planner.last_num_sims)

        if visualize:
            viz.update(real_action, real_observation, problem.agent.cur_belief)
            viz.on_loop()
            viz.on_render()

        # Termination check
        if problem.env.state.target_found:
            print("Done!")
            break
        if _time_used > max_time:
            print("Maximum time reached.")
            break
        if _discount * 10 < 1e-4:
            print("Discount factor already too small")
            break
    return _total_discounted_reward
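# Usage sketch (an assumption, not part of the original file): `problem` is an
# already-constructed Tag problem exposing the `agent`/`env` attributes used
# above; solve() returns the total discounted reward accumulated over the
# episode. The function name below is hypothetical.
def run_tag_example(problem):
    total_discounted_reward = solve(problem,
                                    planner_type="pouct",  # or "pomcp"
                                    max_depth=10,
                                    discount_factor=0.99,
                                    planning_time=1.0,
                                    exploration_const=1000,
                                    visualize=False,
                                    max_time=120,
                                    max_steps=500)
    print("Total discounted reward: %.3f" % total_discounted_reward)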
def main():
    ## Setting 1:
    ## The values are set according to the paper.
    setting1 = {
        "obs_probs": {  # next_state -> action -> observation
            "tiger-left": {
                "open-left":  {"tiger-left": 0.5, "tiger-right": 0.5},
                "open-right": {"tiger-left": 0.5, "tiger-right": 0.5},
                "listen":     {"tiger-left": 0.85, "tiger-right": 0.15}
            },
            "tiger-right": {
                "open-left":  {"tiger-left": 0.5, "tiger-right": 0.5},
                "open-right": {"tiger-left": 0.5, "tiger-right": 0.5},
                "listen":     {"tiger-left": 0.15, "tiger-right": 0.85}
            }
        },
        "trans_probs": {  # state -> action -> next_state
            "tiger-left": {
                "open-left":  {"tiger-left": 0.5, "tiger-right": 0.5},
                "open-right": {"tiger-left": 0.5, "tiger-right": 0.5},
                "listen":     {"tiger-left": 1.0, "tiger-right": 0.0}
            },
            "tiger-right": {
                "open-left":  {"tiger-left": 0.5, "tiger-right": 0.5},
                "open-right": {"tiger-left": 0.5, "tiger-right": 0.5},
                "listen":     {"tiger-left": 0.0, "tiger-right": 1.0}
            }
        }
    }

    ## Setting 2:
    ## Based on my understanding of T and O; treat the state as given.
    setting2 = {
        "obs_probs": {  # next_state -> action -> observation
            "tiger-left": {
                "open-left":  {"tiger-left": 1.0, "tiger-right": 0.0},
                "open-right": {"tiger-left": 1.0, "tiger-right": 0.0},
                "listen":     {"tiger-left": 0.85, "tiger-right": 0.15}
            },
            "tiger-right": {
                "open-left":  {"tiger-left": 0.0, "tiger-right": 1.0},
                "open-right": {"tiger-left": 0.0, "tiger-right": 1.0},
                "listen":     {"tiger-left": 0.15, "tiger-right": 0.85}
            }
        },
        "trans_probs": {  # state -> action -> next_state
            "tiger-left": {
                "open-left":  {"tiger-left": 1.0, "tiger-right": 0.0},
                "open-right": {"tiger-left": 1.0, "tiger-right": 0.0},
                "listen":     {"tiger-left": 1.0, "tiger-right": 0.0}
            },
            "tiger-right": {
                "open-left":  {"tiger-left": 0.0, "tiger-right": 1.0},
                "open-right": {"tiger-left": 0.0, "tiger-right": 1.0},
                "listen":     {"tiger-left": 0.0, "tiger-right": 1.0}
            }
        }
    }

    # Setting 1 resets the problem after the agent chooses open-left or
    # open-right. This is reasonable: when the agent opens one door and sees
    # a tiger, it gets killed; when it sees a treasure, it takes some of the
    # treasure and leaves. Thus the agent has no incentive to close the door
    # and do this again. Setting 2 is the case where the agent is a robot and
    # only cares about getting a higher reward. (A worked belief-update sketch
    # for the "listen" observation probabilities follows main() below.)
    setting = build_setting(setting1)
    init_true_state = random.choice(list(TigerProblem.STATES))
    init_belief = pomdp_py.Histogram({State("tiger-left"): 0.5,
                                      State("tiger-right"): 0.5})
    tiger_problem = TigerProblem(setting['obs_probs'],
                                 setting['trans_probs'],
                                 init_true_state, init_belief)

    # Value iteration
    print("** Testing value iteration **")
    vi = pomdp_py.ValueIteration(horizon=2, discount_factor=0.99)
    test_planner(tiger_problem, vi, nsteps=3)

    # Reset agent belief
    tiger_problem.agent.set_belief(init_belief, prior=True)

    print("\n** Testing POUCT **")
    pouct = pomdp_py.POUCT(max_depth=10, discount_factor=0.95,
                           num_sims=4096, exploration_const=110,
                           rollout_policy=tiger_problem.agent.policy_model)
    test_planner(tiger_problem, pouct, nsteps=10)
    pomdp_py.visual.visualize_pouct_search_tree(tiger_problem.agent.tree,
                                                max_depth=5, anonymize=False)

    # Reset agent belief
    tiger_problem.agent.set_belief(init_belief, prior=True)
    tiger_problem.agent.tree = None

    print("** Testing POMCP **")
    tiger_problem.agent.set_belief(pomdp_py.Particles.from_histogram(init_belief,
                                                                     num_particles=100),
                                   prior=True)
    pomcp = pomdp_py.POMCP(max_depth=10, discount_factor=0.95,
                           num_sims=1000, exploration_const=110,
                           rollout_policy=tiger_problem.agent.policy_model)
    test_planner(tiger_problem, pomcp, nsteps=10)
    pomdp_py.visual.visualize_pouct_search_tree(tiger_problem.agent.tree,
                                                max_depth=5, anonymize=False)
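# Worked example (added, not in the original file): one exact belief update for
# the "listen" action under setting1, starting from the uniform prior used in
# main(). Under setting1, "listen" leaves the state unchanged, so the update
# reduces to Bayes' rule with the listen observation probabilities.
def listen_belief_update_example():
    b_left, b_right = 0.5, 0.5   # prior belief over tiger-left / tiger-right
    o_left_given_left = 0.85     # P(hear "tiger-left" | tiger-left, listen)
    o_left_given_right = 0.15    # P(hear "tiger-left" | tiger-right, listen)
    # Unnormalized posterior after hearing "tiger-left"
    unnorm_left = o_left_given_left * b_left      # 0.425
    unnorm_right = o_left_given_right * b_right   # 0.075
    posterior_left = unnorm_left / (unnorm_left + unnorm_right)
    print("b'(tiger-left) = %.2f" % posterior_left)  # prints 0.85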