def test_tree_debugger_tiger(debug_tree=False):
    tiger_problem = TigerProblem.create("tiger-left", 0.5, 0.15)
    pouct = pomdp_py.POUCT(max_depth=4, discount_factor=0.95,
                           num_sims=4096, exploration_const=200,
                           rollout_policy=tiger_problem.agent.policy_model)
    pouct.plan(tiger_problem.agent)

    dd = TreeDebugger(tiger_problem.agent.tree)

    # The number of VNodes equals the sum of VNodes per layer
    assert dd.nv == sum([len(dd.l(i)) for i in range(dd.nl)])
    # The total number of nodes equals the number of VNodes plus QNodes
    assert dd.nn == dd.nv + dd.nq

    # Test example usage
    dd.mark(dd.path(dd.layer(2)[0]))
    print("Printing tree up to depth 1")
    dd.p(1)

    # There exists a path from the root to nodes in the tree
    for i in range(dd.nl):
        n = dd.l(i)[0]
        path = dd.path_to(n)
        assert path is not None

    test_planner(tiger_problem, pouct, nsteps=3, debug_tree=debug_tree)
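# Minimal runner sketch (an assumption, not part of the original test file):
# exercises the TreeDebugger checks above directly from the command line.
if __name__ == "__main__":
    test_tree_debugger_tiger(debug_tree=False)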
def main():
    init_true_state = random.choice([State("tiger-left"),
                                     State("tiger-right")])
    init_belief = pomdp_py.Histogram({State("tiger-left"): 0.5,
                                      State("tiger-right"): 0.5})
    tiger_problem = TigerProblem(0.15,  # observation noise
                                 init_true_state, init_belief)

    print("** Testing value iteration **")
    vi = pomdp_py.ValueIteration(horizon=3, discount_factor=0.95)
    test_planner(tiger_problem, vi, nsteps=3)

    # Reset agent belief
    tiger_problem.agent.set_belief(init_belief, prior=True)

    print("\n** Testing POUCT **")
    pouct = pomdp_py.POUCT(max_depth=3, discount_factor=0.95,
                           num_sims=4096, exploration_const=200,
                           rollout_policy=tiger_problem.agent.policy_model)
    test_planner(tiger_problem, pouct, nsteps=10)
    pomdp_py.visual.visualize_pouct_search_tree(tiger_problem.agent.tree,
                                                max_depth=5, anonymize=False)

    # Reset agent belief
    tiger_problem.agent.set_belief(init_belief, prior=True)
    tiger_problem.agent.tree = None

    print("** Testing POMCP **")
    tiger_problem.agent.set_belief(pomdp_py.Particles.from_histogram(init_belief,
                                                                     num_particles=100),
                                   prior=True)
    pomcp = pomdp_py.POMCP(max_depth=3, discount_factor=0.95,
                           num_sims=1000, exploration_const=200,
                           rollout_policy=tiger_problem.agent.policy_model)
    test_planner(tiger_problem, pomcp, nsteps=10)
    pomdp_py.visual.visualize_pouct_search_tree(tiger_problem.agent.tree,
                                                max_depth=5, anonymize=False)
def main():
    init_true_state = random.choice([TigerState("tiger-left"),
                                     TigerState("tiger-right")])
    init_belief = pomdp_py.Histogram({TigerState("tiger-left"): 0.5,
                                      TigerState("tiger-right"): 0.5})
    tiger_problem = TigerProblem(0.15,  # observation noise
                                 init_true_state, init_belief)

    print("** Testing value iteration **")
    vi = pomdp_py.ValueIteration(horizon=3, discount_factor=0.95)
    test_planner(tiger_problem, vi, nsteps=3)

    # Reset agent belief
    tiger_problem.agent.set_belief(init_belief, prior=True)

    print("\n** Testing POUCT **")
    pouct = pomdp_py.POUCT(max_depth=3, discount_factor=0.95,
                           num_sims=4096, exploration_const=50,
                           rollout_policy=tiger_problem.agent.policy_model,
                           show_progress=True)
    test_planner(tiger_problem, pouct, nsteps=10)
    TreeDebugger(tiger_problem.agent.tree).pp

    # Reset agent belief
    tiger_problem.agent.set_belief(init_belief, prior=True)
    tiger_problem.agent.tree = None

    print("** Testing POMCP **")
    tiger_problem.agent.set_belief(pomdp_py.Particles.from_histogram(init_belief,
                                                                     num_particles=100),
                                   prior=True)
    pomcp = pomdp_py.POMCP(max_depth=3, discount_factor=0.95,
                           num_sims=1000, exploration_const=50,
                           rollout_policy=tiger_problem.agent.policy_model,
                           show_progress=True, pbar_update_interval=500)
    test_planner(tiger_problem, pomcp, nsteps=10)
    TreeDebugger(tiger_problem.agent.tree).pp
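# Minimal runner sketch (an assumption, not shown in the original listing):
if __name__ == "__main__":
    main()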
def solve(problem,
          max_depth=10,            # planning horizon
          discount_factor=0.99,
          planning_time=1.,        # amount of time (s) to plan each step
          exploration_const=1000,  # exploration constant
          visualize=True,
          max_time=120,    # maximum amount of time allowed to solve the problem
          max_steps=500):  # maximum number of planning steps the agent can take
    """
    This function terminates when:
    - the maximum time (max_time) is reached; this time includes planning and updates
    - the agent has planned `max_steps` number of steps
    - the agent has taken n FindAction(s), where n = number of target objects.

    Args:
        visualize (bool): if True, show the pygame visualization.
    """
    random_objid = random.sample(problem.env.target_objects, 1)[0]
    random_object_belief = problem.agent.belief.object_beliefs[random_objid]
    if isinstance(random_object_belief, pomdp_py.Histogram):
        # Use POUCT
        planner = pomdp_py.POUCT(max_depth=max_depth,
                                 discount_factor=discount_factor,
                                 planning_time=planning_time,
                                 exploration_const=exploration_const,
                                 rollout_policy=problem.agent.policy_model)  # Random by default
    elif isinstance(random_object_belief, pomdp_py.Particles):
        # Use POMCP
        planner = pomdp_py.POMCP(max_depth=max_depth,
                                 discount_factor=discount_factor,
                                 planning_time=planning_time,
                                 exploration_const=exploration_const,
                                 rollout_policy=problem.agent.policy_model)  # Random by default
    else:
        raise ValueError("Unsupported object belief type %s"
                         % str(type(random_object_belief)))
    robot_id = problem.agent.robot_id

    if visualize:
        # controllable=False means no keyboard control.
        viz = MosViz(problem.env, controllable=False)
        if viz.on_init() == False:
            raise Exception("Environment failed to initialize")
        viz.update(robot_id, None, None, None, problem.agent.cur_belief)
        viz.on_render()

    _time_used = 0
    _find_actions_count = 0
    _total_reward = 0  # total, undiscounted reward
    for i in range(max_steps):
        # Plan action
        _start = time.time()
        real_action = planner.plan(problem.agent)
        _time_used += time.time() - _start
        if _time_used > max_time:
            break  # no more time to update.

        # Execute action
        reward = problem.env.state_transition(real_action, execute=True,
                                              robot_id=robot_id)

        # Receive observation
        _start = time.time()
        real_observation = \
            problem.env.provide_observation(problem.agent.observation_model,
                                            real_action)

        # Updates
        problem.agent.clear_history()  # truncate history
        problem.agent.update_history(real_action, real_observation)
        belief_update(problem.agent, real_action, real_observation,
                      problem.env.state.object_states[robot_id],
                      planner)
        _time_used += time.time() - _start

        # Info and render
        _total_reward += reward
        if isinstance(real_action, FindAction):
            _find_actions_count += 1
        print("==== Step %d ====" % (i + 1))
        print("Action: %s" % str(real_action))
        print("Observation: %s" % str(real_observation))
        print("Reward: %s" % str(reward))
        print("Reward (Cumulative): %s" % str(_total_reward))
        print("Find Actions Count: %d" % _find_actions_count)
        if isinstance(planner, pomdp_py.POUCT):
            print("__num_sims__: %d" % planner.last_num_sims)

        if visualize:
            # This is used to show the sensing range; not sampled
            # according to the observation model.
            robot_pose = problem.env.state.object_states[robot_id].pose
            viz_observation = MosOOObservation({})
            if isinstance(real_action, LookAction) or isinstance(real_action, FindAction):
                viz_observation = \
                    problem.env.sensors[robot_id].observe(robot_pose,
                                                          problem.env.state)
            viz.update(robot_id, real_action, real_observation,
                       viz_observation, problem.agent.cur_belief)
            viz.on_loop()
            viz.on_render()

        # Termination check
        if set(problem.env.state.object_states[robot_id].objects_found)\
           == problem.env.target_objects:
            print("Done!")
            break
        if _find_actions_count >= len(problem.env.target_objects):
            print("FindAction limit reached.")
            break
        if _time_used > max_time:
            print("Maximum time reached.")
            break
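# Usage sketch (an assumption, not part of the original file): `problem` must be
# an already-constructed multi-object search problem exposing the `agent`/`env`
# attributes used in solve() above; its constructor is not shown in this listing.
# The function name below is hypothetical.
def run_mos_example(problem):
    # Histogram object beliefs select POUCT; Particles beliefs select POMCP.
    solve(problem,
          max_depth=10,
          discount_factor=0.99,
          planning_time=1.0,
          exploration_const=1000,
          visualize=False,  # set True to open the MosViz pygame window
          max_time=120,
          max_steps=500)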
def solve(problem,
          planner_type="pouct",
          max_depth=10,            # planning horizon
          discount_factor=0.99,
          planning_time=1.,        # amount of time (s) to plan each step
          exploration_const=1000,  # exploration constant
          visualize=True,
          max_time=120,    # maximum amount of time allowed to solve the problem
          max_steps=500):  # maximum number of planning steps the agent can take
    if planner_type == "pouct":
        planner = pomdp_py.POUCT(max_depth=max_depth,
                                 discount_factor=discount_factor,
                                 planning_time=planning_time,
                                 exploration_const=exploration_const,
                                 rollout_policy=problem.agent.policy_model)
    else:
        planner = pomdp_py.POMCP(max_depth=max_depth,
                                 discount_factor=discount_factor,
                                 planning_time=planning_time,
                                 exploration_const=exploration_const,
                                 rollout_policy=problem.agent.policy_model)

    if visualize:
        viz = TagViz(problem.env, controllable=False)
        if viz.on_init() == False:
            raise Exception("Environment failed to initialize")
        viz.update(None, None, problem.agent.cur_belief)
        viz.on_render()

    _discount = 1.0
    _time_used = 0
    _find_actions_count = 0
    _total_reward = 0  # total, undiscounted reward
    _total_discounted_reward = 0
    for i in range(max_steps):
        # Plan action
        _start = time.time()
        real_action = planner.plan(problem.agent)
        _time_used += time.time() - _start
        if _time_used > max_time:
            break  # no more time to update.

        # Execute action
        reward = problem.env.state_transition(real_action, execute=True)

        # Receive observation
        _start = time.time()
        real_observation = \
            problem.env.provide_observation(problem.agent.observation_model,
                                            real_action)

        # Updates
        problem.agent.clear_history()  # truncate history
        problem.agent.update_history(real_action, real_observation)
        planner.update(problem.agent, real_action, real_observation)
        if planner_type == "pouct":
            belief_update(problem.agent, real_action, real_observation)
        _time_used += time.time() - _start

        # Info and render
        _total_reward += reward
        _total_discounted_reward += reward * _discount
        _discount = _discount * discount_factor
        print("==== Step %d ====" % (i + 1))
        print("Action: %s" % str(real_action))
        print("Observation: %s" % str(real_observation))
        print("Reward: %s" % str(reward))
        print("Reward (Cumulative): %s" % str(_total_reward))
        print("Reward (Discounted): %s" % str(_total_discounted_reward))
        print("Find Actions Count: %d" % _find_actions_count)
        if isinstance(planner, pomdp_py.POUCT):
            print("__num_sims__: %d" % planner.last_num_sims)

        if visualize:
            viz.update(real_action, real_observation, problem.agent.cur_belief)
            viz.on_loop()
            viz.on_render()

        # Termination check
        if problem.env.state.target_found:
            print("Done!")
            break
        if _time_used > max_time:
            print("Maximum time reached.")
            break
        if _discount * 10 < 1e-4:
            print("Discount factor already too small")
            break
    return _total_discounted_reward
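# Usage sketch (an assumption, not part of the original file): `problem` is an
# already-constructed Tag problem exposing the `agent`/`env` attributes used
# above; solve() returns the total discounted reward accumulated over the
# episode. The function name below is hypothetical.
def run_tag_example(problem):
    total_discounted_reward = solve(problem,
                                    planner_type="pouct",  # or "pomcp"
                                    max_depth=10,
                                    discount_factor=0.99,
                                    planning_time=1.0,
                                    exploration_const=1000,
                                    visualize=False,
                                    max_time=120,
                                    max_steps=500)
    print("Total discounted reward: %.3f" % total_discounted_reward)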
def main():
    ## Setting 1:
    ## The values are set according to the paper.
    setting1 = {
        "obs_probs": {  # next_state -> action -> observation
            "tiger-left": {
                "open-left":  {"tiger-left": 0.5, "tiger-right": 0.5},
                "open-right": {"tiger-left": 0.5, "tiger-right": 0.5},
                "listen":     {"tiger-left": 0.85, "tiger-right": 0.15}
            },
            "tiger-right": {
                "open-left":  {"tiger-left": 0.5, "tiger-right": 0.5},
                "open-right": {"tiger-left": 0.5, "tiger-right": 0.5},
                "listen":     {"tiger-left": 0.15, "tiger-right": 0.85}
            }
        },
        "trans_probs": {  # state -> action -> next_state
            "tiger-left": {
                "open-left":  {"tiger-left": 0.5, "tiger-right": 0.5},
                "open-right": {"tiger-left": 0.5, "tiger-right": 0.5},
                "listen":     {"tiger-left": 1.0, "tiger-right": 0.0}
            },
            "tiger-right": {
                "open-left":  {"tiger-left": 0.5, "tiger-right": 0.5},
                "open-right": {"tiger-left": 0.5, "tiger-right": 0.5},
                "listen":     {"tiger-left": 0.0, "tiger-right": 1.0}
            }
        }
    }

    ## Setting 2:
    ## Based on my understanding of T and O; treat the state as given.
    setting2 = {
        "obs_probs": {  # next_state -> action -> observation
            "tiger-left": {
                "open-left":  {"tiger-left": 1.0, "tiger-right": 0.0},
                "open-right": {"tiger-left": 1.0, "tiger-right": 0.0},
                "listen":     {"tiger-left": 0.85, "tiger-right": 0.15}
            },
            "tiger-right": {
                "open-left":  {"tiger-left": 0.0, "tiger-right": 1.0},
                "open-right": {"tiger-left": 0.0, "tiger-right": 1.0},
                "listen":     {"tiger-left": 0.15, "tiger-right": 0.85}
            }
        },
        "trans_probs": {  # state -> action -> next_state
            "tiger-left": {
                "open-left":  {"tiger-left": 1.0, "tiger-right": 0.0},
                "open-right": {"tiger-left": 1.0, "tiger-right": 0.0},
                "listen":     {"tiger-left": 1.0, "tiger-right": 0.0}
            },
            "tiger-right": {
                "open-left":  {"tiger-left": 0.0, "tiger-right": 1.0},
                "open-right": {"tiger-left": 0.0, "tiger-right": 1.0},
                "listen":     {"tiger-left": 0.0, "tiger-right": 1.0}
            }
        }
    }

    # Setting 1 resets the problem after the agent chooses open-left or
    # open-right. This is reasonable: when the agent opens one door and sees
    # a tiger, it gets killed; when it sees a treasure, it takes some of the
    # treasure and leaves. Thus the agent has no incentive to close the door
    # and do this again. Setting 2 is the case where the agent is a robot and
    # only cares about getting a higher reward. (A worked belief-update sketch
    # for the "listen" observation probabilities follows main() below.)
    setting = build_setting(setting1)
    init_true_state = random.choice(list(TigerProblem.STATES))
    init_belief = pomdp_py.Histogram({State("tiger-left"): 0.5,
                                      State("tiger-right"): 0.5})
    tiger_problem = TigerProblem(setting['obs_probs'],
                                 setting['trans_probs'],
                                 init_true_state, init_belief)

    # Value iteration
    print("** Testing value iteration **")
    vi = pomdp_py.ValueIteration(horizon=2, discount_factor=0.99)
    test_planner(tiger_problem, vi, nsteps=3)

    # Reset agent belief
    tiger_problem.agent.set_belief(init_belief, prior=True)

    print("\n** Testing POUCT **")
    pouct = pomdp_py.POUCT(max_depth=10, discount_factor=0.95,
                           num_sims=4096, exploration_const=110,
                           rollout_policy=tiger_problem.agent.policy_model)
    test_planner(tiger_problem, pouct, nsteps=10)
    pomdp_py.visual.visualize_pouct_search_tree(tiger_problem.agent.tree,
                                                max_depth=5, anonymize=False)

    # Reset agent belief
    tiger_problem.agent.set_belief(init_belief, prior=True)
    tiger_problem.agent.tree = None

    print("** Testing POMCP **")
    tiger_problem.agent.set_belief(pomdp_py.Particles.from_histogram(init_belief,
                                                                     num_particles=100),
                                   prior=True)
    pomcp = pomdp_py.POMCP(max_depth=10, discount_factor=0.95,
                           num_sims=1000, exploration_const=110,
                           rollout_policy=tiger_problem.agent.policy_model)
    test_planner(tiger_problem, pomcp, nsteps=10)
    pomdp_py.visual.visualize_pouct_search_tree(tiger_problem.agent.tree,
                                                max_depth=5, anonymize=False)
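# Worked example (added, not in the original file): one exact belief update for
# the "listen" action under setting1, starting from the uniform prior used in
# main(). Under setting1, "listen" leaves the state unchanged, so the update
# reduces to Bayes' rule with the listen observation probabilities.
def listen_belief_update_example():
    b_left, b_right = 0.5, 0.5   # prior belief over tiger-left / tiger-right
    o_left_given_left = 0.85     # P(hear "tiger-left" | tiger-left, listen)
    o_left_given_right = 0.15    # P(hear "tiger-left" | tiger-right, listen)
    # Unnormalized posterior after hearing "tiger-left"
    unnorm_left = o_left_given_left * b_left      # 0.425
    unnorm_right = o_left_given_right * b_right   # 0.075
    posterior_left = unnorm_left / (unnorm_left + unnorm_right)
    print("b'(tiger-left) = %.2f" % posterior_left)  # prints 0.85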