from collections import defaultdict

import numpy as np

from simple_rl.planning import ValueIteration


class StochasticSAPolicy(object):

    def __init__(self, state_abstr, mdp):
        self.state_abstr = state_abstr
        self.mdp = mdp
        self.vi = ValueIteration(mdp)
        self.vi.run_vi()

    def policy(self, state):
        '''
        Args:
            state (simple_rl.State)

        Returns:
            (str): An action.

        Summary:
            Chooses an action among the optimal actions of the ground states
            in the cluster. That is, roughly:

                \pi(a \mid s_a) \sim Pr_{s_g \in s_a}(a = a^*(s_g))
        '''
        abstr_state = self.state_abstr.phi(state)
        ground_states = self.state_abstr.get_ground_states_in_abs_state(abstr_state)

        # Empirical distribution over the optimal actions of the cluster's ground states.
        action_distr = defaultdict(float)
        for s in ground_states:
            a = self.vi.policy(s)
            action_distr[a] += 1.0 / len(ground_states)

        # Sample one action from that distribution. Materialize the dict views
        # as lists so this works under Python 3 (views are not subscriptable
        # and np.random.multinomial does not accept them directly).
        actions = list(action_distr.keys())
        probs = list(action_distr.values())
        sampled_distr = np.random.multinomial(1, probs).tolist()

        return actions[sampled_distr.index(1)]
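# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of rolling out StochasticSAPolicy on a small grid world.
# `IdentityAbstraction` is a hypothetical stand-in for whatever phi the
# surrounding experiment builds: it treats every ground state as its own
# singleton cluster, so the sampled action always matches VI's optimal action.

from simple_rl.tasks import GridWorldMDP


class IdentityAbstraction(object):
    # Hypothetical abstraction exposing the two methods policy() relies on.
    def phi(self, state):
        return state

    def get_ground_states_in_abs_state(self, abstr_state):
        return [abstr_state]


if __name__ == "__main__":
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)])
    sa_policy = StochasticSAPolicy(IdentityAbstraction(), mdp)

    state = mdp.get_init_state()
    for _ in range(5):
        action = sa_policy.policy(state)  # resampled each call: the policy is stochastic
        _, state = mdp.execute_agent_action(action)
        print(action, state)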
def main():
    # Setup MDP and agent.
    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc,
                       args.l_loc, args.gamma, args.Walls, args.slip)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=args.epsilon,
                              alpha=args.alpha, explore=args.explore,
                              anneal=args.anneal)
    viz = args.mode

    if viz == "value":
        # --> Color corresponds to higher value.
        mdp.visualize_value()
    elif viz == "policy":
        # Visualize the optimal policy found by value iteration.
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        mdp.visualize_policy_values(
            (lambda state: value_iter.policy(state)),
            (lambda state: value_iter.value_func[state]))
    elif viz == "agent":
        # --> Press <spacebar> to advance the agent.
        # First let the agent solve the problem, then visualize its resulting policy.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        rand_agent = RandomAgent(actions=mdp.get_actions())
        # Run experiment and make plot.
        run_agents_on_mdp([rand_agent, ql_agent], mdp, open_plot=True,
                          episodes=60, steps=200, instances=5, success_reward=1)
        # mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # --> Press <r> to reset.
        # Show the agent's interaction with the environment as it learns.
        mdp.visualize_learning(ql_agent, delay=0.005, num_ep=500, num_steps=200)
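# --- Hedged sketch: the parse_args() helper this main() assumes ---
# The real parser is defined elsewhere in the repo; the flags below are
# inferred from the attributes main() reads (width, height, i_loc, g_loc,
# l_loc, gamma, Walls, slip, epsilon, alpha, explore, anneal, mode). The
# names match those attributes by necessity, but types and defaults are
# assumptions, not the repo's actual values.

import argparse


def _loc(arg):
    # Hypothetical helper: parse "x,y" into an (x, y) integer tuple.
    x, y = arg.split(",")
    return (int(x), int(y))


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--width", type=int, default=5)
    parser.add_argument("--height", type=int, default=3)
    parser.add_argument("--i_loc", type=_loc, default=(1, 1))
    parser.add_argument("--g_loc", type=_loc, default=(5, 3))
    parser.add_argument("--l_loc", type=_loc, nargs="*", default=[])
    parser.add_argument("--Walls", type=_loc, nargs="*", default=[])
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--slip", type=float, default=0.05)
    parser.add_argument("--epsilon", type=float, default=0.1)
    parser.add_argument("--alpha", type=float, default=0.1)
    parser.add_argument("--explore", type=str, default="uniform")
    parser.add_argument("--anneal", action="store_true")
    parser.add_argument("--mode", type=str, default="value",
                        choices=["value", "policy", "agent", "learning"])
    return parser.parse_args()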
def main(open_plot=True):
    # Setup MDP.
    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc,
                       args.l_loc, args.gamma, args.Walls, args.slip)

    if args.visualize:
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        mdp.visualize_policy_values(
            (lambda state: value_iter.policy(state)),
            (lambda state: value_iter.value_func[state]))
    else:
        custom_q = parse_custom_q_table(args.custom_q, args.default_q)

        agents = []
        for agent in args.agents:
            if agent == 'q_learning':
                agents.append(QLearningAgent(actions=mdp.get_actions()))
            elif agent == 'potential_q':
                agents.append(QLearningAgent(actions=mdp.get_actions(),
                                             custom_q_init=custom_q,
                                             name="Potential_Q"))
            elif agent == 'random':
                agents.append(RandomAgent(actions=mdp.get_actions()))
            elif agent == 'rmax':
                agents.append(RMaxAgent(mdp.get_actions()))

        # Run experiment and make plot.
        run_agents_on_mdp(agents, mdp, instances=1, episodes=100, steps=100,
                          open_plot=open_plot, verbose=True)
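# --- Hedged sketch: building a potential-style custom Q-table ---
# parse_custom_q_table presumably loads Q-values from args.custom_q; its file
# format is not shown here. As an illustration of the same idea, the helper
# below (make_potential_q, a hypothetical name) initializes Q(s, a) from a
# negative-Manhattan-distance potential to the goal, in the dict-of-dicts
# shape that simple_rl's QLearningAgent accepts via custom_q_init.

from collections import defaultdict

from simple_rl.planning import ValueIteration


def make_potential_q(mdp, goal_loc, default_q=0.0):
    # Enumerate reachable states via ValueIteration's state list
    # (get_states() computes reachability on first use).
    vi = ValueIteration(mdp)

    custom_q = defaultdict(lambda: defaultdict(lambda: default_q))
    for s in vi.get_states():
        # Grid-world states carry x/y coordinates; states closer to the goal
        # start with higher (less negative) Q-values for every action.
        potential = -(abs(s.x - goal_loc[0]) + abs(s.y - goal_loc[1]))
        for a in mdp.get_actions():
            custom_q[s][a] = float(potential)
    return custom_q

# e.g. QLearningAgent(actions=mdp.get_actions(),
#                     custom_q_init=make_potential_q(mdp, goal_loc=(5, 3)),
#                     name="Potential_Q")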