Example #1
from collections import defaultdict

import numpy as np

from simple_rl.planning import ValueIteration


class StochasticSAPolicy(object):
    def __init__(self, state_abstr, mdp):
        self.state_abstr = state_abstr
        self.mdp = mdp
        self.vi = ValueIteration(mdp)
        self.vi.run_vi()

    def policy(self, state):
        '''
        Args:
            state (simple_rl.State)

        Returns:
            (str): An action

        Summary:
            Chooses an action among the optimal actions in the cluster. That is, roughly:

                \pi(a \mid s_a) \sim Pr_{s_g \in s_a} (a = a^*(s_g))
        '''

        abstr_state = self.state_abstr.phi(state)
        ground_states = self.state_abstr.get_ground_states_in_abs_state(
            abstr_state)

        action_distr = defaultdict(float)
        for s in ground_states:
            a = self.vi.policy(s)
            action_distr[a] += 1.0 / len(ground_states)

        # Sample an action index from the empirical distribution over
        # optimal ground-state actions.
        actions = list(action_distr.keys())
        action_probs = list(action_distr.values())
        sampled_distr = np.random.multinomial(1, action_probs).tolist()

        return actions[sampled_distr.index(1)]
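A minimal usage sketch, assuming a simple_rl GridWorldMDP. The IdentityAbstraction helper below is hypothetical and exists only to provide the two methods (phi and get_ground_states_in_abs_state) that StochasticSAPolicy calls:

from simple_rl.tasks import GridWorldMDP


class IdentityAbstraction(object):
    '''Trivial abstraction: every ground state is its own abstract state.'''

    def phi(self, state):
        return state

    def get_ground_states_in_abs_state(self, abstr_state):
        return [abstr_state]


mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)])
sa_policy = StochasticSAPolicy(IdentityAbstraction(), mdp)
print(sa_policy.policy(mdp.get_init_state()))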
Example #2
from simple_rl.agents import QLearningAgent, RandomAgent
from simple_rl.planning import ValueIteration
from simple_rl.run_experiments import run_agents_on_mdp


def main():
    # parse_args() and generate_MDP() are helpers defined elsewhere in the example file.
    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc,
                       args.l_loc, args.gamma, args.Walls, args.slip)

    ql_agent = QLearningAgent(mdp.get_actions(),
                              epsilon=args.epsilon,
                              alpha=args.alpha,
                              explore=args.explore,
                              anneal=args.anneal)
    viz = args.mode

    if viz == "value":
        # --> Color corresponds to higher value.
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        mdp.visualize_policy_values(
            (lambda state: value_iter.policy(state)),
            (lambda state: value_iter.value_func[state]))
    elif viz == "agent":
        # --> Press <spacebar> to advance the agent.
        # First let the agent solve the problem and then visualize the agent's resulting policy.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        rand_agent = RandomAgent(actions=mdp.get_actions())
        run_agents_on_mdp([rand_agent, ql_agent],
                          mdp,
                          open_plot=True,
                          episodes=60,
                          steps=200,
                          instances=5,
                          success_reward=1)
        # mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # --> Press <r> to reset.
        # Show agent's interaction with the environment.
        mdp.visualize_learning(ql_agent,
                               delay=0.005,
                               num_ep=500,
                               num_steps=200)
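The function above dispatches on args.mode. The entry point is not shown in the listing; the sketch below assumes an argparse-style --mode flag and a hypothetical module name:

if __name__ == "__main__":
    # e.g. python viz_example.py --mode value
    main()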
Example #3
from simple_rl.agents import QLearningAgent, RandomAgent, RMaxAgent
from simple_rl.planning import ValueIteration
from simple_rl.run_experiments import run_agents_on_mdp


def main(open_plot=True):
    # Setup MDP.
    # parse_args(), generate_MDP() and parse_custom_q_table() are helpers
    # defined elsewhere in the example file.
    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc,
                       args.l_loc, args.gamma, args.Walls, args.slip)

    if args.visualize:
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        mdp.visualize_policy_values(
            (lambda state: value_iter.policy(state)),
            (lambda state: value_iter.value_func[state]))

    else:
        custom_q = parse_custom_q_table(args.custom_q, args.default_q)

        agents = []
        for agent in args.agents:
            if agent == 'q_learning':
                agents.append(QLearningAgent(actions=mdp.get_actions()))
            elif agent == 'potential_q':
                agents.append(
                    QLearningAgent(actions=mdp.get_actions(),
                                   custom_q_init=custom_q,
                                   name="Potential_Q"))
            elif agent == 'random':
                agents.append(RandomAgent(actions=mdp.get_actions()))
            elif agent == 'rmax':
                agents.append(RMaxAgent(mdp.get_actions()))

        # Run experiment and make plot.
        run_agents_on_mdp(agents,
                          mdp,
                          instances=1,
                          episodes=100,
                          steps=100,
                          open_plot=open_plot,
                          verbose=True)
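parse_custom_q_table is not defined in the listing. simple_rl's QLearningAgent appears to keep its Q-function as a nested state -> action -> value mapping and accepts a pre-filled one through custom_q_init, so a minimal stand-in could look like the sketch below (the helper name and arguments come from the example; everything else, including the unspecified file format, is an assumption):

from collections import defaultdict


def parse_custom_q_table(path, default_q):
    # Nested state -> action -> value table, seeded with default_q.
    q_table = defaultdict(lambda: defaultdict(lambda: default_q))
    # Populate q_table from the file at `path` here (format not shown in the example).
    return q_table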