Example #1
    def train(self, n_steps=1e5, horizon=np.inf, eval_every=100, eval_params=None):
        """
        Train the agent. Returns estimated value function and training info.

        If horizon = np.inf, run for n_steps
        If horizon = H, number of episodes = n_steps/H

        :param n_steps:
        :param horizon:
        :param eval_every: interval for evaluating the agent
        :param eval_params: dictionary containing parameters to send to Agent.eval()
        :return:
        """
        training_info = {
            'rewards_list': [],
            'x_data': [],
            'n_episodes': 0,
            'episode_total_reward': []
        }

        episode_reward = 0
        while self.t < n_steps:
            done, reward = self.step()
            episode_reward += reward

            if done or ((self.t+1) % horizon == 0):
                self.state = self.env.reset()
                training_info['n_episodes'] += 1
                training_info['episode_total_reward'].append(episode_reward)
                if self.verbose > 0 and (training_info['n_episodes'] % 500 == 0):
                    print("Episode %d, total reward = %0.2f" % (training_info['n_episodes'], episode_reward))
                episode_reward = 0

            if self.verbose > 0:
                if (self.t+1) % 1000 == 0:
                    print("Q-learning iteration %d out of %d" % (self.t+1, n_steps))

            self.t += 1

            if self.t % eval_every == 0:
                self.policy = FinitePolicy.from_q_function(self.Q, self.env)
                if eval_params is None:
                    rewards = self.eval()
                else:
                    rewards = self.eval(**eval_params)
                training_info['rewards_list'].append(rewards)
                training_info['x_data'].append(self.t)

        self.policy = FinitePolicy.from_q_function(self.Q, self.env)
        V = np.zeros(self.env.observation_space.n)
        for s in range(self.env.observation_space.n):
            V[s] = self.Q[s, self.env.available_actions(s)].max()

        training_info['V'] = V
        return training_info
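
A minimal usage sketch of this training loop, assuming it belongs to the QLearningAgent shown in Example #3 and that env is some finite environment (the gamma value and step counts are placeholders; eval_params mirrors Example #8):

agent = QLearningAgent(env, gamma=0.95)
info = agent.train(n_steps=1e4, eval_every=200, eval_params={'n_sim': 10})
print("Episodes completed:", info['n_episodes'])
print("Estimated value function:", info['V'])
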
Example #2
    def __init__(self, env, rmax=1.0, delta=0.1):
        super().__init__()
        self.id = 'UCRL2'
        self.env = deepcopy(env)
        self.rmax = rmax
        self.delta = delta

        # Constants
        self.Ns = self.env.observation_space.n
        self.Na = self.env.action_space.n

        # Initialize policy
        self.policy = FinitePolicy.uniform(self.Ns, self.Na)

        # Arrays
        # N_sas[s, a, s'] = number of visits to (s, a, s')
        self.N_sas = np.zeros((self.Ns, self.Na, self.Ns))
        # N_sa[s, a] = number of visits to (s, a)
        self.N_sa = np.zeros((self.Ns, self.Na))
        # S_sa[s, a] = sum of rewards obtained in (s, a)
        self.S_sa = np.zeros((self.Ns, self.Na))

        # Initialize state
        self.state = self.env.reset()

        # Time counter
        self.t = 0
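
These counters are presumably turned into the empirical model estimates R_hat and P_hat that step() returns in Example #5. A plausible sketch of that computation under this assumption (the helper name empirical_model is hypothetical and not part of the snippets):

import numpy as np

def empirical_model(N_sas, N_sa, S_sa):
    # Hypothetical helper: empirical estimates from the visit and reward counters.
    n = np.maximum(N_sa, 1.0)              # avoid division by zero for unvisited (s, a)
    P_hat = N_sas / n[:, :, np.newaxis]    # empirical transition probabilities P_hat[s, a, s']
    R_hat = S_sa / n                       # empirical mean rewards R_hat[s, a]
    return R_hat, P_hat
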
Example #3
    def __init__(self, env, gamma=0.95, learning_rate=None, min_learning_rate=0.05, epsilon=1.0, epsilon_decay=0.995,
                 epsilon_min=0.01, rmax=1.0, verbose=1, seed_val=42):
        super().__init__()
        # avoid changing the state of original env
        env = deepcopy(env)

        self.id = 'QLearningAgent'
        self.env = env
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.min_learning_rate = min_learning_rate
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.t = 0
        self.state = self.env.reset()
        self.verbose = verbose
        self.seed_val = seed_val
        self.RS = np.random.RandomState(seed_val)

        Ns = self.env.observation_space.n
        Na = self.env.action_space.n
        self.Q = np.ones((Ns, Na))*rmax/(1-gamma)
        self.Nsa = np.zeros((Ns, Na))
        self.policy = FinitePolicy.from_q_function(self.Q, self.env)
Example #4
    def policy_iteration_step(self, policy):
        # Policy evaluation
        V = policy.evaluate(self.env, self.gamma)

        # Policy improvement
        new_policy = FinitePolicy.from_v_function(V, self.env, self.gamma)

        return new_policy
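
In practice this step is iterated until the policy stops changing, exactly as the policy-iteration branch of Example #7 does. A minimal sketch of that outer loop (dp_agent and pi_init are placeholders):

policy = pi_init
while True:
    new_policy = dp_agent.policy_iteration_step(policy)
    if new_policy == policy:
        break
    policy = new_policy
V = policy.evaluate(dp_agent.env, dp_agent.gamma)
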
Example #5
    def run(self, T):
        """Run the UCRL2 algorithm for T time steps."""
        nu = np.zeros((self.Ns, self.Na))  # visit counts within the current episode
        state = self.state
        action = self.policy.sample(state)

        while self.t <= T:
            print("New episode, t =", self.t)
            # Follow the current policy until some (s, a) pair has been visited
            # within this episode as often as in all previous episodes combined.
            while nu[state, action] < max(1, self.N_sa[state, action]):
                state, action, R_hat, P_hat = self.step()
                nu[state, action] += 1
                if self.t > T:
                    break
            # Start a new episode: recompute the optimistic policy by extended value iteration
            tol = self.rmax / np.sqrt(self.t)
            u, q = self._extended_value_iteration(R_hat, P_hat, tol)
            self.policy = FinitePolicy.from_q_function(q, self.env)
            nu = np.zeros((self.Ns, self.Na))
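
A minimal usage sketch combining this run loop with the constructor from Example #2, assuming the class is called UCRL2Agent (the snippets only show the agent id 'UCRL2'); env and the horizon T are placeholders:

agent = UCRL2Agent(env, rmax=1.0, delta=0.1)
agent.run(T=10000)
print("Learned policy:", agent.policy)
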
Example #6
def test_bellman_operator_monotonicity_and_contraction(gamma, seed):
    env = ToyEnv1(seed)
    V0 = np.array([1.0, 100.0, 1000.0])
    V1 = np.array([2.0, 120.0, 1200.0])

    policy_array = np.array([[0.2, 0.8], [0.5, 0.5], [0.9, 0.1]])  # each row is a probability distribution over actions
    policy = FinitePolicy(policy_array, seed)

    dp_agent = DynProgAgent(env, gamma=gamma)

    TV0, _ = dp_agent.bellman_opt_operator(V0)
    TV1, _ = dp_agent.bellman_opt_operator(V1)

    TpiV0 = dp_agent.bellman_operator(V0, policy)
    TpiV1 = dp_agent.bellman_operator(V1, policy)

    # Test monotonicity
    assert np.greater(TV0, TV1).sum() == 0
    assert np.greater(TpiV0, TpiV1).sum() == 0

    # Test contraction
    norm_tv = np.abs(TV1 - TV0).max()
    norm_v = np.abs(V1 - V0).max()
    assert norm_tv <= gamma * norm_v
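
In words, the test checks the two standard properties of the Bellman operators: monotonicity, i.e. V0 <= V1 componentwise implies T V0 <= T V1 (and likewise for the policy operator T^pi), and gamma-contraction in the sup norm, i.e. max|T V1 - T V0| <= gamma * max|V1 - V0|, which is the property that makes value iteration in Example #7 converge.
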
Example #7
    def train(self, V_init=None, val_it_tol=1e-8, val_it_max_it=1e4,
                    pi_init=None, pol_it_max_it=1e3):
        """
        Train the agent. Returns estimated value function and training info.
        :param V_init:
        :param val_it_tol:
        :param val_it_max_it:
        :param pi_init:
        :param pol_it_max_it:
        :return V, training_info:
        """
        training_info = {}

        if self.method == 'value-iteration':
            if V_init is None:
                V = np.zeros(self.env.observation_space.n)
            else:
                V = V_init

            it = 1
            while True:
                TV, Q, err = self.value_iteration_step(V)

                if it > val_it_max_it:
                    warnings.warn("Value iteration: Maximum number of iterations exceeded.")

                if err < val_it_tol or it > val_it_max_it:
                    self.Q = Q
                    self.V = TV
                    self.policy = FinitePolicy.from_q_function(Q, self.env)
                    return TV, training_info

                V = TV
                it += 1

        elif self.method == 'policy-iteration':
            Na = self.env.action_space.n
            Ns = self.env.observation_space.n
            if pi_init is None:
                action_array = np.array([self.env.available_actions(s)[0] for s in self.env.states])
                policy = FinitePolicy.from_action_array(action_array, Na)
            else:
                policy = pi_init

            it = 1
            while True:
                new_policy = self.policy_iteration_step(policy)

                if it > pol_it_max_it:
                    warnings.warn("Maximum number of iterations exceeded.")

                if new_policy == policy or it > pol_it_max_it:
                    V = policy.evaluate(self.env, self.gamma)
                    self.V = V
                    _, Q = self.bellman_opt_operator(V)
                    self.Q = Q
                    self.policy = policy
                    return V, training_info

                it += 1
                policy = new_policy
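
A minimal usage sketch for both branches, with the constructor call mirroring Example #8 (env and the gamma value are placeholders):

dp_agent = DynProgAgent(env, gamma=0.95, method='value-iteration')
V_vi, _ = dp_agent.train(val_it_tol=1e-8)

dp_agent = DynProgAgent(env, gamma=0.95, method='policy-iteration')
V_pi, _ = dp_agent.train()
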
Example #8
ql_agent = QLearningUcbAgent(env,
                             gamma=gamma,
                             learning_rate=None,
                             min_learning_rate=0.1,
                             c_expl=4.0)

training_info = ql_agent.train(n_steps=1000, eval_params={'n_sim': 10})

# # Visualize policy
# env.reset()
# env.render(mode='auto', policy=ql_agent.policy)
#
# # Visualize training curve
# ql_agent.plot_rewards(training_info['rewards_list'], training_info['x_data'], show=True)

dp_agent = DynProgAgent(env, gamma=gamma, method='policy-iteration')

# Draw history
# draw_grid_world_state_distribution(ql_agent.env)

action_freq = get_action_frequency(ql_agent.env)
policy_freq = FinitePolicy(action_freq)

# visualize_exploration(ql_agent.env, show=False)
# env.render('manual')
# plt.show()
#
env_eval.reset()
env_eval.render(policy=ql_agent.policy)
env_eval.reset()
env_eval.render(policy=policy_freq)