def train(self, n_steps=1e5, horizon=np.inf, eval_every=100, eval_params=None):
    """
    Train the agent. Returns estimated value function and training info.

    If horizon = np.inf, run for n_steps.
    If horizon = H, number of episodes = n_steps/H.

    :param n_steps:
    :param horizon:
    :param eval_every: interval for evaluating the agent
    :param eval_params: dictionary containing parameters to send to Agent.eval()
    :return:
    """
    training_info = {}
    training_info['rewards_list'] = []
    training_info['x_data'] = []
    training_info['n_episodes'] = 0
    training_info['episode_total_reward'] = []
    episode_reward = 0

    while self.t < n_steps:
        done, reward = self.step()
        episode_reward += reward

        if done or ((self.t + 1) % horizon == 0):
            self.state = self.env.reset()
            training_info['n_episodes'] += 1
            training_info['episode_total_reward'].append(episode_reward)
            if self.verbose > 0 and (training_info['n_episodes'] % 500 == 0):
                print("Episode %d, total reward = %0.2f" %
                      (training_info['n_episodes'], episode_reward))
            episode_reward = 0

        if self.verbose > 0:
            if (self.t + 1) % 1000 == 0:
                print("Q-learning iteration %d out of %d" % (self.t + 1, n_steps))

        self.t += 1

        if self.t % eval_every == 0:
            self.policy = FinitePolicy.from_q_function(self.Q, self.env)
            if eval_params is None:
                rewards = self.eval()
            else:
                rewards = self.eval(**eval_params)
            training_info['rewards_list'].append(rewards)
            training_info['x_data'].append(self.t)

    self.policy = FinitePolicy.from_q_function(self.Q, self.env)
    V = np.zeros(self.env.observation_space.n)
    for s in range(self.env.observation_space.n):
        V[s] = self.Q[s, self.env.available_actions(s)].max()
    training_info['V'] = V
    return training_info
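# --- Hedged sketch (not from the repo): what this agent's step() might do. ---
# train() above relies on self.step() returning (done, reward), but step() is
# not shown in this excerpt. The sketch below illustrates a standard
# epsilon-greedy tabular Q-learning update using only attributes defined in
# __init__ (Q, Nsa, epsilon, learning_rate, ...); it assumes a gym-style
# env.step() returning (next_state, reward, done, info) and a module-level
# `import numpy as np`. The repo's actual implementation may differ.
def step(self):
    s = self.state
    actions = self.env.available_actions(s)
    # Epsilon-greedy action selection
    if self.RS.uniform() < self.epsilon:
        a = self.RS.choice(actions)
    else:
        a = actions[np.argmax(self.Q[s, actions])]
    next_s, reward, done, _ = self.env.step(a)
    self.Nsa[s, a] += 1
    # Learning rate: fixed if provided, otherwise 1/N(s,a) with a floor
    if self.learning_rate is None:
        lr = max(1.0 / self.Nsa[s, a], self.min_learning_rate)
    else:
        lr = self.learning_rate
    # Q-learning update restricted to actions available in the next state
    target = reward + self.gamma * self.Q[next_s, self.env.available_actions(next_s)].max()
    self.Q[s, a] += lr * (target - self.Q[s, a])
    self.state = next_s
    self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
    return done, reward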
def __init__(self, env, rmax=1.0, delta=0.1):
    super().__init__()
    self.id = 'UCRL2'
    self.env = deepcopy(env)
    self.rmax = rmax
    self.delta = delta

    # Constants
    self.Ns = self.env.observation_space.n
    self.Na = self.env.action_space.n

    # Initialize policy
    self.policy = FinitePolicy.uniform(self.Ns, self.Na)

    # Arrays
    self.N_sas = np.zeros((self.Ns, self.Na, self.Ns))  # N_sas[s, a, s'] = number of visits to (s, a, s')
    self.N_sa = np.zeros((self.Ns, self.Na))
    self.S_sa = np.zeros((self.Ns, self.Na))  # S_sa[s, a] = sum of rewards obtained in (s, a)

    # Initialize state
    self.state = self.env.reset()

    # Time counter
    self.t = 0
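# --- Hedged sketch (not from the repo): empirical model estimates. ---
# run() below consumes R_hat and P_hat returned by self.step(). A natural way
# to build them from the counters initialized above is shown here; the helper
# name _empirical_estimates is hypothetical and the repo's actual code may
# differ. Assumes a module-level `import numpy as np`.
def _empirical_estimates(self):
    n = np.maximum(self.N_sa, 1)                  # avoid division by zero
    R_hat = self.S_sa / n                         # empirical mean reward per (s, a)
    P_hat = self.N_sas / n[:, :, np.newaxis]      # empirical transition probabilities
    P_hat[self.N_sa == 0] = 1.0 / self.Ns         # uniform estimate for unvisited pairs
    return R_hat, P_hat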
def __init__(self, env, gamma=0.95, learning_rate=None, min_learning_rate=0.05,
             epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01, rmax=1.0,
             verbose=1, seed_val=42):
    super().__init__()
    # avoid changing the state of the original env
    env = deepcopy(env)
    self.id = 'QLearningAgent'
    self.env = env
    self.gamma = gamma
    self.learning_rate = learning_rate
    self.min_learning_rate = min_learning_rate
    self.epsilon = epsilon
    self.epsilon_decay = epsilon_decay
    self.epsilon_min = epsilon_min
    self.t = 0
    self.state = self.env.reset()
    self.verbose = verbose
    self.seed_val = seed_val
    self.RS = np.random.RandomState(seed_val)

    Ns = self.env.observation_space.n
    Na = self.env.action_space.n
    self.Q = np.ones((Ns, Na)) * rmax / (1 - gamma)
    self.Nsa = np.zeros((Ns, Na))
    self.policy = FinitePolicy.from_q_function(self.Q, self.env)
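# --- Hedged usage sketch (not from the repo). ---
# Trains the Q-learning agent on a small environment and compares its value
# estimates against dynamic programming. ToyEnv1 and the chosen parameters are
# illustrative; `import numpy as np` and the classes above are assumed to be
# importable as in the tests.
env = ToyEnv1(seed=42)
ql_agent = QLearningAgent(env, gamma=0.95, epsilon=1.0, epsilon_decay=0.995)
training_info = ql_agent.train(n_steps=20000, eval_every=500)

dp_agent = DynProgAgent(env, gamma=0.95, method='value-iteration')
V_dp, _ = dp_agent.train()
print("max |V_QL - V_DP| =", np.abs(training_info['V'] - V_dp).max())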
def policy_iteration_step(self, policy):
    # Policy evaluation
    V = policy.evaluate(self.env, self.gamma)
    # Policy improvement
    new_policy = FinitePolicy.from_v_function(V, self.env, self.gamma)
    return new_policy
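# --- Hedged driver sketch (the same loop appears in DynProgAgent.train below). ---
# Assumes an existing DynProgAgent `dp_agent` and an initial FinitePolicy
# `policy`; repeats evaluation/improvement until the policy stops changing.
new_policy = dp_agent.policy_iteration_step(policy)
while new_policy != policy:
    policy = new_policy
    new_policy = dp_agent.policy_iteration_step(policy)
# `policy` is now greedy with respect to its own value function.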
def run(self, T):
    nu = np.zeros((self.Ns, self.Na))
    state = self.state
    action = self.policy.sample(state)

    while self.t <= T:
        print("new episode!, t=", self.t)
        while nu[state, action] < max(1, self.N_sa[state, action]):
            state, action, R_hat, P_hat = self.step()
            nu[state, action] += 1
            if self.t > T:
                break

        # New episode
        tol = self.rmax / np.sqrt(self.t)
        u, q = self._extended_value_iteration(R_hat, P_hat, tol)
        self.policy = FinitePolicy.from_q_function(q, self.env)
        nu = np.zeros((self.Ns, self.Na))
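# --- Hedged sketch (not from the repo): UCRL2 confidence radii. ---
# _extended_value_iteration is not shown in this excerpt. In UCRL2
# (Jaksch et al., 2010) it relies on confidence radii of the following form;
# the helper name _confidence_radii is hypothetical and the constants may
# differ from the repo's implementation. Assumes `import numpy as np`.
def _confidence_radii(self):
    t = max(self.t, 1)
    n = np.maximum(self.N_sa, 1)
    # Radius for the empirical mean rewards
    beta_r = np.sqrt(7.0 * np.log(2.0 * self.Ns * self.Na * t / self.delta) / (2.0 * n))
    # Radius (in L1 norm) for the empirical transition probabilities
    beta_p = np.sqrt(14.0 * self.Ns * np.log(2.0 * self.Na * t / self.delta) / n)
    return beta_r, beta_p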
def test_bellman_operator_monotonicity_and_contraction(gamma, seed):
    env = ToyEnv1(seed)
    V0 = np.array([1.0, 100.0, 1000.0])
    V1 = np.array([2.0, 120.0, 1200.0])
    policy_array = np.array([[0.2, 0.8],
                             [0.5, 0.5],
                             [0.9, 0.1]])  # each row is a probability distribution over actions
    policy = FinitePolicy(policy_array, seed)
    dp_agent = DynProgAgent(env, gamma=gamma)

    TV0, _ = dp_agent.bellman_opt_operator(V0)
    TV1, _ = dp_agent.bellman_opt_operator(V1)
    TpiV0 = dp_agent.bellman_operator(V0, policy)
    TpiV1 = dp_agent.bellman_operator(V1, policy)

    # Test monotonicity: V0 <= V1 componentwise implies T V0 <= T V1
    assert np.greater(TV0, TV1).sum() == 0
    assert np.greater(TpiV0, TpiV1).sum() == 0

    # Test contraction: ||T V1 - T V0||_inf <= gamma * ||V1 - V0||_inf
    norm_tv = np.abs(TV1 - TV0).max()
    norm_v = np.abs(V1 - V0).max()
    assert norm_tv <= gamma * norm_v
def train(self, V_init=None, val_it_tol=1e-8, val_it_max_it=1e4, pi_init=None, pol_it_max_it=1e3):
    """
    Train the agent. Returns estimated value function and training info.

    :param V_init:
    :param val_it_tol:
    :param val_it_max_it:
    :param pi_init:
    :param pol_it_max_it:
    :return V, training_info:
    """
    training_info = {}

    if self.method == 'value-iteration':
        if V_init is None:
            V = np.zeros(self.env.observation_space.n)
        else:
            V = V_init
        it = 1
        while True:
            TV, Q, err = self.value_iteration_step(V)
            if it > val_it_max_it:
                warnings.warn("Value iteration: Maximum number of iterations exceeded.")
            if err < val_it_tol or it > val_it_max_it:
                self.Q = Q
                self.V = TV
                self.policy = FinitePolicy.from_q_function(Q, self.env)
                return TV, training_info
            V = TV
            it += 1

    elif self.method == 'policy-iteration':
        Na = self.env.action_space.n
        Ns = self.env.observation_space.n
        if pi_init is None:
            action_array = np.array([self.env.available_actions(s)[0] for s in self.env.states])
            policy = FinitePolicy.from_action_array(action_array, Na)
        else:
            policy = pi_init
        it = 1
        while True:
            new_policy = self.policy_iteration_step(policy)
            if it > pol_it_max_it:
                warnings.warn("Maximum number of iterations exceeded.")
            if new_policy == policy or it > pol_it_max_it:
                V = policy.evaluate(self.env, self.gamma)
                self.V = V
                _, Q = self.bellman_opt_operator(V)
                self.Q = Q
                self.policy = policy
                return V, training_info
            it += 1
            policy = new_policy
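# --- Hedged consistency sketch (not from the repo). ---
# On the same environment, the two methods implemented by train() should agree
# on the optimal value function. ToyEnv1 and the tolerances are illustrative;
# `import numpy as np` is assumed.
env = ToyEnv1(seed=1)
vi_agent = DynProgAgent(env, gamma=0.9, method='value-iteration')
pi_agent = DynProgAgent(env, gamma=0.9, method='policy-iteration')
V_vi, _ = vi_agent.train(val_it_tol=1e-10)
V_pi, _ = pi_agent.train()
assert np.allclose(V_vi, V_pi, atol=1e-6)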
ql_agent = QLearningUcbAgent(env, gamma=gamma, learning_rate=None,
                             min_learning_rate=0.1, c_expl=4.0)
training_info = ql_agent.train(n_steps=1000, eval_params={'n_sim': 10})

# # Visualize policy
# env.reset()
# env.render(mode='auto', policy=ql_agent.policy)
#
# # Visualize training curve
# ql_agent.plot_rewards(training_info['rewards_list'], training_info['x_data'], show=True)

dp_agent = DynProgAgent(env, gamma=gamma, method='policy-iteration')

# Draw history
# draw_grid_world_state_distribution(ql_agent.env)
action_freq = get_action_frequency(ql_agent.env)
policy_freq = FinitePolicy(action_freq)

# visualize_exploration(ql_agent.env, show=False)
# env.render('manual')
# plt.show()

# env_eval.reset()
env_eval.render(policy=ql_agent.policy)
env_eval.reset()
env_eval.render(policy=policy_freq)