class BotPlayer(Player):
    """Player controlled by the program.

    The action to take for a hand is decided according to a policy.
    At first, the policy is based on an MDP.
    """

    def __init__(self, policy, dealer):
        # Python-3 zero-argument super() (original used the legacy
        # two-argument form, which is equivalent but dated).
        super().__init__()
        self.policy = Policy(policy)  # wrap the raw policy passed in
        self.dealer = dealer          # kept so the dealer's cards can be read

    def choose_action(self):
        """Return the action the policy picks for the current hand.

        The policy is queried with this player's hand value and the
        dealer's first card.
        """
        return self.policy.action(self.hand_value, self.dealer.cards[0])
def semi_gradient_n_step_td(
        env,  # open-ai environment
        gamma: float,
        pi: "Policy",
        n: int,
        alpha: float,
        V: "ValueFunctionWithApproximation",
        num_episode: int,
):
    """n-step semi-gradient TD for estimating the state-value function.

    After each time step t, the state S_tau with tau = t - n + 1 is updated
    toward the n-step return G_{tau:tau+n}, bootstrapping with V(S_{tau+n})
    whenever the episode has not terminated inside the n-step window.

    input:
        env: target environment (reset() -> s0; step(a) -> (s', r, done, info))
        gamma: discounting factor
        pi: target evaluation policy (pi.action(s) -> a)
        n: n-step
        alpha: learning rate
        V: value function (V(s) -> estimate; V.update(alpha, G, s))
        num_episode: #episodes to iterate
    output:
        None (V is updated in place)
    """
    for ep in range(num_episode):
        s0 = env.reset()
        T = math.inf
        t = 0
        states = [s0]   # states[i] == S_i
        rewards = []    # rewards[i] == R_{i+1}
        # Updates are needed for tau = 0 .. T-1; the last one happens at
        # t = T + n - 2, so iterate while t <= T + n - 2.  The original
        # bound `t < T + n - 2` stopped one step early and never updated
        # the last non-terminal state S_{T-1}.
        while t < T + n - 1:
            if t < T:
                action = pi.action(states[-1])
                obs, reward, done, _ = env.step(action)
                states.append(obs)
                rewards.append(reward)
                if done:
                    T = t + 1
            tau = t - n + 1
            if tau >= 0:
                # G = sum_{i=tau+1}^{min(tau+n, T)} gamma^(i-tau-1) * R_i,
                # accumulated back-to-front (Horner's scheme).
                G = 0
                for i in range(min(tau + n, T), tau, -1):
                    G = gamma * G + rewards[i - 1]
                if tau + n < T:
                    # Episode still running at the window's end: bootstrap.
                    G += (gamma ** n) * V(states[tau + n])
                V.update(alpha, G, states[tau])
            t += 1
def semi_gradient_n_step_td(
        env,  # open-ai environment
        gamma: float,
        pi: "Policy",
        n: int,
        alpha: float,
        V: "ValueFunctionWithApproximation",
        num_episode: int,
):
    """n-step semi-gradient TD for estimating the state-value function.

    Uses a precomputed vector of discount powers so the n-step return is a
    single vectorized dot product over the reward window.

    input:
        env: target environment (reset() -> s0; step(a) -> (s', r, done, info))
        gamma: discounting factor
        pi: target evaluation policy (pi.action(s) -> a)
        n: n-step
        alpha: learning rate
        V: value function (V(s) -> estimate; V.update(alpha, G, s))
        num_episode: #episodes to iterate
    output:
        None (V is updated in place)
    """
    # gamma_vec[i] == gamma**i: weights for [R_{tau+1}, ..., R_{tau+n}, V(S_{tau+n})].
    gamma_vec = np.zeros(n + 1)
    gamma_vec[0] = 1
    for i in range(n):
        gamma_vec[i + 1] = gamma_vec[i] * gamma
    for eps in range(num_episode):
        observation = env.reset()
        # Keep the full episode so states[tau] is always S_tau.  The original
        # length-n sliding window never contained the initial observation, so
        # every update landed on S_{tau+1} instead of S_tau, and the
        # `tau > 0` guard additionally skipped updating S_0.
        states = [observation]   # states[i] == S_i
        rewards = [0.0]          # rewards[i] == R_i (dummy R_0 aligns indices)
        T = float('inf')
        t = 0
        while True:
            if t < T:
                action = pi.action(states[-1])
                observation, reward, done, info = env.step(action)
                states.append(observation)
                rewards.append(reward)
                if done:
                    T = t + 1
            tau = t - n + 1
            if tau >= 0:
                if tau + n < T:
                    # Full n-step window: bootstrap with V(S_{tau+n}).
                    Vs_prime = V(states[tau + n])
                    G = sum(gamma_vec * np.append(rewards[tau + 1:tau + n + 1], Vs_prime))
                else:
                    # Window runs past the terminal step: plain discounted sum
                    # of the remaining rewards, no bootstrap term.
                    rem_step = T - tau
                    G = sum(gamma_vec[0:rem_step] * np.array(rewards[tau + 1:T + 1]))
                V.update(alpha, G, states[tau])
            if tau == T - 1:
                break
            t = t + 1
def semi_gradient_n_step_td(
        env,  # open-ai environment
        gamma: float,
        pi: "Policy",
        n: int,
        alpha: float,
        V: "ValueFunctionWithApproximation",
        num_episode: int,
):
    """n-step semi-gradient TD for estimating the state-value function.

    input:
        env: target environment (reset() -> s0; step(a) -> (s', r, done, info))
        gamma: discounting factor
        pi: target evaluation policy (pi.action(s) -> a)
        n: n-step
        alpha: learning rate
        V: value function (V(s) -> estimate; V.update(alpha, G, s))
        num_episode: # episodes to iterate
    output:
        None (V is updated in place)
    """
    for i in range(num_episode):
        observation = env.reset()
        # T is unknown until the episode reports done.  The original
        # hard-coded T = 200, which silently truncated longer episodes and
        # treated step 200 as terminal (no bootstrap) even when it was not.
        T = float('inf')
        t = 0
        tau = 0
        # S[j] == S_j ; R[j] == R_j (R[0] is a dummy so indices line up;
        # the original appended the list [0] here, never read but mistyped).
        S = [observation]
        R = [0.0]
        while tau != T - 1:
            if t < T:
                # env.render()
                action = pi.action(observation)
                observation, reward, done, info = env.step(action)
                # we now know S_{t+1}, R_{t+1}, and whether the episode ended
                S.append(observation)
                R.append(reward)
                if done:
                    T = t + 1
            tau = t - n + 1
            if tau >= 0:
                # G = sum_{j=tau+1}^{min(tau+n, T)} gamma^(j-tau-1) * R_j
                G = 0
                for j in range(tau + 1, min(tau + n, T) + 1):
                    G = G + (gamma ** (j - tau - 1)) * R[j]
                if tau + n < T:
                    # Episode still running at the window's end: bootstrap.
                    G = G + V(S[tau + n]) * gamma ** n
                V.update(alpha, G, S[tau])
            t = t + 1