class BotPlayer(Player):
    """Player controlled by the program.

    The action to take for a hand is decided according to a policy.
    At first, the policy is based on an MDP.
    """

    def __init__(self, policy, dealer):
        # Python-3 zero-argument super() (original used the legacy
        # two-argument form, which is equivalent but dated).
        super().__init__()
        self.policy = Policy(policy)  # wrap the raw policy passed in
        self.dealer = dealer          # kept so the dealer's cards can be read

    def choose_action(self):
        """Return the action the policy picks for the current hand.

        The policy is queried with this player's hand value and the
        dealer's first card.
        """
        return self.policy.action(self.hand_value, self.dealer.cards[0])
def semi_gradient_n_step_td(
        env,  # open-ai environment
        gamma: float,
        pi: "Policy",
        n: int,
        alpha: float,
        V: "ValueFunctionWithApproximation",
        num_episode: int,
):
    """n-step semi-gradient TD for estimating the state-value function.

    After each time step t, the state S_tau with tau = t - n + 1 is updated
    toward the n-step return G_{tau:tau+n}, bootstrapping with V(S_{tau+n})
    whenever the episode has not terminated inside the n-step window.

    input:
        env: target environment (reset() -> s0; step(a) -> (s', r, done, info))
        gamma: discounting factor
        pi: target evaluation policy (pi.action(s) -> a)
        n: n-step
        alpha: learning rate
        V: value function (V(s) -> estimate; V.update(alpha, G, s))
        num_episode: #episodes to iterate
    output:
        None (V is updated in place)
    """
    for ep in range(num_episode):
        s0 = env.reset()
        T = math.inf
        t = 0
        states = [s0]   # states[i] == S_i
        rewards = []    # rewards[i] == R_{i+1}
        # Updates are needed for tau = 0 .. T-1; the last one happens at
        # t = T + n - 2, so iterate while t <= T + n - 2.  The original
        # bound `t < T + n - 2` stopped one step early and never updated
        # the last non-terminal state S_{T-1}.
        while t < T + n - 1:
            if t < T:
                action = pi.action(states[-1])
                obs, reward, done, _ = env.step(action)
                states.append(obs)
                rewards.append(reward)
                if done:
                    T = t + 1
            tau = t - n + 1
            if tau >= 0:
                # G = sum_{i=tau+1}^{min(tau+n, T)} gamma^(i-tau-1) * R_i,
                # accumulated back-to-front (Horner's scheme).
                G = 0
                for i in range(min(tau + n, T), tau, -1):
                    G = gamma * G + rewards[i - 1]
                if tau + n < T:
                    # Episode still running at the window's end: bootstrap.
                    G += (gamma ** n) * V(states[tau + n])
                V.update(alpha, G, states[tau])
            t += 1
def semi_gradient_n_step_td(
        env,  # open-ai environment
        gamma: float,
        pi: "Policy",
        n: int,
        alpha: float,
        V: "ValueFunctionWithApproximation",
        num_episode: int,
):
    """n-step semi-gradient TD for estimating the state-value function.

    Uses a precomputed vector of discount powers so the n-step return is a
    single vectorized dot product over the reward window.

    input:
        env: target environment (reset() -> s0; step(a) -> (s', r, done, info))
        gamma: discounting factor
        pi: target evaluation policy (pi.action(s) -> a)
        n: n-step
        alpha: learning rate
        V: value function (V(s) -> estimate; V.update(alpha, G, s))
        num_episode: #episodes to iterate
    output:
        None (V is updated in place)
    """
    # gamma_vec[i] == gamma**i: weights for [R_{tau+1}, ..., R_{tau+n}, V(S_{tau+n})].
    gamma_vec = np.zeros(n + 1)
    gamma_vec[0] = 1
    for i in range(n):
        gamma_vec[i + 1] = gamma_vec[i] * gamma
    for eps in range(num_episode):
        observation = env.reset()
        # Keep the full episode so states[tau] is always S_tau.  The original
        # length-n sliding window never contained the initial observation, so
        # every update landed on S_{tau+1} instead of S_tau, and the
        # `tau > 0` guard additionally skipped updating S_0.
        states = [observation]   # states[i] == S_i
        rewards = [0.0]          # rewards[i] == R_i (dummy R_0 aligns indices)
        T = float('inf')
        t = 0
        while True:
            if t < T:
                action = pi.action(states[-1])
                observation, reward, done, info = env.step(action)
                states.append(observation)
                rewards.append(reward)
                if done:
                    T = t + 1
            tau = t - n + 1
            if tau >= 0:
                if tau + n < T:
                    # Full n-step window: bootstrap with V(S_{tau+n}).
                    Vs_prime = V(states[tau + n])
                    G = sum(gamma_vec * np.append(rewards[tau + 1:tau + n + 1], Vs_prime))
                else:
                    # Window runs past the terminal step: plain discounted sum
                    # of the remaining rewards, no bootstrap term.
                    rem_step = T - tau
                    G = sum(gamma_vec[0:rem_step] * np.array(rewards[tau + 1:T + 1]))
                V.update(alpha, G, states[tau])
            if tau == T - 1:
                break
            t = t + 1
def semi_gradient_n_step_td(
        env,  # open-ai environment
        gamma: float,
        pi: "Policy",
        n: int,
        alpha: float,
        V: "ValueFunctionWithApproximation",
        num_episode: int,
):
    """n-step semi-gradient TD for estimating the state-value function.

    input:
        env: target environment (reset() -> s0; step(a) -> (s', r, done, info))
        gamma: discounting factor
        pi: target evaluation policy (pi.action(s) -> a)
        n: n-step
        alpha: learning rate
        V: value function (V(s) -> estimate; V.update(alpha, G, s))
        num_episode: # episodes to iterate
    output:
        None (V is updated in place)
    """
    for i in range(num_episode):
        observation = env.reset()
        # T is unknown until the episode reports done.  The original
        # hard-coded T = 200, which silently truncated longer episodes and
        # treated step 200 as terminal (no bootstrap) even when it was not.
        T = float('inf')
        t = 0
        tau = 0
        # S[j] == S_j ; R[j] == R_j (R[0] is a dummy so indices line up;
        # the original appended the list [0] here, never read but mistyped).
        S = [observation]
        R = [0.0]
        while tau != T - 1:
            if t < T:
                # env.render()
                action = pi.action(observation)
                observation, reward, done, info = env.step(action)
                # we now know S_{t+1}, R_{t+1}, and whether the episode ended
                S.append(observation)
                R.append(reward)
                if done:
                    T = t + 1
            tau = t - n + 1
            if tau >= 0:
                # G = sum_{j=tau+1}^{min(tau+n, T)} gamma^(j-tau-1) * R_j
                G = 0
                for j in range(tau + 1, min(tau + n, T) + 1):
                    G = G + (gamma ** (j - tau - 1)) * R[j]
                if tau + n < T:
                    # Episode still running at the window's end: bootstrap.
                    G = G + V(S[tau + n]) * gamma ** n
                V.update(alpha, G, S[tau])
            t = t + 1