Example #1
def td_lambda_backward_view(iterations, N0, discount_factor, Lambda,
                            value_star):
    MSEs = []
    actions = ["Hit", "Stick"]
    # Q(s, a) estimates and visit counts, indexed by
    # (dealer - 1, player_sum, action).
    action_value = np.zeros((10, 22, 2))
    number_action_value = np.zeros((10, 22, 2))

    for it in range(iterations):
        """plays one episode"""
        eligibility_traces = np.array([[[0.0, 0.0] for i in range(0, 22)]
                                       for j in range(10)])
        game = Easy21()
        visits = []
        ##Action chosen epsilon-greedily
        first_state = game.state
        index_action = policy_epsilon_greedy(N0, first_state["dealer"],
                                             first_state["player_sum"],
                                             action_value, number_action_value)
        """plays game epsilon-greedily"""
        while game.isTerminal == False:

            last_state = game.state
            dealer, player_sum = last_state["dealer"], last_state["player_sum"]

            pick_action = actions[index_action]
            number_action_value[dealer - 1, player_sum, index_action] += 1
            alpha = 1 / number_action_value[dealer - 1, player_sum,
                                            index_action]

            _, reward = game.step(pick_action)
            eligibility_traces[dealer - 1, player_sum, index_action] += 1

            if not game.isTerminal:
                next_state = game.state
                next_dealer, next_player_sum = next_state[
                    "dealer"], next_state["player_sum"]
                next_index_action = policy_epsilon_greedy(
                    N0, next_dealer, next_player_sum, action_value,
                    number_action_value)
                target = reward + discount_factor * action_value[
                    next_dealer - 1, next_player_sum, next_index_action]
            else:
                target = reward

            # The TD error is measured against the action actually taken in
            # (dealer, player_sum); switch to the next action only afterwards.
            delta = target - action_value[dealer - 1, player_sum, index_action]
            # Update all visited states and actions in proportion to their
            # eligibility traces.
            action_value += eligibility_traces * delta * alpha
            eligibility_traces *= discount_factor * Lambda

            if not game.isTerminal:
                index_action = next_index_action
        """episode ended"""

        error_episode = np.linalg.norm(get_value(action_value) -
                                       value_star)**2 / (2 * 22 * 10)
        MSEs.append(error_episode)
    return MSEs, action_value
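
A minimal sketch of how td_lambda_backward_view might be driven, assuming the Easy21 environment and the policy_epsilon_greedy / get_value helpers used throughout these examples are importable, and that reference state values were saved beforehand (the .npy filename, episode count and lambda grid are purely illustrative):

# Hypothetical driver for td_lambda_backward_view.
import numpy as np
import matplotlib.pyplot as plt

value_star = np.load("easy21_mc_value.npy")  # illustrative path to saved reference values
for lam in [0.0, 0.5, 1.0]:
    MSEs, Q = td_lambda_backward_view(iterations=10000, N0=100,
                                      discount_factor=1.0, Lambda=lam,
                                      value_star=value_star)
    plt.plot(MSEs, label="lambda = {}".format(lam))
plt.xlabel("Episode")
plt.ylabel("Squared error vs. reference values")
plt.legend()
plt.show()
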
Example #2
def lfa_control(num_episodes, td_lambda, Q_MC):
    """
    :param num_episodes: positive integer, number of episodes to run
    :param td_lambda: lambda value between 0 and 1
    :param Q_MC: action-value function from Monte-Carlo control, used as reference
    :return: Q, V and the mean squared error per episode
    """
    env = Easy21()  # instantiate the env
    gamma = 1
    epsilon = 0.05
    alpha = 0.01

    # Initialization
    Q = np.zeros((2, 11, 22))
    weight = np.zeros((36,))
    mean_square_error = []

    # looping through number of episodes
    for i_episode in range(1, num_episodes + 1):

        # Eligibility trace over the 36 binary features.
        eligibility_trace = np.zeros((36,))
        state = env.reset()  # start from an initial state
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        while True:
            if np.random.rand() < epsilon:
                action = np.random.randint(0, 2)
            else:
                action = np.argmax([np.sum(phi(state, 0)*weight),
                                    np.sum(phi(state, 1)*weight)])

            next_state, reward, done = env.step(action)

            if np.random.rand() < epsilon:
                next_action = np.random.randint(0, 2)
            else:
                next_action = np.argmax([np.sum(phi(next_state, 0)*weight),
                                        np.sum(phi(next_state, 1)*weight)])
            cuboid = phi(state, action)
            cuboid_next = phi(next_state, next_action)

            Q_phi = np.sum(cuboid * weight)
            # The terminal state has value zero, so do not bootstrap from it.
            Q_phi_next = 0.0 if done else np.sum(cuboid_next * weight)

            td_error = reward + gamma * Q_phi_next - Q_phi
            # Accumulating eligibility trace over the binary features.
            eligibility_trace = eligibility_trace * td_lambda * gamma + cuboid
            delta_weight = alpha * td_error * eligibility_trace

            weight += delta_weight

            state = next_state

            if done:
                break

        Q = calculate_Q(weight)
        mean_square_error.append(np.mean((Q_MC - Q[:, 1:11, 1:22]) ** 2))

    return Q[:, 1:11, 1:22], np.max(Q[:, 1:11, 1:22], axis=0), mean_square_error
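
A possible way to call lfa_control, assuming the phi and calculate_Q helpers it relies on are defined, and taking the reference Q_MC from the mc_control example further down; the episode counts below are illustrative assumptions:

# Hypothetical usage of lfa_control.
Q_mc, _ = mc_control(N0=100, num_episodes=500000)
Q_lfa, V_lfa, errors = lfa_control(num_episodes=20000, td_lambda=0.5,
                                   Q_MC=Q_mc)
print("final MSE vs. Monte-Carlo:", errors[-1])
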
def sarsa_control(N0, num_episodes, td_lambda, Q_MC):
    """
    :param N0: integer constant controlling exploration (epsilon = N0 / (N0 + N(s)))
    :param num_episodes: positive integer, number of episodes to run
    :param td_lambda: lambda value between 0 and 1
    :param Q_MC: action-value function from Monte-Carlo control, used as reference
    :return: Q, V and the mean squared error per episode
    """
    env = Easy21()  # instantiate the env
    NS = np.zeros((11, 22))
    NSA = np.zeros((2, 11, 22))
    Q = np.zeros((2, 11, 22))
    gamma = 1

    # Initialize an empty list for the mean squared error per episode
    mean_square_error = []

    # Episodes
    for i_episode in range(0, num_episodes):
        # Eligibility traces for this episode.
        episode_state_action = np.zeros((2, 11, 22))
        state = env.reset()  # initial state
        action = np.random.randint(0, 2)  # initial action chosen at random
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        while True:
            next_state, reward, done = env.step(action)

            # Update epsilon
            NS[state[0], state[1]] += 1
            epsilon = N0 / (N0 + NS[state[0], state[1]])

            greedy_action = epsilon_greedy(Q, next_state, epsilon)

            # TD error; the terminal state has value zero, so it is not
            # bootstrapped from.
            next_q = 0.0 if done else Q[greedy_action, next_state[0],
                                        next_state[1]]
            td_error = reward + gamma * next_q - Q[action, state[0], state[1]]
            episode_state_action[action, state[0], state[1]] += 1

            # Update alpha
            NSA[action, state[0], state[1]] += 1
            alpha = 1 / NSA[action, state[0], state[1]]

            # Backward-view update of all visited (state, action) pairs.
            Q += alpha * td_error * episode_state_action
            episode_state_action *= td_lambda * gamma

            state, action = next_state, greedy_action
            if done:
                break

        mean_square_error.append(np.mean((Q_MC - Q[:, 1:11, 1:22])**2))

    return Q[:, 1:11, 1:22], np.max(Q[:, 1:11, 1:22],
                                    axis=0), mean_square_error
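
sarsa_control can be swept over lambda in the same way; the sketch below reuses the Q_mc from the previous sketch and records the final error for each lambda (the episode count is again an illustrative assumption):

# Hypothetical lambda sweep for sarsa_control.
import numpy as np

final_errors = []
for lam in np.arange(0.0, 1.1, 0.1):
    _, _, errors = sarsa_control(N0=100, num_episodes=10000,
                                 td_lambda=lam, Q_MC=Q_mc)
    final_errors.append(errors[-1])
print(final_errors)
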
    def __init__(self, N0=100):
        self._env = Easy21()
        self._actions = self._env.actionSpace()

        # linear approx. function and parameters
        self._w = np.zeros([36, 1])
        self._Q = lambda d, p, a: np.dot(self._feature(d, p, a).T, self._w)

        self._eps = 0.05
        self._alpha = 0.01
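
Both lfa_control and the class above depend on a 36-dimensional binary feature vector (phi / self._feature) that is not shown here. A common choice for Easy21, and presumably what those helpers implement (an assumption), is coarse coding over 3 overlapping dealer intervals, 6 overlapping player intervals and 2 actions:

import numpy as np

# Sketch of the usual Easy21 coarse coding; the exact intervals used by the
# examples above may differ.
DEALER_INTERVALS = [(1, 4), (4, 7), (7, 10)]
PLAYER_INTERVALS = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]

def phi(state, action):
    """Binary feature vector of shape (36,) for a (dealer, player) state."""
    dealer, player = state
    features = np.zeros((3, 6, 2))
    for i, (d_lo, d_hi) in enumerate(DEALER_INTERVALS):
        for j, (p_lo, p_hi) in enumerate(PLAYER_INTERVALS):
            if d_lo <= dealer <= d_hi and p_lo <= player <= p_hi:
                features[i, j, action] = 1.0
    return features.flatten()
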
Example #5
def montecarlo(iterations, it_conf, N0, discount_factor, true_value):
    """Computes Monte-Carlo algorithm"""
    actions = ["Hit", "Stick"]
    action_value = np.array([[[0.0, 0.0] for i in range(0, 22)]
                             for j in range(10)])
    number_action_value = np.array([[[0, 0] for i in range(0, 22)]
                                    for j in range(10)])
    deltas = []
    variance = []  # per-episode sum of absolute update magnitudes

    for it in range(iterations):
        if it % it_conf == 0:
            print("Iteration {}/{}".format(it, iterations))
        var = 0

        # Play one episode.
        game = Easy21()
        Gt = 0
        k = 0
        visits = []
        while not game.isTerminal:
            last_state = game.state
            dealer, player_sum = last_state["dealer"], last_state["player_sum"]

            ##Pick action epsilon-greedily
            index_action = policy_epsilon_greedy(N0, dealer, player_sum,
                                                 action_value,
                                                 number_action_value)
            pick_action = actions[index_action]

            state, reward = game.step(pick_action)
            visits.append([last_state, index_action])
            number_action_value[dealer - 1, player_sum, index_action] += 1
            Gt += reward * discount_factor**k
            k += 1
        # Episode ended: update every visited (state, action) pair towards Gt.
        for step in visits:
            state = step[0]
            action = step[1]
            dealer, player_sum = state["dealer"], state["player_sum"]
            delta_action_value = (
                Gt - action_value[dealer - 1, player_sum, action]
            ) / number_action_value[dealer - 1, player_sum, action]
            action_value[dealer - 1, player_sum, action] += delta_action_value
            var += abs(delta_action_value)
        variance.append(var)
        deltas.append(
            np.linalg.norm(true_value - get_value(action_value))**2 /
            (10 * 21))

    return action_value, deltas, variance
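
A possible driver for montecarlo, assuming the same Easy21 helpers; the reference values passed as true_value would normally come from an earlier long run, and the .npy paths and episode counts are purely illustrative:

# Hypothetical run of the Monte-Carlo example.
import numpy as np

true_value = np.load("easy21_true_value.npy")  # illustrative path
Q_mc, deltas, variance = montecarlo(iterations=int(1e6), it_conf=int(1e5),
                                    N0=100, discount_factor=1.0,
                                    true_value=true_value)
np.save("easy21_mc_value.npy", get_value(Q_mc))  # illustrative path
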
Example #6
    def __init__(self, N0=100):
        self._env = Easy21()
        self._actions = self._env.actionSpace()
        self._N0 = N0
        self._Q = np.zeros((11, 22, 2))  # action-value function, tabular
        self._Nsa = np.zeros(
            (11, 22, 2))  # number of times s,a has been selected

        # number of times s has been visited
        self._Ns = lambda d, p: sum(self._Nsa[d, p])

        # ε of each state
        self._eps = lambda d, p: self._N0 / (self._N0 + self._Ns(d, p))

        # alpha of each s,a pair
        self._alpha = lambda d, p, a: 1 / self._Nsa[d, p, a]
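
Given the counters and schedules defined above, the epsilon-greedy policy of this tabular agent could look roughly like the method below (the method name and the ordering of self._actions are assumptions, and numpy is assumed to be imported as np, as elsewhere in these examples):

    def _epsilon_greedy_action(self, dealer, player):
        # Explore with probability eps(s), otherwise act greedily w.r.t. Q.
        if np.random.rand() < self._eps(dealer, player):
            return np.random.choice(self._actions)
        return self._actions[int(np.argmax(self._Q[dealer, player]))]
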
def mc_control(N0=100, num_episodes=1000):
    """

    :param N0:integer constant
    :param num_episodes:Positive integer
    :return: Q & V
    """
    # Get the environment
    env = Easy21()  # instantiate the env

    # Zero-initialized visit counts for states and (state, action) pairs
    NS = np.zeros((11, 22))
    NSA = np.zeros((2, 11, 22))

    # Initializing state action function
    Q = np.zeros((2, 11, 22))

    # looping over episodes
    for i_episode in range(1, num_episodes + 1):
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        episode = []
        state = env.reset()
        episode.append(state)

        Gt = 0

        while True:
            # Update epsilon values
            NS[state[0], state[1]] += 1
            epsilon = N0 / (N0 + NS[state[0], state[1]])

            action = epsilon_greedy(Q, state, epsilon)
            episode.append(action)

            # Count the visit to this (state, action) pair; the step size
            # alpha = 1 / N(s, a) is applied per pair in the update loop below.
            NSA[action, state[0], state[1]] += 1

            state, reward, done = env.step(action)

            # Sum the reward
            Gt += reward

            if done:
                break
            else:
                # Append the next state to the episode
                episode.append(state)

        # Update every visited (state, action) pair towards the return Gt,
        # using the per-pair step size alpha = 1 / N(s, a).
        for index, event in enumerate(episode):
            if index % 2 == 0:
                state = event
            else:
                action = event
                alpha = 1 / NSA[action, state[0], state[1]]
                Q[action, state[0],
                  state[1]] += alpha * (Gt - Q[action, state[0], state[1]])

    return Q[:, 1:11, 1:22], np.max(Q[:, 1:11, 1:22], axis=0)
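
mc_control could be exercised as below; the episode count and the simple heat map of the resulting V are illustrative choices:

# Hypothetical usage of mc_control.
import numpy as np
import matplotlib.pyplot as plt

Q, V = mc_control(N0=100, num_episodes=500000)  # V has shape (10, 21)
print("best value with the dealer showing an ace:", np.max(V[0]))

# Heat map of the optimal value function (both axes are 0-based here).
plt.imshow(V, origin="lower", aspect="auto")
plt.xlabel("player sum - 1")
plt.ylabel("dealer card - 1")
plt.colorbar()
plt.show()
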