예제 #1
0
def main():
    """Train the agent for a number of steps, then run it in evaluation mode.

    Reads the run configuration from ``get_args()`` and the agent/environment
    pair from ``build_objects()``.  During training ``agent.exploring`` is
    True and every transition is passed to ``agent.observe_reward``; during
    evaluation ``agent.exploring`` is False and the rewards are not observed.
    """
    parameter = get_args()
    agent, environment = build_objects()

    # --- Training phase: the agent explores and learns from each transition
    agent.exploring = True
    for completed in range(1, parameter.training_steps + 1):
        state = environment.get_state()
        chosen_action = agent.select_action(state)
        # The environment reports back the action together with the next
        # state and the reward; that reported action is the one learned from.
        next_state, applied_action, reward = environment.step(chosen_action)
        agent.observe_reward(state, applied_action, next_state, reward)
        print("***Training step " + str(completed) + " Completed")

    # --- Evaluation phase: same loop, but nothing is fed back to the agent
    agent.exploring = False
    for completed in range(1, parameter.evaluation_steps + 1):
        state = environment.get_state()
        environment.step(agent.select_action(state))
        print("***Evaluation step " + str(completed) + " Completed")
예제 #2
0
def monte_carlo(iterations=1000000,
                policy=policies.epsilon_greedy,
                n_zero=100):
    """Run Monte Carlo control on the Easy21 game.

    Each iteration plays one episode from a fresh ``environment.State()``.
    Once the episode ends, every visited (player, dealer, action) key is
    moved towards the final reward with step size 1 / visit-count.

    :param iterations: number of episodes to play
    :param policy: callable ``policy(epsilon, value_function, state)`` that
        returns the action to take
    :param n_zero: constant in ``epsilon = n_zero / (n_zero + N(state))``
    :return: the action-value table, a defaultdict keyed by
        (player, dealer, action); the table is also handed to
        ``plotting.plot_value_function`` before returning
    """
    q_values = defaultdict(float)      # keyed by (player, dealer, action)
    state_visits = defaultdict(int)    # keyed by (player, dealer)
    pair_visits = defaultdict(int)     # keyed by (player, dealer, action)
    wins = 0

    print('Iterations completed:')
    for episode in xrange(iterations):

        # progress report
        if not episode % 500000:
            print(episode)

        state = environment.State()
        trajectory = []
        while not state.terminal:
            situation = (state.player_sum, state.dealer_first_card)

            # exploration rate shrinks as the state is visited more often
            epsilon = n_zero / float(n_zero + state_visits[situation])
            action = policy(epsilon, q_values, state)
            trajectory.append(situation + (action,))

            state, reward = environment.step(state, action)

        # episode finished: back up the final reward to every visited key
        if reward is not None:
            for key in trajectory:
                state_visits[key[:-1]] += 1
                pair_visits[key] += 1

                alpha = 1.0 / pair_visits[key]
                q_values[key] += alpha * (reward - q_values[key])

        if reward == 1:
            wins += 1

    print('Wins: %.4f%%' % ((float(wins) / iterations) * 100))
    plotting.plot_value_function(q_values)
    return q_values
def monte_carlo(iterations=1000000, policy=policies.epsilon_greedy, n_zero=100):
    """Run Monte Carlo control on the Easy21 game.

    Each iteration plays one episode from a fresh ``environment.State()``.
    Once the episode ends, every visited (player, dealer, action) key is
    moved towards the final reward with step size 1 / visit-count.

    :param iterations: number of episodes to play
    :param policy: callable ``policy(epsilon, value_function, state)`` that
        returns the action to take
    :param n_zero: constant in ``epsilon = n_zero / (n_zero + N(state))``
    :return: the action-value table, a defaultdict keyed by
        (player, dealer, action); the table is also handed to
        ``plotting.plot_value_function`` before returning
    """
    q_values = defaultdict(float)      # keyed by (player, dealer, action)
    state_visits = defaultdict(int)    # keyed by (player, dealer)
    pair_visits = defaultdict(int)     # keyed by (player, dealer, action)
    wins = 0

    print('Iterations completed:')
    for episode in xrange(iterations):

        # progress report
        if not episode % 500000:
            print(episode)

        state = environment.State()
        trajectory = []
        while not state.terminal:
            situation = (state.player_sum, state.dealer_first_card)

            # exploration rate shrinks as the state is visited more often
            epsilon = n_zero / float(n_zero + state_visits[situation])
            action = policy(epsilon, q_values, state)
            trajectory.append(situation + (action,))

            state, reward = environment.step(state, action)

        # episode finished: back up the final reward to every visited key
        if reward is not None:
            for key in trajectory:
                state_visits[key[:-1]] += 1
                pair_visits[key] += 1

                alpha = 1.0 / pair_visits[key]
                q_values[key] += alpha * (reward - q_values[key])

        if reward == 1:
            wins += 1

    print('Wins: %.4f%%' % ((float(wins) / iterations) * 100))
    plotting.plot_value_function(q_values)
    return q_values
예제 #4
0
파일: main.py 프로젝트: corback/soa
def step():  # what happens in each cycle. Main calls happen here.
    """Run one simulation cycle and print its statistics.

    Increments the module-level ``count_steps`` counter, advances the
    environment, wakes the nodes it returns, prints the kill count, takes a
    stats snapshot, prints the total graph weight and finally starts a new
    stats collector.
    """
    global count_steps
    count_steps += 1
    # Single-argument print(...) produces the same output on Python 2 and 3,
    # whereas the bare print statement is a syntax error on Python 3.
    print("count_steps "+str(count_steps))
    nodes = environment.step()
    wakeup_nodes(nodes)  # calls their behaviour, which calls the policy (what to do), and finally applies actions
    print("killed edges: "+str(stats.collector.num_kills))
    stats.snapshot()
    print("total graph weight:"+str(stats.collector.total_weight))
    stats.new_collector()
예제 #5
0
    def watch_play(self):
        """Play one game to completion, redrawing the display after each move.

        Every turn asks ``env.process_state`` for an action, shows it with
        ``self.drop_piece``, applies it through ``env.step`` and then pauses
        for ``self.speed`` seconds.
        """
        board = env.reset()
        while True:
            # action chosen by the environment helper for the current board
            move = env.process_state(board)
            self.drop_piece(move, board)
            board, finished = env.step(board, *move)

            # refresh the display with the new board
            self.board = board.area
            self.update()
            self.root.update()
            time.sleep(self.speed)

            if finished:
                break
예제 #6
0
def mc_control(num_episodes=10000):
    """Run GLIE Monte Carlo control and return the learned action values.

    For every episode the visited states and state-action pairs are
    recorded.  After the episode each distinct pair is moved towards the
    episode's last reward, and an epsilon-greedy action distribution is
    stored for each distinct visited state.

    :param num_episodes: number of episodes to play
    :return: dict mapping ``state_tuple + (action,)`` to its value estimate
    """
    q_sa = {}
    policy = {}    # state tuple -> list of action probabilities
    n_s = {}       # state visit counts
    n_sa = {}      # state-action visit counts
    n0 = 100

    for _ in range(num_episodes):
        state = State()
        reward = 0
        visited_states = []
        visited_pairs = []

        while not state.terminal:
            s = state.as_tuple()
            # follow the stored distribution when there is one, else act randomly
            a = sample_action(policy[s]) if s in policy else Action.random()
            sa = s + (a, )

            visited_states.append(s)
            visited_pairs.append(sa)
            state, reward = step(state, a)

            n_s[s] = n_s.get(s, 0) + 1
            n_sa[sa] = n_sa.get(sa, 0) + 1

        # Policy evaluation: incremental mean towards the final reward
        for sa in set(visited_pairs):
            old_q = q_sa.get(sa, 0)
            q_sa[sa] = old_q + ((reward - old_q) / n_sa[sa])

        # Policy improvement: epsilon-greedy with epsilon = n0 / (n0 + N(s))
        for s in set(visited_states):
            a_best = greedy_action(q_sa, s)
            epsilon = n0 / (n0 + n_s.get(s, 0))
            explore_share = epsilon / len(Action)
            policy[s] = [
                1 - epsilon + explore_share if a is a_best else explore_share
                for a in list(Action)
            ]
    return q_sa
예제 #7
0
파일: main.py 프로젝트: stablum/soa
def iteration():  # what happens in each cycle. Main calls happen here.
    """Run one simulation cycle and print its statistics.

    Advances the environment, wakes the nodes it returns, takes a stats
    snapshot, prints the collector's figures, increments the module-level
    ``count_iterations`` counter and finally starts a new stats collector.
    """
    # Single-argument print(...) produces the same output on Python 2 and 3,
    # whereas the bare print statement is a syntax error on Python 3.
    print("iteration!")
    global count_iterations
    print("count_iterations "+str(count_iterations))
    nodes = environment.step()
    wakeup_nodes(nodes)  # calls their behaviour, which calls the policy (what to do), and finally applies actions
    print("killed edges: "+str(stats.collector.num_kills))
    stats.snapshot()
    print("total graph weight:"+str(stats.collector.total_weight))
    print("average path length, see note1:"+ str(stats.collector.path_length))
    print("mean edge importance:"+str(stats.collector.mean_edges_importance))
    print("std edge importance:"+str(stats.collector.std_edges_importance))
    count_iterations +=1
    stats.new_collector()
예제 #8
0
def get_sample(env, q, epsilon):
    """Roll out one epsilon-greedy episode and return its history.

    The episode starts at ``env["start_state"]`` and stops when the
    environment reports it is over or after ``n_row * n_col * 3`` steps.

    :param env: mapping with at least "start_state", "n_row" and "n_col"
    :param q: action values, indexed by state
    :param epsilon: exploration rate passed to ``epsilon_greedy_select``
    :return: (states, actions, rewards); states has one more entry than the
        other two because it includes the start state
    """
    current = env["start_state"]
    states, actions, rewards = [current], [], []
    finished = False
    taken = 0
    step_limit = env["n_row"] * env["n_col"] * 3

    while not finished and taken < step_limit:
        taken += 1
        action = epsilon_greedy_select(q[current], epsilon)
        current, reward, finished = environment.step(env, current, action)
        actions.append(action)
        rewards.append(reward)
        states.append(current)
        if taken == step_limit:
            # Penalise running out of steps.
            # NOTE(review): this also replaces the reward when the episode
            # ends exactly on the last allowed step -- confirm intent.
            rewards[-1] = -1000

    return states, actions, rewards
예제 #9
0
파일: part3.py 프로젝트: dteoh/easy21
def sarsa_lambda(num_episodes=1000, lamba=0, gamma=1, yield_progress=False):
    """Generator that runs Sarsa(lambda) and yields the action-value table.

    :param num_episodes: number of episodes to run
    :param lamba: trace-decay parameter (lambda)
    :param gamma: discount factor
    :param yield_progress: when true, yield ``(episode_number, q_sa)`` after
        every episode; otherwise yield ``(num_episodes, q_sa)`` once at the
        end.  The same dict object is yielded each time.
    """
    q_sa = {}
    n_s = {}
    n_sa = {}

    for episode in range(num_episodes):
        traces = {}    # eligibility traces, reset for every episode
        state = State()
        s = state.as_tuple()
        a = epsilon_greedy_action(q_sa, s, calculate_epsilon(n_s, s))

        while not state.terminal:
            state, reward = step(state, a)
            n_s[s] = n_s.get(s, 0) + 1

            s_next = state.as_tuple()
            a_next = epsilon_greedy_action(q_sa, s_next,
                                           calculate_epsilon(n_s, s_next))

            current = s + (a, )
            following = s_next + (a_next, )
            q_current = q_sa.get(current, 0)
            q_following = q_sa.get(following, 0)

            visits = n_sa.get(current, 0) + 1
            n_sa[current] = visits

            td_error = reward + gamma * q_following - q_current
            traces[current] = traces.get(current, 0) + 1

            # Every pair is updated in proportion to its trace, using the
            # step size 1 / N(current pair); the trace is then decayed.
            for pair_state, pair_action in generate_all_state_action_pairs():
                key = pair_state + (pair_action, )
                trace = traces.get(key, 0)
                q_sa[key] = q_sa.get(key, 0) + (td_error * trace) / visits
                traces[key] = gamma * lamba * trace

            s, a = s_next, a_next

        if yield_progress:
            yield episode + 1, q_sa

    if not yield_progress:
        yield num_episodes, q_sa
예제 #10
0
def get_sample_and_learn_online(env, q, bootstrap, discount, step_size,
                                epsilon):
    """Run one episode, applying a TD update to ``q`` after every step.

    :param env: mapping with at least "start_state"; passed to
        ``environment.step``
    :param q: action values, indexed as ``q[s]`` and ``q[s, a]``; updated
        in place
    :param bootstrap: one of BOOTSTRAP_SARSA, BOOTSTRAP_EXPECTED or
        BOOTSTRAP_Q, selecting how the next-state value is estimated
    :param discount: discount factor applied to the next-state value
    :param step_size: learning rate
    :param epsilon: exploration rate for ``epsilon_greedy_select``
    :return: ``((states, actions, rewards), error)`` where ``error`` is the
        sum of ``abs(step_size * delta)`` over the episode
    :raises ValueError: if ``bootstrap`` is not a known mode when a
        non-terminal transition has to be bootstrapped
    """
    s = env["start_state"]
    h_s = [s]
    h_a = []
    h_r = []
    over = False
    error = 0

    while not over:
        a = epsilon_greedy_select(q[s], epsilon)
        s_next, r, over = environment.step(env, s, a)

        if over:
            # no bootstrapping from the state that ends the episode
            q_next = 0
        elif bootstrap == BOOTSTRAP_SARSA:
            a_next = epsilon_greedy_select(q[s_next], epsilon)
            q_next = q[s_next, a_next]
        elif bootstrap == BOOTSTRAP_EXPECTED:
            q_next = expected_q(q[s_next], epsilon)
        elif bootstrap == BOOTSTRAP_Q:
            q_next = max(q[s_next])
        else:
            # Previously an unknown mode left q_next unbound on the first
            # step, or silently reused the previous step's value.
            raise ValueError("unknown bootstrap mode: %r" % (bootstrap,))

        delta = r + discount * q_next - q[s, a]
        q[s, a] = q[s, a] + step_size * delta

        s = s_next

        h_a.append(a)
        h_r.append(r)
        h_s.append(s)

        error += abs(step_size * delta)

    return (h_s, h_a, h_r), error
예제 #11
0
def mc_episode():
    """Play one episode and apply a Monte Carlo update to ``state_info``.

    Actions are chosen epsilon-greedily with
    ``epsilon = n0 / (n0 + N(state))``.  Visit counts are incremented while
    playing; afterwards the action value of every visited, non-busted state
    is moved towards the final reward with step size 1 / N(state, action).

    ``state_info``, ``n0`` and the ``*_index`` names are module-level
    globals defined outside this block.
    """
    hit = environment.Action.HIT
    stick = environment.Action.STICK

    visited = []    # states seen during the episode
    taken = []      # action chosen in each of those states

    # draw the two starting cards: dealer first, then player
    state = environment.State(random.randint(1, 10), random.randint(1, 10))

    # play until the episode terminates
    while not state.terminated:
        visited.append(state)

        # zero-based coordinates into the state table
        dealer_ix = state.dealer_card - 1
        player_ix = state.player_sum - 1

        epsilon = n0 / (n0 + state_info[dealer_ix, player_ix, ns_index])
        if random.random() < epsilon:
            # explore: fair coin between the two actions
            action = hit if random.random() < 0.5 else stick
        elif (state_info[dealer_ix, player_ix, q_hit_index] >
              state_info[dealer_ix, player_ix, q_stick_index]):
            # exploit: hit only when it is strictly better
            action = hit
        else:
            action = stick

        # visit counts for the state and for the chosen action
        state_info[dealer_ix, player_ix, ns_index] += 1
        action_count_ix = ns_hit_index if action == hit else ns_stick_index
        state_info[dealer_ix, player_ix, action_count_ix] += 1

        taken.append(action)
        state = environment.step(state, action)

    # back the final reward up to every visited state-action pair
    for past_state, past_action in zip(visited, taken):
        tot_reward = state.reward

        if past_state.is_busted:
            continue

        dealer_ix = past_state.dealer_card - 1
        player_ix = past_state.player_sum - 1

        if past_action == hit:
            count_ix, q_ix = ns_hit_index, q_hit_index
        else:
            count_ix, q_ix = ns_stick_index, q_stick_index

        alpha = 1 / state_info[dealer_ix, player_ix, count_ix]
        value = state_info[dealer_ix, player_ix, q_ix]
        state_info[dealer_ix, player_ix, q_ix] += alpha * (tot_reward - value)
예제 #12
0
# Training loop fragment: alternates between simulating the network and
# stepping the environment, logging weights and trajectory data as it goes.
# (The end of the loop body is not visible in this excerpt.)

# Initialize environment, get initial state and reward
state, reward = env.reset()

# Simulate for training_length steps
for i in range(params.training_length):

    # Run network for 50 ms: Get left and right output spikes, get weights
    n_l, n_r, weights = snn.simulate(state, reward)
    w_l = weights[0]
    w_r = weights[1]

    # Perform a step
    # Get state, distance, pos_data, reward, terminate, steps,
    # travelled_distances, vrep_steps
    (state, distance, pos_data, reward, t, step, travelled_distances,
     vrep_steps) = env.step(n_l, n_r)

    # Record the weights on every 10th loop iteration (i % 10 == 0).
    # NOTE(review): the original comment said "every 100 simulation steps";
    # confirm how loop iterations map to simulation steps.
    if i % 10 == 0:
        weights_l.append(w_l)
        weights_r.append(w_r)
        weights_i.append(i)

    # Store distance, position, reward, step
    distances.append(distance)
    positions.append(pos_data)
    rewards.append(reward)
    steps.append(step)

    # Save # steps after the training resets
    # (t is the terminate flag returned by env.step above)
    if t:
예제 #13
0
# Replays episodes by acting greedily on a previously saved Q-table.
# (The end of the inner loop body is not visible in this excerpt.)

# .item() unwraps the stored object; it is used below as a mapping from
# state to an array of per-action values.
# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only load
# files that come from a trusted source.
q_table = np.load("./qtables/1/800-qtable.npy", allow_pickle=True).item()
for episode in range(EPISODES):
    state = env.reset()

    # States missing from the table get random initial values in [-2, 0)
    if state not in q_table:
        q_table[state] = np.random.uniform(
            low=-2, high=0, size=env.action_space_n)

    episode_reward = 0
    done = False

    while not done:
        # NOTE(review): the meaning of the argument 0 is not visible here
        valid_actions = env.get_valid_actions(0)

        # Greedy choice: the valid action with the highest stored value
        action = max(valid_actions, key=lambda a: q_table[state][a])
        new_state, reward, done = env.step(action)
        episode_reward += reward

        if new_state not in q_table:
            q_table[new_state] = np.random.uniform(
                low=-2, high=0, size=env.action_space_n)

        print(reward, new_state, action)

        # If simulation did not end yet after last step - update Q table
        if not done:

            # Maximum possible Q value in next step (for new state)
            max_future_q = np.max(q_table[new_state])

            # Current Q value (for current state and performed action)
예제 #14
0
def sarsa_episode(lam):
    """Run one Sarsa(lambda) episode with feature-based action values.

    An action value is the sum of the element-wise product between a state's
    feature array and the matching q plane of ``state_info``.  The
    eligibility planes are reset at the start, incremented by the current
    features on every step and decayed by ``lam``.

    ``state_info``, ``epsilon``, ``alpha`` and the ``*_index`` names are
    module-level globals defined outside this block.

    :param lam: the lambda (trace decay) parameter
    """
    hit = environment.Action.HIT
    stick = environment.Action.STICK

    def q_estimate(feature_grid, q_index):
        # value of the action whose weights live in plane q_index
        return np.sum(np.multiply(feature_grid, state_info[:, :, q_index]))

    def q_plane(act):
        # anything that is not HIT is treated as STICK
        return q_hit_index if act == hit else q_stick_index

    # reset all eligibility traces
    state_info[:, :, e_hit_index] = 0
    state_info[:, :, e_stick_index] = 0

    # initial state S: dealer card is drawn first, then the player card
    state = environment.State(random.randint(1, 10), random.randint(1, 10))
    features = state.get_features()

    # initial action A: fair coin
    action = stick if random.random() < 0.5 else hit

    while not state.terminated:
        # take action A, observe R and S'
        state_new = environment.step(state, action)
        reward = state_new.reward
        features_new = state_new.get_features()

        # choose A' epsilon-greedily (no action in a terminal state)
        if state_new.terminated:
            action_new = environment.Action.NONE
        elif random.random() < epsilon:
            action_new = hit if random.random() < 0.5 else stick
        else:
            v_hit = q_estimate(features_new, q_hit_index)
            v_stick = q_estimate(features_new, q_stick_index)
            action_new = hit if v_hit > v_stick else stick

        # TD error
        q_value = q_estimate(features, q_plane(action))
        if state_new.terminated:
            q_value_new = 0
        else:
            q_value_new = q_estimate(features_new, q_plane(action_new))
        delta = reward + q_value_new - q_value

        # accumulate the trace of the action that was taken
        trace_index = e_hit_index if action == hit else e_stick_index
        state_info[:, :, trace_index] += features

        # move all weights along their traces
        state_info[:, :, q_hit_index] += (
            alpha * delta * state_info[:, :, e_hit_index])
        state_info[:, :, q_stick_index] += (
            alpha * delta * state_info[:, :, e_stick_index])

        # decay all traces
        state_info[:, :, e_hit_index] = lam * state_info[:, :, e_hit_index]
        state_info[:, :, e_stick_index] = lam * state_info[:, :, e_stick_index]

        # S <- S', A <- A'
        state, action, features = state_new, action_new, features_new
예제 #15
0
        # Body of one training episode; the enclosing loop header and the
        # end of this fragment are not visible in this excerpt.
        cumulated_reward = 0  # Should going forward give more reward than L/R?

        observation = environment.reset()

        # Scale epsilon by epsilon_discount while it is still above 0.05
        if qlearn.epsilon > 0.05:
            qlearn.epsilon *= epsilon_discount

        # The state key is the observation's elements joined into one string
        state = ''.join(map(str, observation))
        # print("State = ",state," observation = ",observation)
        for i in range(1500):

            # Pick an action based on the current state
            action = qlearn.chooseAction(state)

            # Execute the action and get feedback
            observation, reward, done, info = environment.step(action)
            cumulated_reward += reward

            # Track the best cumulative reward seen so far
            if highest_reward < cumulated_reward:
                highest_reward = cumulated_reward

            nextState = ''.join(map(str, observation))

            # Learn from the (state, action, reward, next state) transition
            qlearn.learn(state, action, reward, nextState)

            # environment.monitor.flush(force=True)
            print(i, " S= ", state, " A = ", action, 'observation = ',
                  observation)
            if not (done):
                state = nextState
            else:
def sarsa_lambda(l=0.9, max_episodes=1000, policy=policies.epsilon_greedy,
                 n_zero=100, gamma=1, plot_learning_curve=True, multiproc=True):
    """ Applies eligibility trace version of Sarsa to the game Easy21

    :param l: lambda parameter
    :param max_episodes: stop learning after this many episodes
    :param policy: exploration strategy to use
    :param n_zero: epsilon greedy constant (only applicable if epsilon greedy policy is used)
    :param gamma: discounting rate
    :param plot_learning_curve: whether to turn on plotting of learning curve for lambda = 0 and 1
    :param multiproc: whether to use multiprocessing when doing plots or not (warning! turn off if running multiple
        algorithms on mac or windows simultaneously)
    :return: value function after max_episodes
    """
    # (player, dealer, action) key
    value_function = defaultdict(float)
    # (player, dealer) key
    counter_state = defaultdict(int)
    # (player, dealer, action) key
    counter_state_action = defaultdict(int)
    # no. of wins to calculate the percentage of wins at the end
    wins = 0

    # the learning curve is only tracked for lambda = 0 and lambda = 1
    track_curve = l in {0, 1} and plot_learning_curve
    if track_curve:
        learning_curve = []
        # Reference values come from a cached Monte Carlo run when available.
        # NOTE: pickle.load executes arbitrary code from the file; the cache
        # must only ever be a file this project wrote itself.
        try:
            with open("Data/MC_value_function.pickle", "rb") as cache_file:
                mc_values = pickle.load(cache_file)
        except Exception:
            # Missing or unreadable cache: recompute.  ``Exception`` (rather
            # than a bare except) lets Ctrl-C and SystemExit propagate instead
            # of triggering a long Monte Carlo run.
            mc_values = monte_carlo(iterations=1000000)

    for episode in range(max_episodes):

        # current (player, dealer, action)
        eligibility_trace = defaultdict(float)

        # initial state, action [SA..]
        state = environment.State()
        player_current = state.player_sum
        dealer_current = state.dealer_first_card
        epsilon = n_zero / float(n_zero + counter_state[(player_current, dealer_current)])
        action_current = policy(epsilon, value_function, state)

        while not state.terminal:

            # update counts
            counter_state[(player_current, dealer_current)] += 1
            counter_state_action[(player_current, dealer_current, action_current)] += 1

            # take a step, get reward [..R..]
            [state, reward] = environment.step(state, action_current)
            if reward is None:
                reward = 0

            # follow up state, action [..SA]
            player_next = state.player_sum
            dealer_next = state.dealer_first_card
            epsilon = n_zero / float(n_zero + counter_state[(player_next, dealer_next)])
            action_next = policy(epsilon, value_function, state)

            delta = reward + gamma * value_function[(player_next, dealer_next, action_next)] - \
                value_function[(player_current, dealer_current, action_current)]

            alpha = 1.0 / counter_state_action[(player_current, dealer_current, action_current)]

            eligibility_trace[(player_current, dealer_current, action_current)] += 1

            # update the values
            for key in value_function:
                value_function[key] += alpha * delta * eligibility_trace[key]
                eligibility_trace[key] *= gamma * l

            player_current = player_next
            dealer_current = dealer_next
            action_current = action_next

        # use it later to calculate the percentage of wins
        if reward == 1:
            wins += 1

        # get the episode MSE for plotting learning curve
        if track_curve:
            learning_curve.append((episode, utilities.calculate_mse(mc_values, value_function)))

    # plot learning curve
    if track_curve:
        if multiproc:
            # create a new process so computation can continue after plotting
            p = Process(target=plotting.plot_learning_curve, args=(learning_curve, l,))
            p.start()
        else:
            plotting.plot_learning_curve(learning_curve, l)

    # fraction of episodes won; guarded so max_episodes == 0 does not raise
    win_ratio = float(wins) / max_episodes if max_episodes else 0.0
    # single-argument print(...) gives the same output on Python 2 and 3
    print(win_ratio)
    return value_function
예제 #17
0
# Training driver: keeps starting new episodes until maxSteps environment
# steps have been taken in total.
step = 0  # total environment steps across all episodes
ep = 0  # number of episodes started
while step < maxSteps:
    ep += 1
    x = environment.reset()  # initialize the state
    C = 0.  # per-episode running estimate, updated from c below

    done = False
    t = 1
    while not done:
        t += 1
        step += 1
        a = agent.action(x)
        u = Actions[a]  # entry of Actions selected by the agent's choice
        #env.render() # only for visual effects
        x_next, c, done = environment.step(u, x)

        # Incremental update with step size 1/t.  t is incremented before
        # its first use, so the first sample is weighted by 1/2.
        C += (1. / t) * (c - C)
        agent.update(x, a, c, x_next, done)
        x = x_next

        # Both exits below skip the R.append, so the step that ends the
        # episode (or reaches maxSteps) is not recorded in R.
        if done:
            break

        if step >= maxSteps:
            break

        R.append(C)

    # NOTE(review): this prints c (the last step's value), not the running
    # estimate C, under the 'Ave.' label -- confirm which one is intended.
    print('Episode:', ep, 'Total Steps:', step, ', Ave. Reward/Power :', c,
          ', Episode Length:', t - 1)
예제 #18
0
def sarsa_episode(lam):
    """
    Run one tabular Sarsa(lambda) episode, updating ``state_info`` in place.

    Exploration uses ``epsilon = n0 / (n0 + N(state))`` and the step size is
    1 / N(state, action).  ``state_info``, ``n0`` and the ``*_index`` names
    are module-level globals defined outside this block.

    :param lam:     the lambda (trace decay) parameter
    :return:        None
    """
    hit = environment.Action.HIT
    stick = environment.Action.STICK

    def cell(some_state):
        # zero-based (dealer, player) coordinates into the state table
        return some_state.dealer_card - 1, some_state.player_sum - 1

    # reset all eligibility traces
    state_info[:, :, e_hit_index] = 0
    state_info[:, :, e_stick_index] = 0

    # initial state S: dealer card is drawn first, then the player card
    state = environment.State(random.randint(1, 10), random.randint(1, 10))

    # initial action A: fair coin
    action = stick if random.random() < 0.5 else hit

    while not state.terminated:
        # coordinates of S are taken before stepping
        dealer_ix, player_ix = cell(state)

        # take action A, observe R and S'
        state_new = environment.step(state, action)
        reward = state_new.reward
        dealer_ix_new, player_ix_new = cell(state_new)

        # choose A' epsilon-greedily (no action in a terminal state)
        if state_new.terminated:
            action_new = environment.Action.NONE
        else:
            epsilon = n0 / (n0 + state_info[dealer_ix_new, player_ix_new,
                                            ns_index])
            if random.random() < epsilon:
                action_new = hit if random.random() < 0.5 else stick
            else:
                # hit only when it is strictly better than sticking
                prefers_hit = (
                    state_info[dealer_ix_new, player_ix_new, q_hit_index] >
                    state_info[dealer_ix_new, player_ix_new, q_stick_index])
                action_new = hit if prefers_hit else stick

        # planes belonging to the action taken in S (always HIT or STICK
        # inside the loop, because a terminal S' ends it)
        if action == hit:
            count_ix, q_ix, trace_ix = ns_hit_index, q_hit_index, e_hit_index
        else:
            count_ix, q_ix, trace_ix = (ns_stick_index, q_stick_index,
                                        e_stick_index)

        # visit counts for S and for (S, A)
        state_info[dealer_ix, player_ix, ns_index] += 1
        state_info[dealer_ix, player_ix, count_ix] += 1

        # TD error
        q_value = state_info[dealer_ix, player_ix, q_ix]
        if state_new.terminated:
            q_value_new = 0
        else:
            next_q_ix = q_hit_index if action_new == hit else q_stick_index
            q_value_new = state_info[dealer_ix_new, player_ix_new, next_q_ix]
        delta = reward + q_value_new - q_value

        # step size from the updated count, then bump the trace of (S, A)
        alpha = 1 / state_info[dealer_ix, player_ix, count_ix]
        state_info[dealer_ix, player_ix, trace_ix] += 1

        # move all values along their traces
        state_info[:, :, q_hit_index] += (
            alpha * delta * state_info[:, :, e_hit_index])
        state_info[:, :, q_stick_index] += (
            alpha * delta * state_info[:, :, e_stick_index])

        # decay all traces
        state_info[:, :, e_hit_index] = lam * state_info[:, :, e_hit_index]
        state_info[:, :, e_stick_index] = lam * state_info[:, :, e_stick_index]

        # S <- S', A <- A'
        state, action = state_new, action_new
예제 #19
0
# Runs training trials and records per-trial turn counts and wins.
# NOTE(review): done, obs and r are read below before any assignment that is
# visible here -- they must be initialised earlier in the file.
x = np.array([])  # trial indices
y = np.array([])  # turny / 1000 for each trial
win_array = np.array([])  # wins for each trial
win_sum = 0  # wins accumulated over all trials

#agent.load('final_agent')

for cnt2 in range(1):
    turny = 0  # turns accumulated over the episodes of this trial
    win = 0
    print("試行回数" + str(cnt2+1))
    for cnt in range(1):
        turn = 0 
        while not done:
            action = agent.act_and_train(obs, r)
            obs, r, done, info = env.step(action)
            turn += 1
            # a reward of exactly 10 is counted as a win
            if r == 10:
                win += 1
    
        turny += turn
        
        # close the episode, then reset the environment and loop state
        agent.stop_episode_and_train(obs, r, done)
        obs = env.reset()
        r = 0
        done = False  

    x = np.append(x, cnt2)     
    # NOTE(review): divides by 1000 although the inner loop above runs only
    # once per trial -- confirm the intended episode count.
    y = np.append(y, turny/1000)
    win_array = np.append(win_array, win)
    win_sum += win
예제 #20
0
def linear_function_approximation(l=0.9,
                                  max_episodes=1000,
                                  policy=policies.epsilon_greedy_lfa,
                                  n_zero=100,
                                  gamma=1,
                                  plot_learning_curve=True,
                                  multiproc=True):
    """ Value function approximation using coarse coding

    Runs Sarsa(lambda) with accumulating eligibility traces on a linear
    approximator: Q(s, a) is ``phi(s, a).dot(theta)`` with a 36-element
    weight vector ``theta``. The exploration rate (0.05) and the step size
    (0.01) are fixed inside the function.

    :param l: lambda parameter
    :param gamma: discounting rate
    :param max_episodes: stop learning after this many episodes
    :param policy: exploration strategy to use; called as
        ``policy(epsilon, theta, state_features)`` and expected to return a
        ``(q_value, action)`` pair
    :param n_zero: epsilon greedy constant (kept for signature compatibility;
        this function does not read it)
    :param plot_learning_curve: whether to record and plot the per-episode MSE
        against the Monte Carlo values (only done for lambda = 0 and 1)
    :param multiproc: whether to use multiprocessing when doing plots or not (warning! turn off if running multiple
        algorithms on mac or windows simultaneously)
    :return: value function after max_episodes, a defaultdict keyed by
        (player_sum, dealer_first_card, action)
    """

    def compute_value_function(weights):
        # tabulate phi(s, a).dot(weights) for every (player, dealer, action)
        table = defaultdict(float)
        for player in range(1, 22):
            for dealer in range(1, 11):
                for action in [0, 1]:
                    s = environment.State(dealer, player)
                    phi = utilities.get_state_action_features(
                        utilities.get_state_features(s), action)
                    table[(s.player_sum, s.dealer_first_card,
                           action)] = phi.dot(weights)
        return table

    # weights vector for the state_action feature vector
    theta = np.random.random(36) * 0.2
    # random move probability
    epsilon = 0.05
    # step-size parameter
    alpha = 0.01

    # learning curve plotting
    track_curve = l in {0, 1} and plot_learning_curve
    if track_curve:
        learning_curve = []
        try:
            # context manager so the file handle is always closed
            with open("Data/MC_value_function.pickle", "rb") as handle:
                mc_values = pickle.load(handle)
        except Exception:
            # best effort: any load failure falls back to recomputing, but
            # KeyboardInterrupt / SystemExit are no longer swallowed
            mc_values = monte_carlo(iterations=1000000)

    for episode in range(max_episodes):

        # key is state_action feature vector
        eligibility_trace = np.zeros(36)

        # initial state, action [SA..], and set of features
        state = environment.State()
        # calculate features for the given state
        state_features_current = utilities.get_state_features(state)
        # get action from this state
        q_a_current, action_current = policy(epsilon, theta,
                                             state_features_current)
        # calculate final state, action feature vector
        features_current = utilities.get_state_action_features(
            state_features_current, action_current)

        while not state.terminal:

            # update eligibility trace (accumulating)
            eligibility_trace = np.add(eligibility_trace, features_current)

            # take a step, get reward [..R..]
            [state, reward] = environment.step(state, action_current)
            if reward is None:
                reward = 0

            # follow up state, action [..SA]
            state_features_next = utilities.get_state_features(state)
            q_a_next, action_next = policy(epsilon, theta, state_features_next)
            features_next = utilities.get_state_action_features(
                state_features_next, action_next)
            if state.terminal:
                # the value of a terminal state is zero by definition, so
                # the final update must not bootstrap from an estimate
                q_a_next = 0

            # calculate state value difference
            delta = reward + gamma * q_a_next - q_a_current
            # update weights
            theta = np.add(theta, alpha * delta * eligibility_trace)
            # update trace
            eligibility_trace *= gamma * l

            features_current = features_next
            action_current = action_next
            # carry the Q estimate forward together with the action; without
            # this every later step compared against the first step's value
            q_a_current = q_a_next

        # get the episode MSE for plotting learning curve; the table is only
        # needed per episode when the curve is being recorded
        if track_curve:
            value_function = compute_value_function(theta)
            learning_curve.append(
                (episode, utilities.calculate_mse(mc_values, value_function)))

    # calculate value function (also defined when max_episodes == 0)
    value_function = compute_value_function(theta)

    # plot learning curves
    if track_curve:
        if multiproc:
            # create a new process so computation can continue after plotting
            p = Process(target=plotting.plot_learning_curve,
                        args=(
                            learning_curve,
                            l,
                        ))
            p.start()
        else:
            plotting.plot_learning_curve(learning_curve, l)

    return value_function
def linear_function_approximation(l=0.9, max_episodes=1000, policy=policies.epsilon_greedy_lfa, n_zero=100,
                                  gamma=1, plot_learning_curve=True, multiproc=True):
    """ Value function approximation using coarse coding

    Runs Sarsa(lambda) with accumulating eligibility traces on a linear approximator: Q(s, a) is
    ``phi(s, a).dot(theta)`` with a 36-element weight vector ``theta``. The exploration rate (0.05) and the
    step size (0.01) are fixed inside the function.

    :param l: lambda parameter
    :param gamma: discounting rate
    :param max_episodes: stop learning after this many episodes
    :param policy: exploration strategy to use; called as ``policy(epsilon, theta, state_features)`` and expected
        to return a ``(q_value, action)`` pair
    :param n_zero: epsilon greedy constant (kept for signature compatibility; this function does not read it)
    :param plot_learning_curve: whether to record and plot the per-episode MSE against the Monte Carlo values
        (only done for lambda = 0 and 1)
    :param multiproc: whether to use multiprocessing when doing plots or not (warning! turn off if running multiple
        algorithms on mac or windows simultaneously)
    :return: value function after max_episodes, a defaultdict keyed by (player_sum, dealer_first_card, action)
    """

    def tabulate_values(weights):
        # evaluate phi(s, a).dot(weights) for every (player, dealer, action)
        table = defaultdict(float)
        for player in range(1, 22):
            for dealer in range(1, 11):
                for action in [0, 1]:
                    s = environment.State(dealer, player)
                    phi = utilities.get_state_action_features(utilities.get_state_features(s), action)
                    table[(s.player_sum, s.dealer_first_card, action)] = phi.dot(weights)
        return table

    # weights vector for the state_action feature vector
    theta = np.random.random(36)*0.2
    # random move probability
    epsilon = 0.05
    # step-size parameter
    alpha = 0.01

    # learning curve plotting
    record_curve = l in {0, 1} and plot_learning_curve
    if record_curve:
        learning_curve = []
        try:
            # context manager so the file handle is always closed
            with open("Data/MC_value_function.pickle", "rb") as pickle_file:
                mc_values = pickle.load(pickle_file)
        except Exception:
            # best effort: any load failure falls back to recomputing, but KeyboardInterrupt / SystemExit
            # are no longer swallowed
            mc_values = monte_carlo(iterations=1000000)

    for episode in range(max_episodes):

        # key is state_action feature vector
        eligibility_trace = np.zeros(36)

        # initial state, action [SA..], and set of features
        state = environment.State()
        # calculate features for the given state
        state_features_current = utilities.get_state_features(state)
        # get action from this state
        q_a_current, action_current = policy(epsilon, theta, state_features_current)
        # calculate final state, action feature vector
        features_current = utilities.get_state_action_features(state_features_current, action_current)

        while not state.terminal:

            # update eligibility trace (accumulating)
            eligibility_trace = np.add(eligibility_trace, features_current)

            # take a step, get reward [..R..]
            [state, reward] = environment.step(state, action_current)
            if reward is None:
                reward = 0

            # follow up state, action [..SA]
            state_features_next = utilities.get_state_features(state)
            q_a_next, action_next = policy(epsilon, theta, state_features_next)
            features_next = utilities.get_state_action_features(state_features_next, action_next)
            if state.terminal:
                # the value of a terminal state is zero by definition, so the final update must not
                # bootstrap from an estimate
                q_a_next = 0

            # calculate state value difference
            delta = reward + gamma * q_a_next - q_a_current
            # update weights
            theta = np.add(theta, alpha * delta * eligibility_trace)
            # update trace
            eligibility_trace *= gamma * l

            features_current = features_next
            action_current = action_next
            # carry the Q estimate forward together with the action; without this every later step
            # compared against the first step's value
            q_a_current = q_a_next

        # get the episode MSE for plotting learning curve; the table is only needed per episode when the
        # curve is being recorded
        if record_curve:
            value_function = tabulate_values(theta)
            learning_curve.append((episode, utilities.calculate_mse(mc_values, value_function)))

    # calculate value function (also defined when max_episodes == 0)
    value_function = tabulate_values(theta)

    # plot learning curves
    if record_curve:
        if multiproc:
            # create a new process so computation can continue after plotting
            p = Process(target=plotting.plot_learning_curve, args=(learning_curve, l,))
            p.start()
        else:
            plotting.plot_learning_curve(learning_curve, l)

    return value_function
예제 #22
0
def sarsa_lambda(l=0.9,
                 max_episodes=1000,
                 policy=policies.epsilon_greedy,
                 n_zero=100,
                 gamma=1,
                 plot_learning_curve=True,
                 multiproc=True):
    """ Applies eligibility trace version of Sarsa to the game Easy21

    The step size for a state-action pair is 1 / (visits to that pair) and
    epsilon for a state is n_zero / (n_zero + visits to that state). The
    fraction of episodes whose last reward was 1 is printed before returning.

    :param l: lambda parameter
    :param max_episodes: stop learning after this many episodes
    :param policy: exploration strategy to use; called as
        ``policy(epsilon, value_function, state)`` and expected to return an action
    :param n_zero: epsilon greedy constant (only applicable if epsilon greedy policy is used)
    :param gamma: discounting rate
    :param plot_learning_curve: whether to turn on plotting of learning curve for lambda = 0 and 1
    :param multiproc: whether to use multiprocessing when doing plots or not (warning! turn off if running multiple
        algorithms on mac or windows simultaneously)
    :return: value function after max_episodes, a defaultdict keyed by
        (player, dealer, action)
    """
    # (player, dealer, action) key
    value_function = defaultdict(float)
    # (player, dealer) key
    counter_state = defaultdict(int)
    # (player, dealer, action) key
    counter_state_action = defaultdict(int)
    # no. of wins to calculate the percentage of wins at the end
    wins = 0

    # learning curve plotting
    track_curve = l in {0, 1} and plot_learning_curve
    if track_curve:
        learning_curve = []
        try:
            # context manager so the file handle is always closed
            with open("Data/MC_value_function.pickle", "rb") as handle:
                mc_values = pickle.load(handle)
        except Exception:
            # best effort: any load failure falls back to recomputing, but
            # KeyboardInterrupt / SystemExit are no longer swallowed
            mc_values = monte_carlo(iterations=1000000)

    for episode in range(max_episodes):

        # current (player, dealer, action)
        eligibility_trace = defaultdict(float)

        # initial state, action [SA..]
        state = environment.State()
        player_current = state.player_sum
        dealer_current = state.dealer_first_card
        epsilon = n_zero / float(n_zero + counter_state[
            (player_current, dealer_current)])
        action_current = policy(epsilon, value_function, state)

        while not state.terminal:
            key_current = (player_current, dealer_current, action_current)

            # update counts
            counter_state[(player_current, dealer_current)] += 1
            counter_state_action[key_current] += 1

            # take a step, get reward [..R..]
            [state, reward] = environment.step(state, action_current)
            if reward is None:
                reward = 0

            # follow up state, action [..SA]
            player_next = state.player_sum
            dealer_next = state.dealer_first_card
            epsilon = n_zero / float(n_zero +
                                     counter_state[(player_next, dealer_next)])
            action_next = policy(epsilon, value_function, state)

            # the lookup is kept for terminal states too, so the set of keys
            # in the returned table is the same as before
            q_next = value_function[(player_next, dealer_next, action_next)]
            if state.terminal:
                # the value of a terminal state is zero by definition, so
                # the final update must not bootstrap from the table
                q_next = 0

            delta = reward + gamma * q_next - value_function[key_current]

            alpha = 1.0 / counter_state_action[key_current]

            eligibility_trace[key_current] += 1

            # update the values; only traced pairs can change, and every
            # traced key already exists in value_function (read for delta)
            for key in eligibility_trace:
                value_function[key] += alpha * delta * eligibility_trace[key]
                eligibility_trace[key] *= gamma * l

            player_current = player_next
            dealer_current = dealer_next
            action_current = action_next

        # use it later to calculate the percentage of wins
        if reward == 1:
            wins += 1

        # get the episode MSE for plotting learning curve
        if track_curve:
            learning_curve.append(
                (episode, utilities.calculate_mse(mc_values, value_function)))

    # plot learning curve
    if track_curve:
        if multiproc:
            # create a new process so computation can continue after plotting
            p = Process(target=plotting.plot_learning_curve,
                        args=(
                            learning_curve,
                            l,
                        ))
            p.start()
        else:
            plotting.plot_learning_curve(learning_curve, l)

    # get the percentage of wins; print(...) with one argument behaves the
    # same on Python 2 and 3, and is skipped when no episode was played
    if max_episodes:
        print(float(wins) / max_episodes)
    return value_function
예제 #23
0
def anime():
    """Run one animation tick and schedule the next one.

    Calls ``environment.step()``, hands the first ``number_ants`` entries of
    ``environment.population`` to ``afficher`` one by one, and - only while
    the module-level ``active`` flag equals ``True`` - asks ``fenetre`` to
    call this function again via ``fenetre.after(5, anime)``.
    """
    environment.step()
    for ant_index in range(number_ants):
        afficher(environment.population[ant_index])
    # explicit comparison kept on purpose: only a value equal to True re-arms
    if not (active == True):
        return
    # NOTE(review): fenetre is presumably a Tk widget, making 5 a delay in ms - confirm
    fenetre.after(5, anime)
예제 #24
0
 def expansion(self, a):
     """Expand the tree from this leaf node by taking action ``a``.

     Applies ``step`` to this node's state, wraps the result in a new
     ``Node`` (its state is the successor with ``reverse_player_positions()``
     applied) and registers it under ``self.children[a]``.

     Returns the newly expanded child node.
     """
     next_state, reward, is_terminal = step(self.s, a)
     child = Node(
         self.network,
         s=next_state.reverse_player_positions(),
         parent=self,
         prev_a=a,
         prev_r=reward,
         terminate=is_terminal,
     )
     self.children[a] = child
     return self.children[a]
예제 #25
0
파일: part4.py 프로젝트: dteoh/easy21
def lfa_sarsa_lambda(num_episodes=1000, lamba=0, gamma=1, alpha=0.01, yield_progress=False):
    """Generator running Sarsa(lambda) over coarse-coded cuboid features.

    Each feature is a function built by ``create_cuboid_fn`` for one
    (dealer interval, player interval, action) combination; a feature is
    active for ``(p, d, a)`` when calling it returns a truthy value. Weights
    and eligibility traces are dicts keyed by those feature functions, and
    Q(s, a) is the sum of the weights of the active features.

    :param num_episodes: number of episodes to play
    :param lamba: trace-decay (lambda) parameter
    :param gamma: discounting rate
    :param alpha: step size
    :param yield_progress: if true, yield ``(episode_number, q_table)`` after
        every episode; otherwise yield a single ``(num_episodes, q_table)``
        once all episodes are done
    :return: generator of ``(count, q_table)`` pairs where ``q_table`` maps
        ``(p, d, a)`` to the estimated action value
    """
    dealer_bands = [(1, 4), (4, 7), (7, 10)]
    player_bands = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]

    # Set up the coarse codes: one list of 3 x 6 feature functions per action.
    action_codes = {
        act: [create_cuboid_fn(dealer_band, player_band, act)
              for dealer_band in dealer_bands
              for player_band in player_bands]
        for act in list(Action)
    }

    def active_features(s, a):
        # yield the feature functions that fire for state s and action a
        player, dealer = s
        for feature in action_codes[a]:
            if feature(player, dealer, a):
                yield feature

    def q_estimate(s, a, w):
        # sum of the weights of the active features (missing weights are 0)
        return sum(w.get(feature, 0) for feature in active_features(s, a))

    def pick_greedy(s, w):
        # highest-valued action; the stable sort keeps the first action on ties
        scored = [(a, q_estimate(s, a, w)) for a in list(Action)]
        return sorted(scored, key=itemgetter(1), reverse=True)[0][0]

    def pick_e_greedy(s, w, epsilon=0.05):
        # every action gets epsilon / |A|; the greedy one gets the remainder
        best = pick_greedy(s, w)
        base_prob = epsilon / len(Action)
        probs = [1 - epsilon + base_prob if a is best else base_prob
                 for a in list(Action)]
        return sample_action(probs)

    def compile_q_sa(w):
        # tabulate Q for every state-action pair under weights w
        return {
            (p, d, a): q_estimate((p, d), a, w)
            for (p, d), a in generate_all_state_action_pairs()
        }

    weights = {}
    for episode_index in range(num_episodes):
        state = State()
        s = state.as_tuple()
        a = pick_e_greedy(s, weights)
        traces = {}
        while not state.terminal:
            state, reward = step(state, a)

            # TD error starts as r - Q(s, a); the visited features' traces
            # are bumped in the same pass
            td_error = reward
            for feature in active_features(s, a):
                td_error = td_error - weights.get(feature, 0)
                traces[feature] = traces.get(feature, 0) + 1

            if state.terminal:
                # no successor value to bootstrap from on the last step
                for feature, trace in traces.items():
                    weights[feature] = weights.get(feature, 0) + alpha * td_error * trace
                break

            s_next = state.as_tuple()
            a_next = pick_e_greedy(s_next, weights)
            for feature in active_features(s_next, a_next):
                td_error = td_error + gamma * weights.get(feature, 0)

            # apply the update, then decay each trace
            for feature, trace in traces.items():
                weights[feature] = weights.get(feature, 0) + alpha * td_error * trace
                traces[feature] = gamma * lamba * trace

            s, a = s_next, a_next

        if yield_progress:
            yield episode_index + 1, compile_q_sa(weights)

    if not yield_progress:
        yield num_episodes, compile_q_sa(weights)