Example #1
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
# Agent, Environment, Converter and rolling_mean are assumed to come from the
# project's own modules; they are not shown in this excerpt.


def main(explore, exploit, trace):
    agent = Agent(pos=(0, 0))               # create an agent at initial position (0, 0)
    env = Environment()                     # create an environment
    convert = Converter(env)                # converts raw states to Q-table indices

    n_a = env.action_space_n                # get the size of action space
    n_s = env.state_space_n                 # get the size of state space

    q_table = np.zeros([n_s, n_a])

    if trace == 2:                          # trace == 2 enables eligibility traces
        e_table = np.zeros([n_s, n_a])      # initialize the eligibility-trace table
    else:
        e_table = None                      # no eligibility traces
    if explore == 1:                        # explore == 1 -> epsilon-greedy exploration
        ex_method = "greedy"
    else:
        ex_method = "softmax"               # otherwise softmax exploration

    n_episode = 1000
    n_timestep = 500

    window = 200                            # window size for the rolling statistics
    cleaning_rate = []                      # cleaning rate per episode
    returns = deque(maxlen=window)          # last `window` discounted returns
    avg_rt_count = []                       # rolling average of the discounted returns


    # for each episode
    for i_episode in range(n_episode):
        s = convert.state2tile(env.reset())
        a = agent.get_action(s, q_table, method=ex_method)

        # for each epoch
        clean_rate = 0
        for t in range(n_timestep):

            # Act: take a step and receive (new state, reward, termination flag, additional information)
            s_prime, reward, done, info = env.step(a)
            agent.store_transition(s, a, reward)

            # On the last episode only, render the environment and print the step info
            # (printing every episode would flood the output)
            if i_episode == n_episode - 1:
                env.display()
                print(info)

            # Select an action
            '''The exploration method must be given explicitly: {"softmax", "greedy"}
            (see the get_action sketch after this listing).'''
            # Bias the next action toward neighbouring cells that still contain trash
            good_acts = []
            (_x, _y) = (s_prime[0], s_prime[1])
            if (_x - 1, _y) in env.trashes:
                good_acts.append(0)
            if (_x + 1, _y) in env.trashes:
                good_acts.append(1)
            if (_x, _y - 1) in env.trashes:
                good_acts.append(2)
            if (_x, _y + 1) in env.trashes:
                good_acts.append(3)
            s_prime = convert.state2tile(s_prime)
            a_prime = agent.get_action(s_prime, q_table, good_acts=good_acts, method=ex_method)

            # Update the Q-value table
            '''
            The update rule is chosen implicitly from the arguments,
            i.e. from whether the actual a_prime is passed or a_prime=None.
            '''
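            # Presumed update rules (the Agent implementation is not shown in this excerpt):
            #   a_prime=None:   Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))   # Q-learning
            #   a_prime given:  Q(s,a) += alpha * (r + gamma * Q(s',a') - Q(s,a))          # SARSA
            # When e_table is supplied, the TD error is presumably also propagated
            # backwards through the eligibility traces (a SARSA(lambda)-style update).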
            if exploit == 1:
                agent.update(q_table, s, a, reward, s_prime, a_prime=None, e_table=e_table)
            else:
                agent.update(q_table, s, a, reward, s_prime, a_prime, e_table=e_table)

            # Transition to new state
            s = s_prime
            a = a_prime

            if done:
                reward_0 = agent.discounted_return()[0]
                clean_rate = (env.nb_trashes - len(env.trashes)) / env.nb_trashes
                returns.append(reward_0)
                avg_rt_count.append(np.average(returns))
                print("Episode: {0}\t Nb_Steps{1:>4}\t Epsilon: {2:.3f}\t Tau: {3:.3f}\t Clean Rate: {4:.3f}\t Discounted_return: {5:.3f}\t".format(
                    i_episode, t + 1, agent.epsilon, agent.tau, clean_rate, reward_0))
                # print(info)
                break

        # Clear the per-episode transition buffers
        agent.ep_rs.clear()
        agent.ep_obs.clear()
        agent.ep_as.clear()

        # Exploration schedules: multiplicative epsilon decay, linear tau adjustment
        agent.epsilon = agent.epsilon * agent.epsilon_decay
        agent.tau = agent.init_tau + i_episode * agent.tau_inc
        cleaning_rate.append(clean_rate)

    plt.ioff()
    fig = plt.figure(figsize=(7, 9))
    ax1 = fig.add_subplot(3, 1, 1)
    ax1.scatter(range(n_episode), cleaning_rate, color='r', label="Cleaning rate")
    ax1.legend()
    ax2 = fig.add_subplot(3, 1, 2)
    moving_avg = rolling_mean(cleaning_rate, n=window)
    ax2.plot(range(len(moving_avg)), moving_avg, color='r', label="Rolling average cleaning rate")
    ax2.legend()
    ax3 = fig.add_subplot(3, 1, 3)
    ax3.plot(range(len(avg_rt_count)), avg_rt_count, color='r', label="Rolling average discounted return")
    ax3.legend()
    plt.show()
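
The rolling_mean helper used for the plots above is not part of this excerpt. A minimal sketch, assuming it computes a trailing moving average over a window of n samples (the project's own implementation may differ):

import numpy as np

def rolling_mean(values, n=200):
    # Trailing moving average over windows of n samples (assumed behaviour).
    values = np.asarray(values, dtype=float)
    if len(values) < n:
        # Fewer samples than the window: fall back to the running (cumulative) mean.
        return values.cumsum() / np.arange(1, len(values) + 1)
    cumsum = np.cumsum(np.insert(values, 0, 0.0))
    return (cumsum[n:] - cumsum[:-n]) / n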
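Similarly, Agent.get_action is only called in this excerpt, never defined. A rough sketch of {"greedy", "softmax"} action selection over one row of the Q-table is shown below; the good_acts handling and all parameter names are assumptions, not the project's actual API:

import numpy as np

def get_action(state, q_table, good_acts=None, method="greedy", epsilon=0.1, tau=1.0):
    # Hypothetical action selection; not the project's actual Agent.get_action.
    q_row = q_table[state]
    if good_acts:
        # Assumed: prefer actions known to reach a neighbouring trash cell.
        return int(np.random.choice(good_acts))
    if method == "greedy":
        # Epsilon-greedy: random action with probability epsilon, otherwise the best Q-value.
        if np.random.rand() < epsilon:
            return int(np.random.randint(len(q_row)))
        return int(np.argmax(q_row))
    # Softmax: sample an action with probability proportional to exp(Q / tau).
    prefs = np.exp((q_row - np.max(q_row)) / tau)
    probs = prefs / prefs.sum()
    return int(np.random.choice(len(q_row), p=probs))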
Example #2
# Note: env, agent and convert are assumed to have been created beforehand,
# as in Example #1 above.
n_a = env.action_space_n  # get the action space size
n_s = env.state_space_n  # get the state space size

q_table = np.zeros([n_s, n_a])  # init Q table
e_table = np.zeros([n_s, n_a])  # init eligibility traces

# cleaning rate for each episode
clean_rate = []
crashes = []
crash_count = 0
n_episodes = 1000
n_timesteps = 500

#q_table=agent.NFQ(env, n_timesteps)
for i_episode in range(n_episodes):
    s = convert.state2tile(env.reset())
    a = agent.get_action(s, q_table, method="greedy")

    for t in range(n_timesteps):
        s_prime, reward, done, info = env.step(a)
        s_prime = convert.state2tile(s_prime)

        a_prime = agent.get_action(s_prime, q_table, method="greedy")

        agent.update(q_table, s, a, reward, s_prime, a_prime, e_table)

        # Transition to new state
        s = s_prime
        a = a_prime

        #crash = 0