Example #1
import numpy as np

def q_learning_table_agent(env, num_episodes=500, learning_rate=0.9, discount=0.95):
    # remember rewards for each step of each episode
    explainer = Explainer()

    # create reward table that maps each (state, action) pair
    # to a value (NChain-v0 has 5 states and 2 actions)
    reward_table = np.zeros((5, 2))

    # run episodes
    for episode in range(num_episodes):
        # create new explainer episode
        explainer.new_episode(episode)

        # reset env and get starting state
        state = env.reset()

        # run until env is finished
        done = False
        while not done:
            # get recorded rewards for this state
            prev_reward = reward_table[state, :]
            # if no previous record, choose random action
            if sum(prev_reward) == 0:
                action = np.random.randint(0, 2)
            # choose action with best reward
            else:
                action = np.argmax(prev_reward)

            # perform step
            new_state, reward, done, _ = env.step(action)

            # Q-learning update:
            # Q(s, a) = Q(s, a) + a * (r + y * max(Q(s', a')) - Q(s, a))
            reward_table[state, action] += learning_rate * \
                (reward + discount * np.max(reward_table[new_state, :]) -
                 reward_table[state, action])

            # update state
            state = new_state

            # save stats
            explainer.save(episode, reward)

    return reward_table, explainer
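These examples reference an Explainer helper and a gym environment that are not shown in the listing. Below is a minimal sketch of both, assuming Explainer only needs the new_episode() and save() methods used above and that the classic NChain-v0 environment (old gym step API returning four values) is installed; the real Explainer may record more detail.

import gym
import numpy as np

# Hypothetical stand-in for the Explainer used in these examples:
# it only accumulates the total reward collected in each episode.
class Explainer:
    def __init__(self):
        self.episode_rewards = {}

    def new_episode(self, episode):
        self.episode_rewards[episode] = 0.0

    def save(self, episode, reward):
        self.episode_rewards[episode] += reward

env = gym.make('NChain-v0')
table, explainer = q_learning_table_agent(env, num_episodes=500)
print(table)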
Example #2
import numpy as np

def naive_sum_reward_agent(env, num_episodes=500):
    # remember rewards for each step of each episode
    explainer = Explainer()

    # create reward table that maps each (state, action) pair
    # to an accumulated reward sum (NChain-v0 has 5 states and 2 actions)
    reward_table = np.zeros((5, 2))

    # run episodes
    for episode in range(num_episodes):
        # create new explainer episode
        explainer.new_episode(episode)

        # reset env and get starting state
        state = env.reset()

        # run until env is finished
        done = False
        while not done:
            # get recorded rewards for this state
            prev_reward = reward_table[state]
            # if no previous record, choose random action
            if sum(prev_reward) == 0:
                action = np.random.randint(0, 2)
            # choose action with best reward
            else:
                action = np.argmax(prev_reward)

            # perform step
            new_state, reward, done, _ = env.step(action)

            # save reward
            reward_table[state, action] += reward

            # update state
            state = new_state

            # save stats
            explainer.save(episode, reward)

    return reward_table, explainer
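With the stub Explainer above, the naive reward accumulator and the Q-learning agent can be compared on the same chain. A rough side-by-side run (assumed setup, not part of the original listing):

env = gym.make('NChain-v0')
naive_table, naive_explainer = naive_sum_reward_agent(env, num_episodes=500)
q_table, q_explainer = q_learning_table_agent(env, num_episodes=500)
print('naive sums:\n', naive_table)
print('q-learning:\n', q_table)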
Example #3
import gym
import numpy as np

def q_learning_table_rand_greedy_agent(num_episodes=500,
                                       learning_rate=0.8,
                                       discount=0.95,
                                       rand=0.5,
                                       rand_from=0,
                                       rand_to=250,
                                       show_every=False):
    # record the option values used for this run (rand is decayed in place below)
    opts = [learning_rate, discount, rand, rand_from, rand_to]

    # create env
    env = gym.make('NChain-v0')

    # get rand decay value
    rand_decay = rand / (rand_to - rand_from)

    # remember rewards for each step of each episode
    explainer = Explainer()

    # create reward (Q) table that maps each (state, action) pair to a value,
    # initialised with small random negative values
    reward_table = np.random.uniform(low=-2, high=0, size=(5, 2))

    # run episodes
    for episode in range(num_episodes):
        # create new explainer episode
        explainer.new_episode(episode)

        # reset env and get starting state
        state = env.reset()

        # decay rand linearly inside the [rand_from, rand_to) window,
        # then act greedily once the window has passed
        if rand_from <= episode < rand_to:
            rand -= rand_decay
        elif episode >= rand_to:
            rand = 0

        # show episode number
        if show_every and episode % show_every == 0:
            print(episode)

        # run until env is finished
        done = False
        while not done:
            if np.random.random() < rand:
                # explore: choose a random action
                action = np.random.randint(0, 2)
            else:
                # exploit: choose the action with the best recorded reward
                action = np.argmax(reward_table[state])

            # perform step
            new_state, reward, done, _ = env.step(action)

            # Q-learning update:
            # Q(s, a) = Q(s, a) + a * (r + y * max(Q(s', a')) - Q(s, a))
            reward_table[state, action] += learning_rate * \
                (reward + discount * np.max(reward_table[new_state, :]) -
                 reward_table[state, action])

            # update state
            state = new_state

            # save stats
            explainer.save(episode, reward)

    return reward_table, explainer, opts
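A usage sketch for the epsilon-greedy variant, again assuming the stub Explainer above (episode_rewards is an attribute of that stub, not necessarily of the real Explainer): rand decays linearly from 0.5 to 0 over the first 250 episodes, after which the agent acts purely greedily.

table, explainer, opts = q_learning_table_rand_greedy_agent(
    num_episodes=500, rand=0.5, rand_from=0, rand_to=250, show_every=100)
totals = [explainer.episode_rewards[e] for e in sorted(explainer.episode_rewards)]
print('mean reward over the last 50 episodes:', np.mean(totals[-50:]))
print(table)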