Example #1
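The loop below relies on several names (q_table, the learning-rate and exploration hyperparameters, action_space_size) that this snippet never defines. Here is a minimal sketch of the setup it appears to assume; the sizes and numeric values are placeholders rather than values from the original, and env / state_to_coord are defined elsewhere in the source script:

import numpy as np
import random

state_space_size = 64                 # placeholder: number of discrete states
action_space_size = 4                 # placeholder: number of discrete actions
q_table = np.zeros((state_space_size, action_space_size))

num_episodes = 10000                  # placeholder training schedule
max_steps_per_episode = 100
learning_rate = 0.1                   # placeholder step size (alpha)
discount_rate = 0.99                  # placeholder discount factor (gamma)

exploration_rate = 1.0                # epsilon for the epsilon-greedy policy, decayed per episode
max_exploration_rate = 1.0
min_exploration_rate = 0.01
exploration_decay_rate = 0.001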
# Assumed from context: the snippet starts mid-script, inside this episode loop;
# the reset call mirrors how new states are converted below.
for episode in range(num_episodes):
    state = state_to_coord(env.reset())
    done = False
    rewards_current_episode = 0

    print(f"{episode + 1} / {num_episodes}")

    # Play episode
    for step in range(max_steps_per_episode):
        # Exploration vs Exploitation
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = np.random.choice(action_space_size)

        # Step
        new_state, reward, done, feedback = env.step(action)
        new_state = state_to_coord(new_state)

        # Update Q table (Bellman update; 'feedback' from env.step is added to the reward)
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
                                 learning_rate * (reward + feedback + discount_rate * np.max(q_table[new_state, :]))

        # state = new state and accumulate rewards
        state = new_state
        rewards_current_episode += reward

        if done:
            break
    # End of episode
    # Update exploration rate
    exploration_rate = min_exploration_rate + \
                       (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
Example #2
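For context: the pre-training loop below calls agent.learn() before the shown code stores any transitions, so it assumes agent.replay already holds experience (the commented-out agent.replay.tree.start line suggests the buffer is prepared elsewhere). A minimal, hypothetical random-policy warm-up under that assumption; warmup_steps and n_actions are assumed names, while store_transition and the env.step signature mirror the training loop further down:

import numpy as np

s = env.reset()
for _ in range(warmup_steps):                      # assumed number of warm-up transitions
    a = np.array([np.random.randint(n_actions)])   # random action, shaped like choose_action's output
    s_, r, done, feedback = env.step(a[0])
    agent.store_transition(s, a, r + feedback, s_, done)
    s = env.reset() if done else s_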
# Pre-train
#agent.replay.tree.start = start
for i in range(k1):
    if i % 100 == 0:
        print("pretraining:", i)
    agent.learn()

# Train
accumulated_rewards_all_episodes = []
for episode in range(k2):
    s = env.reset()
    accumulated_rewards = 0
    done = False
    while not done:
        a = agent.choose_action(s)
        s_, r, done, feedback = env.step(a[0])

        accumulated_rewards += r  # track the raw environment reward for this episode
        r += feedback             # add the shaping feedback to the reward used for learning

        if done:
            r = 0  # TODO: unclear whether zeroing the terminal reward helps; try with/without and with other values

        agent.store_transition(s, a, r, s_, done)
        agent.learn()
        s = s_

    # Update exploration rate
    """
    agent.eps = min_exploration_rate + \
                (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)