Example no. 1
import random
import numpy as np

t_min, t_max = 0.4, 0.6

# Algorithm
rewards_all_episodes = []

# Stats
deleted_all_episodes = []
corrected_all_episodes = []
incorrect_all_episodes = []
for episode in range(num_episodes):
    # Repair episode initialization
    R_trajectory = 0
    trajectory = []

    # Q Learning episode initialization
    state_v = env.reset()
    state_c = state_to_coord(state_v)

    done = False

    print(f"{episode + 1} / {num_episodes}")

    # Play episode
    for step in range(max_steps_per_episode):
        # Exploration vs Exploitation
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state_c, :])
        else:
            action = np.random.choice(action_space_size)
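
The excerpt stops right after the action is chosen. Below is a minimal sketch of the environment step and tabular Q-learning update that typically complete such a loop body; it reuses the names from this example (env, state_to_coord, q_table, trajectory, R_trajectory) and assumes learning_rate and discount_rate as defined in Example no. 2, so treat it as an illustration rather than the original author's code.

        # Continuation sketch (assumed, not part of the original excerpt):
        # take the chosen action and apply the tabular Q-learning update.
        new_state_v, reward, done, info = env.step(action)
        new_state_c = state_to_coord(new_state_v)

        # Bellman update of the Q-table entry for (state, action)
        q_table[state_c, action] = (1 - learning_rate) * q_table[state_c, action] + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state_c, :]))

        # Bookkeeping used by this example
        trajectory.append((state_c, action, reward))
        R_trajectory += reward

        state_c = new_state_c
        if done:
            break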
Example no. 2
import random
import numpy as np

max_steps_per_episode = 1000000

learning_rate = 0.1  # alpha
discount_rate = 0.99  # gamma

exploration_rate = 1  # epsilon
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001

# Algorithm
rewards_all_episodes = []

# Q learning
for episode in range(num_episodes):
    state = env.reset()
    state = state_to_coord(state)

    done = False
    rewards_current_episode = 0

    print(f"{episode + 1} / {num_episodes}")

    # Play episode
    for step in range(max_steps_per_episode):
        # Exploration vs Exploitation
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = np.random.choice(action_space_size)
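
Example no. 2 defines max_exploration_rate, min_exploration_rate and exploration_decay_rate, but the excerpt ends before they are used. A common way to combine them is the exponential epsilon decay sketched below, applied at the end of each episode; the exact schedule is an assumption, not shown in the original.

    # Assumed continuation: decay epsilon after each episode and log the return.
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * \
        np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)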
Example no. 3
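# Note: the indented lines below run inside a loop over pre-recorded
# demonstration transitions; the loop header is not part of this excerpt.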
    agent.store_demonstration(s, a, r, s_, done, int(episode_idx))

    if done:
        episode_idx += 1

# Pre-train
#agent.replay.tree.start = start
for i in range(k1):
    if i % 100 == 0:
        print("pretraining:", i)
    agent.learn()

# Train
accumulated_rewards_all_episodes = []
for episode in range(k2):
    s = env.reset()
    accumulated_rewards = 0
    done = False
    while not done:
        a = agent.choose_action(s)
        s_, r, done, feedback = env.step(a[0])

        accumulated_rewards += r  # track the raw environment return
        r += feedback  # add the env-provided feedback to the reward given to the agent

        if done:
            r = 0  # TODO: not sure if this is necessary; try with and without, and with a different value

        agent.store_transition(s, a, r, s_, done)
        agent.learn()
        s = s_
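
The training loop is cut off after s = s_, and accumulated_rewards_all_episodes is created but never appended in the visible code. The lines below are a hedged guess at the per-episode bookkeeping that usually closes such a loop, indented to sit inside for episode in range(k2) after the while loop; the print format is illustrative only.

    # Assumed continuation: record the episodic return once the episode ends.
    accumulated_rewards_all_episodes.append(accumulated_rewards)
    if (episode + 1) % 10 == 0:
        print(f"episode {episode + 1} / {k2}, return: {accumulated_rewards}")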