done = False
rewards_current_episode = 0
print(f"{episode + 1} / {num_episodes}")

# Play episode
for step in range(max_steps_per_episode):
    # Exploration vs exploitation: take the greedy action unless we roll below epsilon
    exploration_rate_threshold = random.uniform(0, 1)
    if exploration_rate_threshold > exploration_rate:
        action = np.argmax(q_table[state, :])
    else:
        action = np.random.choice(action_space_size)

    # Step the environment
    new_state, reward, done, feedback = env.step(action)
    new_state = state_to_coord(new_state)

    # Q-learning update: blend the old value with the bootstrapped target
    q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
        learning_rate * (reward + feedback + discount_rate * np.max(q_table[new_state, :]))

    # Move to the new state and accumulate rewards
    state = new_state
    rewards_current_episode += reward

    if done:
        break

# End of episode
# Decay the exploration rate exponentially towards its minimum
exploration_rate = min_exploration_rate + \
    (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
# Pre-train
# agent.replay.tree.start = start
for i in range(k1):
    if i % 100 == 0:
        print("pretraining:", i)
    agent.learn()

# Train
accumulated_rewards_all_episodes = []
for episode in range(k2):
    s = env.reset()
    accumulated_rewards = 0
    done = False
    while not done:
        a = agent.choose_action(s)
        s_, r, done, feedback = env.step(a[0])
        accumulated_rewards += r
        # The shaping feedback is added to the stored reward, not to the episode tally
        r += feedback
        if done:
            r = 0  # TODO: not sure if this is necessary; try with, without, and with a different value
        agent.store_transition(s, a, r, s_, done)
        agent.learn()
        s = s_

    # Update exploration rate
    agent.eps = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
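# A minimal refactoring sketch, not part of the original script: both training loops
# above decay epsilon with the same exponential schedule, so it could be factored into
# a shared helper. Only names already used above are assumed; nothing new is introduced.
import numpy as np

def decayed_exploration_rate(episode, min_exploration_rate, max_exploration_rate,
                             exploration_decay_rate):
    # Epsilon decays exponentially from its maximum towards its minimum as episodes progress.
    return min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)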