while settings.episodes > episode: # Prepare environment for playing env.reset() # Reset or increment values terminal = False episode += 1 step = 0 q_max_arr = [] reward_arr = [] epsilon_arr = [] while not terminal and step < settings.train_step_limit: step += 1 # Get the Q-values of the current state state_row = env.actor_state_row() q_values = q_table.get_state_q(state_row) # Save max(Q(s,a)) for stats q_max = np.max(q_values) # Anneal epsilon if epsilon > settings.final_epsilon: epsilon = settings.initial_epsilon - (2 * episode / float(settings.episodes)) else: # Final epsilon reached, stop annealing. epsilon = settings.final_epsilon # Select action if (np.random.random() < epsilon): # Choose random action