        epsilon = settings.initial_epsilon - (2 * episode / float(settings.episodes))
    else:
        # Final epsilon reached, stop annealing.
        epsilon = settings.final_epsilon

    # Select action
    if np.random.random() < epsilon:
        # Choose a random action
        action = np.random.randint(0, 4)
    else:
        # Choose the action with the highest Q-value
        action = np.argmax(q_values)

    # Take the action, observe the reward and check whether the state is terminal
    _, reward, terminal = env.perform_action(action)

    # Save values for stats
    epsilon_arr.append(epsilon)
    reward_arr.append(reward)
    q_max_arr.append(q_max)

    # Get the new state's Q-values
    new_state_row = env.actor_state_row()
    q_values_new = q_table.get_state_q(new_state_row)

    # Get max(Q(s',a')) to update Q(s,a)
    q_max_new = np.max(q_values_new)

    if not terminal:
        # Non-terminal state, update with reward + gamma * max(Q(s',a'))
        update = reward + (settings.gamma * q_max_new)
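For reference, the target computed on the last line feeds the standard tabular Q-learning update Q(s,a) ← Q(s,a) + α·(r + γ·max_a' Q(s',a') − Q(s,a)). Below is a minimal, self-contained sketch of that rule on a plain NumPy array; n_states, alpha, and the toy transition values are illustrative assumptions, not part of the original code.

import numpy as np

# Minimal sketch of the tabular Q-learning update (assumed values for illustration)
n_states, n_actions = 16, 4
alpha, gamma = 0.1, 0.99
q_table = np.zeros((n_states, n_actions))

state, action = 0, 2          # hypothetical current state and chosen action
reward, new_state = 1.0, 5    # hypothetical outcome of taking the action
terminal = False

# max(Q(s',a')) over the new state's Q-values
q_max_new = np.max(q_table[new_state])

# Non-terminal state: bootstrap from the next state; terminal state: use the raw reward
update = reward + gamma * q_max_new if not terminal else reward

# Move Q(s,a) toward the target by the learning rate alpha
q_table[state, action] += alpha * (update - q_table[state, action])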
        epsilon = settings.initial_epsilon - (2 * episode / float(settings.episodes))
    else:
        # Final epsilon reached, stop annealing.
        epsilon = settings.final_epsilon

    # Select action
    if np.random.random() < epsilon:
        # Choose a random action
        action = np.random.randint(0, 4)
    else:
        # Choose the action with the highest Q-value
        action = np.argmax(q_values)

    # Take the action, observe the new state and reward, check whether the state is terminal
    new_state, reward, terminal = env.perform_action(action)

    # Pop the oldest state
    new_states = np.delete(states, 0, 0)

    # Separate the remaining states into a tuple
    new_states_tuple = ()
    for i in range(len(new_states)):
        new_states_tuple += (new_states[i], )

    # Add the new state to the tuple and stack the new states
    new_states_tuple += (new_state.reshape(1, input_size), )
    new_states = np.stack(new_states_tuple)

    # Get the new state's Q-values
    q_values_new = rnn_network.predict(new_states)
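The sliding-window bookkeeping above (drop the oldest state, append the newest) keeps the recurrent network's input a fixed number of time steps long. It can be verified in isolation with the sketch below; input_size, the window length, and the dummy states are illustrative assumptions, not values from the original code.

import numpy as np

# Sketch of the sliding state window fed to the recurrent network
# (assumed shapes for illustration)
input_size = 4
window = 3

# Three stacked states, each of shape (1, input_size)
states = np.zeros((window, 1, input_size))
new_state = np.ones(input_size)   # stand-in for the state returned by the environment

# Pop the oldest state (row 0) ...
new_states = np.delete(states, 0, 0)

# ... and append the newest, keeping the window length constant
new_states_tuple = tuple(new_states) + (new_state.reshape(1, input_size), )
new_states = np.stack(new_states_tuple)

print(new_states.shape)   # (3, 1, 4): same window shape, shifted forward one step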