import os

import cv2
import numpy as np

# Environment, Agent, ReplayBuffer, DQN and QValueVisualiser are assumed to be provided by the
# coursework starter code.


def q_visual(environment, dqn, save_id):
    # Visualise the Q-values for every cell in the 10x10 state grid.
    q_visualiser = QValueVisualiser(environment, magnification=500)
    q_values = np.zeros((10, 10, 4))
    for row in range(10):
        for col in range(10):
            # Each cell centre is offset from the cell corner by half a cell width (0.05).
            x = col / 10 + 0.05
            y = row / 10 + 0.05
            state = (y, x)
            q_values[row, col, :] = dqn.get_q_values(state).tolist()[0]
    q_visualiser.draw_q_values(q_values)
    q_values_image = q_visualiser.get_image()
    # Build the output path with os.path.join rather than hard-coded Windows separators.
    path = os.getcwd()
    cv2.imwrite(os.path.join(path, '..', 'Figures', 'QValuesImage' + save_id + '.png'), q_values_image)
    cv2.waitKey(0)
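# Example usage (a minimal sketch: the Environment and DQN constructor calls follow the other
# snippets in this section, and the save_id suffix is illustrative):
if __name__ == "__main__":
    environment = Environment(display=False, magnification=500)
    dqn = DQN()
    q_visual(environment, dqn, save_id="_example")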
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm


def train_epsilon_greedy():
    # Initialise some parameters
    num_eps = 2000
    ep_length = 100
    # Create an environment.
    # If display is True, the environment will be displayed after every agent step. This can be
    # set to False to speed up training; the evaluation in part 2 of the coursework will be timed
    # with display=False.
    # Magnification determines how big the window will be when displaying the environment on your
    # monitor. For desktop monitors, a value of 1000 should be about right; for laptops, 500.
    # This value does not affect the underlying state space or the learning, only the
    # visualisation of the environment.
    environment = Environment(display=True, magnification=500)
    q_vis = QValueVisualiser(environment)
    agent = Agent(environment)
    buffer = ReplayBuffer()
    # Initialise the Q-network and target network
    dqn = DQN()
    losses = []
    iterations = []
    fig, ax = plt.subplots()
    ax.set(
        xlabel="Iteration",
        ylabel="Loss",
        title="Loss Curve for DQN (epsilon-greedy)",
    )
    # Loop over episodes
    for ep_idx in tqdm(range(num_eps)):
        # Reset the environment for the start of the episode.
        agent.reset()
        # Initialise the average loss for this episode
        episode_loss_average = 0
        # Loop over steps within this episode (ep_length steps per episode).
        for step_num in range(ep_length):
            # Step the agent once, and record the transition tuple
            policy = dqn.get_greedy_policy()
            transition = agent.step(policy)
            buffer.record_transition(transition)
            # Train the Q-network on a random sample from the replay buffer
            if buffer.length() >= 200:
                training_sample = buffer.sample_random_transitions(200)
                loss = dqn.train_network(training_sample)
                # Update the running average of the loss over the steps of this episode
                episode_loss_average += (loss - episode_loss_average) / (step_num + 1)
        if buffer.length() >= 200:
            iterations.append(ep_idx)
            losses.append(episode_loss_average)
        if ep_idx != 0 and ep_idx % 400 == 0:
            dqn.decrease_learning_rate(5)
    ax.plot(iterations, losses, color="blue")
    ax.set_yscale("log")
    # Save the figure before showing it, so the plot is not cleared by the blocking show() call.
    fig.savefig("epsilon_loss_curve.png")
    plt.show()
    final_q_values = dqn.get_q_values()
    q_vis.draw_q_values(final_q_values, "4a.png")
    # Compute the greedy policy from the Q-values
    policy = np.zeros((10, 10, 2), dtype=np.float32)
    for row in range(10):
        for col in range(10):
            action_values = final_q_values[row, col]
            best_action_idx = np.argmax(action_values)
            policy[row, col] = agent._discrete_action_to_continuous(best_action_idx)
    environment.draw_greedy_policy(policy, 25, "4b.png")
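# The loop above hides the exploration strategy inside dqn.get_greedy_policy() and agent.step().
# For reference, a minimal sketch of epsilon-greedy action selection with a linearly decaying
# epsilon (an assumption: the helper names and decay schedule below are illustrative, not taken
# from the original code):
def epsilon_greedy_action(q_values_for_state, epsilon, rng=None):
    # With probability epsilon pick a uniformly random action, otherwise pick the greedy action.
    rng = np.random.default_rng() if rng is None else rng
    if rng.random() < epsilon:
        return int(rng.integers(len(q_values_for_state)))
    return int(np.argmax(q_values_for_state))


def linearly_decayed_epsilon(step, epsilon_start=1.0, epsilon_end=0.05, decay_steps=10000):
    # Decay epsilon linearly from epsilon_start to epsilon_end over decay_steps, then hold it.
    fraction = min(step / decay_steps, 1.0)
    return epsilon_start + fraction * (epsilon_end - epsilon_start)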
        # --- Inside the per-step loop of each training episode ---
        transition = agent.step()
        replbuff.append(transition)
        if replbuff.len() >= 100:
            # Get a minibatch of size 100
            save = True
            minibatch = replbuff.sample(100)
            loss = dqn.train_q_network(minibatch)
            total_loss += loss
        # Sleep, so that you can observe the agent moving.
        # Note: this line should be removed when you want to speed up training.
        # time.sleep(0.2)

    # --- After the step loop, still inside the episode loop ---
    if save:
        # For plotting
        if (episode % target_update_frequency == 0) and (episode != 0):
            dqn.update_target()
        losses.append(total_loss / 20)  # average over the 20 steps in the episode
        iterations.append(episode + 1)

# --- After training: visualise the Q-values and greedy policy, and plot the loss curve ---
qs = dqn.get_q_values()
q_visualizer = QValueVisualiser(environment, magnification=500)
q_visualizer.draw_q_values(qs.reshape(10, 10, 4), '8_2')
env2 = Environment(display=True, magnification=500)
env2.draw_greedy_policy(qs.reshape(10, 10, 4), 10, '8_2')
plt.clf()
plt.plot(iterations, losses, color='blue')
plt.yscale('log')
plt.savefig("loss_exercise82.png")
plt.show()
# cv2.destroyAllWindows()
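# dqn.update_target() above implies a separate target network. A minimal sketch of the usual hard
# update (an assumption: the class and attribute names below are illustrative, not the coursework
# API):
import torch


class DQNWithTargetNetwork:
    def __init__(self, q_network: torch.nn.Module, target_network: torch.nn.Module):
        self.q_network = q_network
        self.target_network = target_network
        self.update_target()  # start with the two networks holding identical weights

    def update_target(self):
        # Hard update: copy the online Q-network's weights into the target network.
        self.target_network.load_state_dict(self.q_network.state_dict())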
# Evaluate the Q-network at the centre of every cell in the 10x10 grid.
q_values = np.zeros((10, 10, 4))
for row in range(10):
    for col in range(10):
        state_tensor = torch.tensor([0.1 * col + 0.05, 0.1 * row + 0.05])
        q_values[col, row, :] = dqn.q_network(state_tensor).detach().numpy()
# q_values = torch.reshape((dqn.q_network.output_layer.weight.data).t(), (10, 10, 4)).numpy()
print(q_values.shape)

# Draw the final Q-value pattern
visualiser = QValueVisualiser(
    environment=environment, magnification=500, title="Qvalues_bellman.png"
)  # Not sure whether this is working correctly or not.
# Draw the image; draw_q_values expects an array of shape (10, 10, 4),
# e.g. q_values = np.random.uniform(0, 1, [10, 10, 4])
visualiser.draw_q_values(q_values)

# FIND GREEDY POLICY.
# Find the greedy policy: from the starting state, take the action with the maximum Q-value,
# move to the resulting state, and repeat until the end of the episode.
agent.reset()
states = []
transition = agent.step()  # get the starting state
state = torch.tensor(transition[0])
states.append(state)
for i in range(EPISODE_LENGTH - 1):
    Q = dqn.q_network(state).detach()  # get all 4 Q-values for this state
    max_action = torch.argmax(Q)
    # Convert the discrete action into a continuous action.
    continuous_action = agent._discrete_action_to_continuous(max_action)
    # Take one step in the environment, based on the agent's current state.
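    # A possible completion of this loop body (an assumption: it mirrors the
    # environment.step(state, action) call used in the last snippet of this section, whose exact
    # return signature is not shown here):
    next_state, _ = environment.step(state.numpy(), continuous_action)
    state = torch.tensor(next_state, dtype=torch.float32)
    states.append(state)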
    # --- Inside the training loop ---
    # Only start training once the buffer holds enough transitions (the threshold here reuses the
    # epsilon_start constant).
    if len(my_buffer.buffer) >= epsilon_start:
        minibatch = my_buffer.minibatch(batch_size)
        # loss += dqn.train_q_network(minibatch)
        dqn.train_q_network(minibatch)
    # Periodically copy the Q-network weights into the target network. Note the parentheses:
    # without them, "epoch + 1 % update_net" is parsed as "epoch + (1 % update_net)", so the
    # update would essentially never fire.
    if (epoch + 1) % update_net == 0:
        dqn.update_target_network()

# --- After training: evaluate the Q-network at the centre of every cell in the 10x10 grid ---
all_states = np.zeros([100, 2])
i = 0
for col in range(10):
    for row in range(10):
        all_states[i, 0] = col / 10 + 0.05
        all_states[i, 1] = row / 10 + 0.05
        i += 1
states_tensor = torch.tensor(all_states, dtype=torch.float32)
# print(states_tensor)
q_values = dqn.q_network.forward(states_tensor).detach().numpy()
# print(q_values)
q_values = np.reshape(q_values, (10, 10, 4))
# Create an environment
environment = Environment(display=False, magnification=500)
# Create a visualiser
visualiser = QValueVisualiser(environment=environment, magnification=500)
# Draw the image
visualiser.draw_q_values(q_values)
environment.draw_greedy_line(q_values)
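# dqn.train_q_network(minibatch) is not shown above. A minimal sketch of a standard DQN training
# step with a Bellman target (an assumption: the minibatch layout, gamma and the function name
# below are illustrative, not taken from the original code):
import torch


def dqn_training_step(q_network, target_network, optimiser, minibatch, gamma=0.9):
    # The minibatch is assumed to be (states, actions, rewards, next_states) as numpy arrays.
    states, actions, rewards, next_states = minibatch
    states = torch.tensor(states, dtype=torch.float32)
    actions = torch.tensor(actions, dtype=torch.int64).unsqueeze(-1)
    rewards = torch.tensor(rewards, dtype=torch.float32)
    next_states = torch.tensor(next_states, dtype=torch.float32)

    # Q(s, a) for the actions that were actually taken.
    predicted = q_network(states).gather(1, actions).squeeze(-1)
    # Bellman target r + gamma * max_a' Q_target(s', a'); no gradient flows through the target net.
    with torch.no_grad():
        targets = rewards + gamma * target_network(next_states).max(dim=1).values
    loss = torch.nn.functional.mse_loss(predicted, targets)

    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
    return loss.item()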
print("Finished episode {}, average loss = {}".format(episode, losses[episode])) # evaluate Q-value q_values = np.zeros((10, 10, 4)) for col in range(10): x = col / 10 + 0.05 for row in range(10): y = row / 10 + 0.05 loc = torch.tensor([[x, y]], dtype=torch.float32) q_value = dqn.q_network.forward(loc) q_values[col, row] = q_value.detach().numpy() visualiser = QValueVisualiser(environment=environment, magnification=500) # Draw the image visualiser.draw_q_values(q_values, filename="q_values_epsilon_greedy.png") # draw greedy policy image = environment.draw(environment.init_state) loc = environment.init_state for _ in range(20): q = dqn.q_network.forward(torch.tensor(loc, dtype=torch.float32)) greedy_direction = np.argmax(q.detach().numpy()) action = agent._discrete_action_to_continuous(greedy_direction) next_loc, _ = environment.step(loc, action) loc_tuple = (int(loc[0] * environment.magnification), int((1 - loc[1]) * environment.magnification)) next_loc_tuple = (int(next_loc[0] * environment.magnification), int((1 - next_loc[1]) * environment.magnification)) cv2.line(image, loc_tuple, next_loc_tuple, (0,255,0), thickness=5)