Example #1
0
import os

import cv2
import numpy as np

# QValueVisualiser is assumed to be provided by the coursework scaffold.


def q_visual(environment, dqn, save_id):
    q_visualiser = QValueVisualiser(environment, magnification=500)
    q_values = np.zeros((10, 10, 4))

    for row in range(10):
        for col in range(10):
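            # Query the DQN at the centre of each cell of a 10x10 grid over the
            # (apparently unit-square) state space; each centre sits at index / 10 + 0.05.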
            x = col/10 + 0.05
            y = row/10 + 0.05
            state = (y, x)
            q_values[row, col, :] = dqn.get_q_values(state).tolist()[0]
    
    q_visualiser.draw_q_values(q_values)
    q_values_image = q_visualiser.get_image()
    path = os.getcwd()
    out_path = os.path.join(path, '..', 'Figures', 'QValuesImage' + save_id + '.png')
    cv2.imwrite(out_path, q_values_image)
    cv2.waitKey(0)
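
# A minimal usage sketch for q_visual (an illustration, not from the original code:
# Environment and DQN are the coursework scaffold classes, and the save_id suffix is
# arbitrary):
#     environment = Environment(display=False, magnification=500)
#     dqn = DQN()
#     q_visual(environment, dqn, save_id='untrained')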
Example #2
0
def train_epsilon_greedy():
    # Initialise some parameters
    num_eps = 2000
    ep_length = 100

    # Create an environment.
    # If display is True, the environment will be displayed after every agent step; set it
    # to False to speed up training. The evaluation in part 2 of the coursework will be
    # based on timings with display=False.
    # Magnification determines how big the window will be when displaying the environment
    # on your monitor: a value of about 1000 suits desktop monitors, and about 500 suits
    # laptops. This value does not affect the underlying state space or the learning, just
    # the visualisation of the environment.
    environment = Environment(display=True, magnification=500)
    q_vis = QValueVisualiser(environment)
    agent = Agent(environment)
    buffer = ReplayBuffer()

    # Initialise Q network and target network
    dqn = DQN()

    losses = []
    iterations = []

    fig, ax = plt.subplots()
    ax.set(
        xlabel="Iteration",
        ylabel="Loss",
        title="Loss Curve for DQN (epsilon-greedy)",
    )

    # Loop over episodes
    for ep_idx in tqdm(range(num_eps)):
        # Reset the environment for the start of the episode.
        agent.reset()
        # Initialise average loss for this episode
        episode_loss_average = 0
        # Loop over steps within this episode (ep_length = 100 steps per episode).
        for step_num in range(ep_length):
            # Step the agent once, and record the transition tuple
            policy = dqn.get_greedy_policy()
            transition = agent.step(policy)
            buffer.record_transition(transition)

            # Train the Q-network on a random sample from the replay buffer
            if buffer.length() >= 200:
                training_sample = buffer.sample_random_transitions(200)
                loss = dqn.train_network(training_sample)

                # update the episode loss average
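                # (running mean of the per-step loss: avg <- avg + (loss - avg) / (step_num + 1))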
                episode_loss_average += (loss -
                                         episode_loss_average) / (step_num + 1)

        if buffer.length() >= 200:
            iterations.append(ep_idx)
            losses.append(episode_loss_average)

        if ep_idx != 0 and ep_idx % 400 == 0:
            dqn.decrease_learning_rate(5)

    ax.plot(iterations, losses, color="blue")
    plt.yscale("log")
    fig.savefig("epsilon_loss_curve.png")
    plt.show()

    final_q_values = dqn.get_q_values()
    q_vis.draw_q_values(final_q_values, "4a.png")

    # compute greedy policy based on q values
    policy = np.zeros((10, 10, 2), dtype=np.float32)
    for row in range(10):
        for col in range(10):
            action_values = final_q_values[row, col]
            best_action_idx = np.argmax(action_values)
            policy[row,
                   col] = agent._discrete_action_to_continuous(best_action_idx)
    environment.draw_greedy_policy(policy, 25, "4b.png")
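
# _discrete_action_to_continuous (used above and in later examples) belongs to the Agent
# class and is not shown in these snippets. One plausible mapping, purely for
# illustration (the action ordering and the 0.1 step size are assumptions):
def example_discrete_action_to_continuous(action_index, step=0.1):
    # 0: right, 1: up, 2: left, 3: down
    moves = {0: (step, 0.0), 1: (0.0, step), 2: (-step, 0.0), 3: (0.0, -step)}
    return np.array(moves[action_index], dtype=np.float32)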
Example #3
0
            transition = agent.step()
            replbuff.append(transition)

            if replbuff.len() >= 100:
                # Get a minibatch of size 100
                save = True
                minibatch = replbuff.sample(100)
                loss = dqn.train_q_network(minibatch)
                total_loss += loss

            # Sleep, so that you can observe the agent moving. Note: this line should be removed when you want to speed up training
            #time.sleep(0.2)

        if save:
            # For plotting
            if episode % target_update_frequency == 0 and episode != 0:
                dqn.update_target()
            losses.append(total_loss / 20)
            iterations.append(episode + 1)
    qs = dqn.get_q_values()
    q_visualizer = QValueVisualiser(environment, magnification=500)
    q_visualizer.draw_q_values(qs.reshape(10, 10, 4), '8_2')
    env2 = Environment(display=True, magnification=500)
    env2.draw_greedy_policy(qs.reshape(10, 10, 4), 10, '8_2')

    plt.clf()
    plt.plot(iterations, losses, color='blue')
    plt.yscale('log')
    plt.savefig("loss_exercise82.png")
    plt.show()
    #cv2.destroyAllWindows()
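
# dqn.update_target() above presumably performs a hard update of the target network.
# A minimal sketch of such an update, assuming the DQN holds two torch modules
# (the names q_network and target_network are assumptions):
def hard_update(q_network, target_network):
    # Overwrite the target network's parameters with the Q-network's current weights.
    target_network.load_state_dict(q_network.state_dict())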
Example #4
0
    # Get the final Q-values for each state by querying the trained Q-network at the
    # centre of every grid cell.

    # Rather than reshaping a batched (100, 4) output into (10, 10, 4), this version
    # queries the network one cell at a time; a batched alternative is sketched after
    # the loop below.
    q_values = np.zeros((10, 10, 4))
    for row in range(10):
        for col in range(10):
            state_tensor = torch.tensor([0.1 * col + 0.05, 0.1 * row + 0.05])
            q_values[col,
                     row, :] = dqn.q_network(state_tensor).detach().numpy()

    #q_values = torch.reshape( (dqn.q_network.output_layer.weight.data).t(), (10,10,4)).numpy()
    print(q_values.shape)
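    # The (100, 4) -> (10, 10, 4) reshape asked about above can instead be done with one
    # batched forward pass (a hedged sketch; it assumes dqn.q_network accepts an (N, 2)
    # float tensor). Because the states are stacked with the first index varying slowest,
    # a row-major reshape puts the output for state (0.1 * i + 0.05, 0.1 * j + 0.05) at
    # q_values_batched[i, j], matching the loop above.
    grid = np.array([[0.1 * i + 0.05, 0.1 * j + 0.05]
                     for i in range(10) for j in range(10)], dtype=np.float32)
    q_values_batched = dqn.q_network(torch.tensor(grid)).detach().numpy().reshape(10, 10, 4)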
    # Draw final Q pattern
    visualiser = QValueVisualiser(
        environment=environment,
        magnification=500,
        title="Qvalues_bellman.png"
    )
    # Draw the image; draw_q_values expects an array of shape (10, 10, 4)
    visualiser.draw_q_values(q_values)

    # Find the greedy policy: starting from the initial state, repeatedly take the
    # action with the highest Q-value and move to the resulting state until the end
    # of the episode.
    agent.reset()
    states = []
    transition = agent.step()  # get the starting state
    state = torch.tensor(transition[0])
    states.append(state)
    for i in range(EPISODE_LENGTH - 1):
Example #5
0
            if len(my_buffer.buffer) >= epsilon_start:
                minibatch = my_buffer.minibatch(batch_size)
                #loss += dqn.train_q_network(minibatch)
                dqn.train_q_network(minibatch)

        if (epoch + 1) % update_net == 0:
            dqn.update_target_network()

    all_states = np.zeros([100, 2])
    i = 0
    for col in range(10):
        for row in range(10):
            all_states[i, 0] = col / 10 + 0.05
            all_states[i, 1] = row / 10 + 0.05
            i += 1
    states_tensor = torch.tensor(all_states, dtype=torch.float32)
    #print(states_tensor)
    q_values = dqn.q_network.forward(states_tensor).detach().numpy()
    #print(qs)
    q_values = np.reshape(q_values, (10, 10, 4))
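    # Row-major reshape: all_states was filled with col as the outer loop, so
    # q_values[col, row] holds the network output for (col / 10 + 0.05, row / 10 + 0.05).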

    # Create an environment
    environment = Environment(display=False, magnification=500)
    # Create a visualiser
    visualiser = QValueVisualiser(environment=environment, magnification=500)
    # Draw the image
    visualiser.draw_q_values(q_values)

    environment.draw_greedy_line(q_values)
Example #6
0
        
        losses[episode] = np.average(episode_losses)
        print("Finished episode {}, average loss = {}".format(episode, losses[episode]))

    
    # evaluate Q-value
    q_values = np.zeros((10, 10, 4))
    for col in range(10):
        x = col / 10 + 0.05
        for row in range(10):
            y = row / 10 + 0.05
            loc = torch.tensor([[x, y]], dtype=torch.float32)
            q_value = dqn.q_network.forward(loc)
            q_values[col, row] = q_value.detach().numpy()

    visualiser = QValueVisualiser(environment=environment, magnification=500)
    # Draw the image
    visualiser.draw_q_values(q_values, filename="q_values_epsilon_greedy.png")

    # draw greedy policy
    image = environment.draw(environment.init_state)
    loc = environment.init_state
    for _ in range(20):
        q = dqn.q_network.forward(torch.tensor(loc, dtype=torch.float32))
        greedy_direction = np.argmax(q.detach().numpy())
        action = agent._discrete_action_to_continuous(greedy_direction)
        next_loc, _ = environment.step(loc, action)
        loc_tuple = (int(loc[0] * environment.magnification),
                     int((1 - loc[1]) * environment.magnification))
        next_loc_tuple = (int(next_loc[0] * environment.magnification),
                          int((1 - next_loc[1]) * environment.magnification))
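        # The snippet is cut off here. A plausible continuation (an assumption, not the
        # original code) would draw this step onto the image and advance the state,
        # e.g. with OpenCV (colour and thickness chosen arbitrarily):
        cv2.line(image, loc_tuple, next_loc_tuple, (0, 0, 255), 2)
        loc = next_loc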
Example #7
0
    # Since the state space is continuous, build a grid of states to visualise and pass
    # each one through the network to get its Q-values (Example #4 above sketches a
    # batched alternative to the per-cell loop).
    q_values = np.zeros((10, 10, 4))
    for row in range(10):
        for col in range(10):
            state_tensor = torch.tensor([0.1 * col + 0.05, 0.1 * row + 0.05])
            q_values[col,
                     row, :] = dqn.q_network(state_tensor).detach().numpy()

    #q_values = torch.flip ( torch.reshape( (dqn.q_network.output_layer.weight.data).t(), (10,10,4)),  [0,1]).numpy()
    print(q_values.shape)
    # Draw final Q pattern
    visualiser = QValueVisualiser(
        environment=environment,
        magnification=500,
        starting_pos=(0.3500, 0.1500))
    # Draw the image; draw_q_values expects an array of shape (10, 10, 4)
    visualiser.draw_q_values(q_values)

    # Find the greedy policy: starting from the initial state, repeatedly take the
    # action with the highest Q-value and move to the resulting state until the end
    # of the episode. (The loop below is cut off; a hedged continuation is sketched
    # inside it.)
    agent.reset()
    states = []
    transition = agent.step()  # get the starting state
    state = torch.tensor(transition[0])
    states.append(state)
    for i in range(EPISODE_LENGTH - 1):
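        # The loop body is cut off here. A hedged sketch of the greedy rollout described
        # above, under these assumptions: dqn.q_network maps a 2-D state tensor to 4
        # Q-values, agent._discrete_action_to_continuous turns an action index into a
        # continuous action, and environment.step(state, action) returns (next_state, _).
        q = dqn.q_network(state).detach().numpy()
        greedy_action = agent._discrete_action_to_continuous(int(np.argmax(q)))
        next_state, _ = environment.step(state.numpy(), greedy_action)
        state = torch.tensor(next_state, dtype=torch.float32)
        states.append(state)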