Example #1
    if done:  # reset the game and record this episode's reward
        state = env.reset()
        all_rewards.append((frame_idx, episode_reward))
        episode_reward = 0

    # Once enough transitions are stored, compute the TD loss and update the model
    if len(replay_buffer) > replay_initial:
        loss = compute_td_loss(model, target_model, batch_size, gamma,
                               replay_buffer)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append((frame_idx, loss.data.cpu().numpy()))

    # The next two ifs only handle progress printing
    if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
        print('#Frame: %d, preparing replay buffer' % frame_idx)

    if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
        print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
        print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])

    if frame_idx % 50000 == 0:
        target_model.copy_from(model)  #updates target model
        print("saved modelRe")
        torch.save(model.state_dict(), "modelRe.pth")
        savetxt('rewards.csv', all_rewards, delimiter=',')
        savetxt('losses.csv', losses, delimiter=',')
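None of these snippets include compute_td_loss itself. For orientation only, here is a minimal sketch of a standard DQN TD loss with the five-argument signature used in Example #1 (some of the later examples use a four-argument variant without a separate target network). It assumes replay_buffer.sample(batch_size) returns NumPy batches of (state, action, reward, next_state, done) and that the models map a state batch to per-action Q-values; the actual implementations behind these examples may differ.

import torch
import torch.nn.functional as F

def compute_td_loss(model, target_model, batch_size, gamma, replay_buffer):
    # Sample a batch of transitions from the replay buffer (assumed API).
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state = torch.FloatTensor(state)
    next_state = torch.FloatTensor(next_state)
    action = torch.LongTensor(action)
    reward = torch.FloatTensor(reward)
    done = torch.FloatTensor(done)

    # Q(s, a) from the online network for the actions actually taken.
    q_value = model(state).gather(1, action.unsqueeze(1)).squeeze(1)

    # Bootstrapped target from the frozen target network.
    with torch.no_grad():
        next_q_value = target_model(next_state).max(1)[0]
        expected_q_value = reward + gamma * next_q_value * (1 - done)

    return F.mse_loss(q_value, expected_q_value)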
Example #2
    if done:
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0

    if len(replay_buffer) > replay_initial:
        loss = compute_td_loss(model, batch_size, gamma, replay_buffer)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.data.cpu().numpy())

    if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
        print('#Frame: {}, preparing replay buffer'.format(frame_idx))

    if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
        print('#Frame: {}, Loss: {}'.format(frame_idx, np.mean(losses)))
        print('Last-10 average reward: {}'.format(np.mean(all_rewards[-10:])))

    if frame_idx % 500000 == 0:
        checkpoint = {
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'frame_idx': frame_idx,
            'losses': losses,
            'all_rewards': all_rewards,
            # 'replay_buffer' : replay_buffer
        }

        torch.save(checkpoint, "./checkpoints/{}_checkpoint".format(frame_idx))
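Example #2 checkpoints the optimizer state and training statistics along with the weights. Resuming from such a checkpoint is roughly the reverse; the path below is illustrative, and model and optimizer are assumed to be constructed exactly as in the example.

import torch

checkpoint = torch.load("./checkpoints/500000_checkpoint")  # hypothetical saved file
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
frame_idx = checkpoint['frame_idx']
losses = checkpoint['losses']
all_rewards = checkpoint['all_rewards']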
Example #3
        episode_reward = 0  # reset

    # Once the replay buffer has filled up enough
    if len(replay_buffer) > replay_initial:
        loss = compute_td_loss(
            model, target_model, batch_size, gamma,
            replay_buffer)  # calculate the loss for the sampled batch
        optimizer.zero_grad()  # reset gradient values
        loss.backward()  # backpropagate the loss
        optimizer.step()  # update the weight values
        losses.append(
            (frame_idx, loss.data.cpu().numpy()))  # store the loss

    if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
        print('#Frame: %d, preparing replay buffer' % frame_idx)
        torch.save(model.state_dict(), "run11_start.pth")

    if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
        print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
        print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])
        filename = "run11_model" + str(frame_idx) + ".pth"
        torch.save(model.state_dict(), filename)
        with open("run11_losses.txt", "w") as loss_file:
            loss_file.write(str(losses))
        with open("run11_rewards.txt", "w") as reward_file:
            reward_file.write(str(all_rewards))

    if frame_idx % 50000 == 0:
        target_model.copy_from(model)  # sync the target network with the online model
Example #4
        episode_reward = 0

    if len(replay_buffer) > replay_initial:
        loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append((frame_idx, loss.data.cpu().numpy()))

    if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
        print('#Frame: %d, preparing replay buffer' % frame_idx)

    if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
        print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
        print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])

        if (np.mean(all_rewards[-10:], 0)[1] > currMax):
            print("Saving model...")
            torch.save(model.state_dict(), filename)
            currMax = np.mean(all_rewards[-10:], 0)[1]
        else:
            print("not saving")


    if frame_idx % 20000 == 0:
        print("Copying from model...")
        target_model.copy_from(model)
    


Example #5
    if len(replay_buffer) > replay_initial:
        loss = compute_td_loss(model, target_model, batch_size, gamma,
                               replay_buffer)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append((frame_idx, loss.data.cpu().numpy()))

    if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
        print('#Frame: %d, preparing replay buffer' % frame_idx)

    if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
        print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
        print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])
        times.append(frame_idx)
        value_losses.append(np.mean(losses, 0)[1])
        value_all_rewards.append(np.mean(all_rewards[-10:], 0)[1])

    if frame_idx % 50000 == 0:
        target_model.copy_from(model)

# save losses and rewards to txt
open('data_losses.txt', 'w').write(str(losses))
open('data_all_rewards.txt', 'w').write(str(all_rewards))

# draw two plots
draw_plot()

# save the trained model
torch.save(model.state_dict(), "trained_model_revised.pth")
Example #6
    state = next_state                      # Change to next state
    episode_reward += reward                # Keep adding rewards until goal state

    if done:                                # Goal state
        state = env.reset()                 # Restart game
        all_rewards.append((frame_idx, episode_reward))  # Store episode_reward with the frame it ended on
        episode_reward = 0

    if len(replay_buffer) > replay_initial:     # If enough frames in replay_buffer (10000)
        loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
        optimizer.zero_grad()                   # Resets gradient after every mini-batch
        loss.backward()
        optimizer.step()
        losses.append((frame_idx, loss.data.cpu().numpy()))

    if frame_idx % 10000 == 0:
        if len(replay_buffer) <= replay_initial:  # If frames still needed in replay_buffer
            print('#Frame: %d, preparing replay buffer' % frame_idx)

        else:                   # If enough frames in replay_buffer
            print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
            print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])
            plot_model(losses, all_rewards)

            if best_mean_reward < np.mean(all_rewards[-10:], 0)[1]:
                best_mean_reward = np.mean(all_rewards[-10:], 0)[1]
                torch.save(model.state_dict(), f"models/model_r={best_mean_reward}_f={frame_idx}.pth")

    if frame_idx % sync_models_at_frame == 0:
        target_model.copy_from(model)       # Copy model's weights onto target after number of frames
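The copy_from call used throughout these examples is a method on the authors' QLearner class and is never shown. It presumably performs a hard update of the target network; a stand-alone equivalent would look like this (sync_target is a hypothetical helper name, not part of the examples' API).

import torch.nn as nn

def sync_target(target_model: nn.Module, online_model: nn.Module) -> None:
    # Hard update: overwrite the target network's parameters with the
    # online network's current parameters.
    target_model.load_state_dict(online_model.state_dict())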
Example #7
batch_size = 32
gamma = 0.99
target_update = 50000
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 1000000
replay_initial = 10000
learning_rate = 1e-5
train_replay_buffer = ReplayBuffer(100000)
analysis_replay_buffer = ReplayBuffer(100000)

policy_model = QLearner(env, train_num_frames, batch_size, gamma,
                        train_replay_buffer)
target_model = QLearner(env, train_num_frames, batch_size, gamma,
                        train_replay_buffer)
target_model.load_state_dict(policy_model.state_dict())
target_model.eval()

optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if USE_CUDA:
    policy_model = policy_model.to(device)
    target_model = target_model.to(device)

epsilon_by_frame = lambda frame_idx: epsilon_final + (
    epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)


def play_to_train(num_frames, policy_model, target_model, buffer):
    losses = []
    all_rewards = []
Example #8
        with open("rewards.txt", "w") as output:
            for row in all_rewards:
                output.write(str(row[0]) + "," + str((row[1])) + "\n")

        with open("losses.txt", "w") as output:
            for row in losses:
                output.write(str(row[0]) + "," + str((row[1])) + "\n")

        episode_reward = 0

    if len(replay_buffer) > replay_initial:
        gamma1 = gamma * (10000 / frame_idx)
        loss = compute_td_loss(model, target_model, batch_size, gamma1,
                               replay_buffer)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append((frame_idx, loss.data.cpu().numpy()))

    if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
        print('#Frame: %d, preparing replay buffer' % frame_idx)
        torch.save(model.state_dict(), "model2.pth")

    if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
        print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
        print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])
        torch.save(model.state_dict(), "model2.pth")

    if frame_idx % 50000 == 0:
        target_model.copy_from(model)
Example #9
    state = next_state
    episode_reward += reward
    
    if done:
        state = env.reset()
        all_rewards.append((frame_idx, episode_reward))
        episode_reward = 0

    if len(replay_buffer) > replay_initial:
        loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append((frame_idx, loss.data.cpu().numpy()))

    if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
        print('#Frame: %d, preparing replay buffer' % frame_idx)

    if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
        print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
        print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])

        torch.save(model.state_dict(),'my_trained.pth') # saving our model every 10,000 frames

    if frame_idx % 50000 == 0:
        target_model.copy_from(model)

np.save('losses.npy',losses)                #saving the losses and rewards after 2M frames
np.save('rewards.npy',all_rewards)
Example #10
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    if done:
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0

    if len(replay_buffer) > replay_initial:
        loss = compute_td_loss(model, batch_size, gamma, replay_buffer)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.data.cpu().numpy())

    if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
        print('#Frame: %d, preparing replay buffer' % frame_idx)
        #print(episode_reward)
    if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
        print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses)))
        print('Last-10 average reward: %f' % np.mean(all_rewards[-10:]))
        #print(episode_reward)
        plot(frame_idx, all_rewards, losses)

torch.save(model.state_dict(), 'newdqnModel.pt')
#torch.save(model, 'entiremodel.pt')

#model.save_model('ndqnModel.tar',model)
Example #11
        loss = compute_td_loss(model, target_model, batch_size, gamma,
                               replay_buffer)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append((frame_idx, loss.data.cpu().numpy()))

    if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
        print('#Frame: %d, preparing replay buffer' % frame_idx)

    if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
        print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
        print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])

    if frame_idx % 50000 == 0:
        target_model.copy_from(model)

    if frame_idx % 1000 == 0:
        #PART 4.1
        print("frame: ", frame_idx, " - saving new model")
        torch.save(model.state_dict(), "new_model.pth")
        #PART 4.2
        np.save('losses-itr.npy', losses)
        np.save('rewards-itr.npy', all_rewards)

#PART 4.2 again, after the loop finishes.
np.save('losses.npy', losses)
np.save('rewards.npy', all_rewards)

# data = np.load('losses.npy')
# print(data)
Example #12
    # print(np.shape(next_state))
    #print(reward)
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    if done:
        state = env.reset()
        all_rewards.append(episode_reward)
        #print(episode_reward)
        episode_reward = 0

    if len(replay_buffer) > replay_initial:
        loss = compute_td_loss(model, batch_size, gamma, replay_buffer)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.data.cpu().numpy())

    if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
        print('#Frame: %d, preparing replay buffer' % frame_idx)

    if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
        print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses)))
        print('Last-10 average reward: %f' % np.mean(all_rewards[-10:]))

pkl.dump(losses, open("loss.pkl", "wb"))
pkl.dump(all_rewards, open("reward.pkl", "wb"))
torch.save(model.state_dict(), 'model_trained.pt')
Example #13
    state = next_state
    episode_reward += reward

    if done:
        state = env.reset()
        all_rewards.append((frame_idx, episode_reward))
        episode_reward = 0
        savetxt('rewards.csv', all_rewards, delimiter=',')
        savetxt('losses.csv', losses, delimiter=',')

    if len(replay_buffer) > replay_initial:
        # Compute the TD loss for the sampled batch
        loss = compute_td_loss(model, target_model, batch_size, gamma,
                               replay_buffer)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append((frame_idx, loss.data.cpu().numpy()))

    if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
        print('#Frame: %d, preparing replay buffer' % frame_idx)

    if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
        print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
        print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])

    if frame_idx % 50000 == 0:
        target_model.copy_from(model)  #update!
        torch.save(model.state_dict(), 'modelsave.pth')
Example #14
    episode_reward += reward

    if done:
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0

    if len(replay_buffer) > replay_initial:
        loss = compute_td_loss(model, batch_size, gamma, replay_buffer)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.data.cpu().numpy())

    if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
        print('#Frame: %d, preparing replay buffer' % frame_idx)

    if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
        print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses)))
        print('Last-10 average reward: %f' % np.mean(all_rewards[-10:]))

        loss_list.append(np.mean(losses))
        reward_list.append(np.mean(all_rewards[-10:]))

sio.savemat('Results.mat', {
    'reward_list': reward_list,
    'loss_list': loss_list
})

torch.save(model.state_dict(), 'trained_model.pth')
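Example #14 stores its curves in a MATLAB .mat file via scipy.io (imported as sio). Reading them back for later analysis would look roughly like this; note that savemat turns Python lists into 2-D arrays, hence the squeeze.

import scipy.io as sio

results = sio.loadmat('Results.mat')
reward_list = results['reward_list'].squeeze()
loss_list = results['loss_list'].squeeze()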
Example #15
env_id = "PongNoFrameskip-v4"
env = make_atari(env_id)
env = wrap_deepmind(env)
env = wrap_pytorch(env)

num_frames = 1000000
batch_size = 32
gamma = 0.99

replay_initial = 10000
replay_buffer = ReplayBuffer(100000)
t_replay_buffer = ReplayBuffer(100000)
model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
target_model = QLearner(env, num_frames, batch_size, gamma, t_replay_buffer)
target_model.load_state_dict(model.state_dict())

optimizer = optim.Adam(model.parameters(), lr=0.00001)
if USE_CUDA:
    model = model.cuda()
    target_model = target_model.cuda()

epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 30000
epsilon_by_frame = lambda frame_idx: epsilon_final + (
    epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

losses = []
all_rewards = []
episode_reward = 0
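The epsilon_by_frame schedule in Example #15 decays exploration exponentially from epsilon_start toward epsilon_final with time constant epsilon_decay. A quick check of the values it produces with these settings (epsilon_decay = 30000):

import math

epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.01, 30000
epsilon_by_frame = lambda frame_idx: epsilon_final + (
    epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

for f in (0, 30000, 90000, 300000):
    print(f, round(epsilon_by_frame(f), 3))  # 1.0, 0.374, 0.059, 0.01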
Example #16
        print('Frame: %d' % frame_idx)
        print('Loss: %f' % np.mean(losses, 0)[1])
        print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])
        print('\n\n')
        reported_avg_reward = np.mean(all_rewards[-10:], 0)[1]
        with open('progress_lr0001_epsilon50k.txt', 'a') as progress:
            progress.write('Frame: %d\n' % frame_idx)
            progress.write('Loss: %f\n' % np.mean(losses, 0)[1])
            progress.write('Last-10 average reward: %f\n' %
                           np.mean(all_rewards[-10:], 0)[1])
            progress.write('Epsilon: %f\n' % epsilon)
            now = datetime.now()
            current_time = now.strftime("%H:%M:%S")
            progress.write('Time: ' + current_time)
            progress.write("\n\n")

    # EVERY 50K FRAMES, UPDATE THE TARGET NETWORK WITH THE CURRENT EXPERIMENTAL ONE
    if frame_idx % 50000 == 0:
        target_model.copy_from(model)
        # target_model.eval()
        file_out = file_name
        file_out = re.sub(r'\.pth', '', file_out)
        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        file_out = file_out + "_" + current_time + "_" + "epsilon_" + str(
            epsilon) + "_frame_" + str(frame_idx) + ".pth"
        torch.save(model.state_dict(), file_out)

# ESSENTIALLY, BACKPROPAGATE EVERY 10K FRAMES AND UPDATE THE COMPARISON MODEL AFTER 5 UPDATES