if done:  # reset game and state
    state = env.reset()
    all_rewards.append((frame_idx, episode_reward))
    episode_reward = 0

if len(replay_buffer) > replay_initial:  # if number of plays has reached the limit, calculate loss and optimize/update model
    loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append((frame_idx, loss.data.cpu().numpy()))

if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:  # two ifs are just for printing
    print('#Frame: %d, preparing replay buffer' % frame_idx)

if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
    print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
    print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])

if frame_idx % 50000 == 0:
    target_model.copy_from(model)  # updates target model
    print("saved modelRe")
    torch.save(model.state_dict(), "modelRe.pth")

savetxt('rewards.csv', all_rewards, delimiter=',')
savetxt('losses.csv', losses, delimiter=',')
if done:
    state = env.reset()
    all_rewards.append(episode_reward)
    episode_reward = 0

if len(replay_buffer) > replay_initial:
    loss = compute_td_loss(model, batch_size, gamma, replay_buffer)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append(loss.data.cpu().numpy())

if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
    print('#Frame: {}, preparing replay buffer'.format(frame_idx))

if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
    print('#Frame: {}, Loss: {}'.format(frame_idx, np.mean(losses)))
    print('Last-10 average reward: {}'.format(np.mean(all_rewards[-10:])))

if frame_idx % 500000 == 0:
    checkpoint = {
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'frame_idx': frame_idx,
        'losses': losses,
        'all_rewards': all_rewards,
        # 'replay_buffer': replay_buffer
    }
    torch.save(checkpoint, "./checkpoints/{}_checkpoint".format(frame_idx))
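A checkpoint saved in this format can be restored later to resume training. A minimal sketch, assuming model and optimizer have already been rebuilt with the same constructors used elsewhere in these excerpts; checkpoint_path is a hypothetical path to one of the saved files:

checkpoint = torch.load(checkpoint_path)              # e.g. "./checkpoints/500000_checkpoint" (hypothetical)
model.load_state_dict(checkpoint['state_dict'])       # restore network weights
optimizer.load_state_dict(checkpoint['optimizer'])    # restore Adam state
frame_start = checkpoint['frame_idx']                 # resume the frame counter from here
losses = checkpoint['losses']                         # carry over the logged loss history
all_rewards = checkpoint['all_rewards']               # carry over the logged episode rewards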
episode_reward = 0  # reset

# Once the replay buffer has filled up enough
if len(replay_buffer) > replay_initial:
    loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)  # calculate the loss for the state
    optimizer.zero_grad()  # reset gradient values
    loss.backward()  # backpropagate loss
    optimizer.step()  # update weight values
    losses.append((frame_idx, loss.data.cpu().numpy()))  # hold loss in array

if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
    print('#Frame: %d, preparing replay buffer' % frame_idx)
    torch.save(model.state_dict(), "run11_start.pth")

if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
    print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
    print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])
    filename = "run11_model" + str(frame_idx) + ".pth"
    torch.save(model.state_dict(), filename)
    lossScript = open("run11_losses.txt", "w")
    rewardScript = open("run11_rewards.txt", "w")
    lossScript.write(str(losses))
    rewardScript.write(str(all_rewards))
    lossScript.close()
    rewardScript.close()

if frame_idx % 50000 == 0:
    target_model.copy_from(model)
episode_reward = 0

if len(replay_buffer) > replay_initial:
    loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append((frame_idx, loss.data.cpu().numpy()))

if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
    print('#Frame: %d, preparing replay buffer' % frame_idx)

if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
    print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
    print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])
    if np.mean(all_rewards[-10:], 0)[1] > currMax:
        print("Saving model...")
        torch.save(model.state_dict(), filename)
        currMax = np.mean(all_rewards[-10:], 0)[1]
    else:
        print("not saving")

if frame_idx % 20000 == 0:
    print("Copying from model...")
    target_model.copy_from(model)
if len(replay_buffer) > replay_initial:
    loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append((frame_idx, loss.data.cpu().numpy()))

if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
    print('#Frame: %d, preparing replay buffer' % frame_idx)

if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
    print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
    print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])
    times.append(frame_idx)
    value_losses.append(np.mean(losses, 0)[1])
    value_all_rewards.append(np.mean(all_rewards[-10:], 0)[1])

if frame_idx % 50000 == 0:
    target_model.copy_from(model)

# save losses and rewards to txt
open('data_losses.txt', 'w').write(str(losses))
open('data_all_rewards.txt', 'w').write(str(all_rewards))

# draw two plots
draw_plot()

# save the trained model
torch.save(model.state_dict(), "trained_model_revised.pth")
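draw_plot() is called above but not defined in these excerpts. A minimal sketch of what it could do with the times, value_losses, and value_all_rewards lists collected in the loop; the output filename training_curves.png is hypothetical:

import matplotlib.pyplot as plt

def draw_plot():
    # Plot the recorded mean loss and last-10 average reward against the
    # frame index at which each value was logged.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    ax1.plot(times, value_losses)
    ax1.set_xlabel('frame')
    ax1.set_ylabel('mean loss')
    ax2.plot(times, value_all_rewards)
    ax2.set_xlabel('frame')
    ax2.set_ylabel('last-10 average reward')
    fig.tight_layout()
    fig.savefig('training_curves.png')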
state = next_state  # Change to next state
episode_reward += reward  # Keep adding rewards until goal state

if done:  # Goal state
    state = env.reset()  # Restart game
    all_rewards.append((frame_idx, episode_reward))  # Store episode_reward with the frame it ended on
    episode_reward = 0

if len(replay_buffer) > replay_initial:  # If enough frames in replay_buffer (10000)
    loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
    optimizer.zero_grad()  # Resets gradient after every mini-batch
    loss.backward()
    optimizer.step()
    losses.append((frame_idx, loss.data.cpu().numpy()))

if frame_idx % 10000 == 0:
    if len(replay_buffer) <= replay_initial:  # If frames still needed in replay_buffer
        print('#Frame: %d, preparing replay buffer' % frame_idx)
    else:  # If enough frames in replay_buffer
        print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
        print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])
        plot_model(losses, all_rewards)
        if best_mean_reward < np.mean(all_rewards[-10:], 0)[1]:
            best_mean_reward = np.mean(all_rewards[-10:], 0)[1]
            torch.save(model.state_dict(), f"models/model_r={best_mean_reward}_f={frame_idx}.pth")

if frame_idx % sync_models_at_frame == 0:
    target_model.copy_from(model)  # Copy model's weights onto target after a set number of frames
batch_size = 32
gamma = 0.99
target_update = 50000
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 1000000
replay_initial = 10000
learning_rate = 1e-5

train_replay_buffer = ReplayBuffer(100000)
analysis_replay_buffer = ReplayBuffer(100000)

policy_model = QLearner(env, train_num_frames, batch_size, gamma, train_replay_buffer)
target_model = QLearner(env, train_num_frames, batch_size, gamma, train_replay_buffer)
target_model.load_state_dict(policy_model.state_dict())
target_model.eval()

optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if USE_CUDA:
    policy_model = policy_model.to(device)
    target_model = target_model.to(device)

epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)


def play_to_train(num_frames, policy_model, target_model, buffer):
    losses = []
    all_rewards = []
with open("rewards.txt", "w") as output: for row in all_rewards: output.write(str(row[0]) + "," + str((row[1])) + "\n") with open("losses.txt", "w") as output: for row in losses: output.write(str(row[0]) + "," + str((row[1])) + "\n") episode_reward = 0 if len(replay_buffer) > replay_initial: gamma1 = gamma * (10000 / frame_idx) loss = compute_td_loss(model, target_model, batch_size, gamma1, replay_buffer) optimizer.zero_grad() loss.backward() optimizer.step() losses.append((frame_idx, loss.data.cpu().numpy())) if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial: print('#Frame: %d, preparing replay buffer' % frame_idx) torch.save(model.state_dict(), "model2.pth") if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial: print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1])) print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1]) torch.save(model.state_dict(), "model2.pth") if frame_idx % 50000 == 0: target_model.copy_from(model)
state = next_state
episode_reward += reward

if done:
    state = env.reset()
    all_rewards.append((frame_idx, episode_reward))
    episode_reward = 0

if len(replay_buffer) > replay_initial:
    loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append((frame_idx, loss.data.cpu().numpy()))

if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
    print('#Frame: %d, preparing replay buffer' % frame_idx)

if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
    print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
    print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])
    torch.save(model.state_dict(), 'my_trained.pth')  # saving our model every 10,000 frames

if frame_idx % 50000 == 0:
    target_model.copy_from(model)

np.save('losses.npy', losses)  # saving the losses and rewards after 2M frames
np.save('rewards.npy', all_rewards)
replay_buffer.push(state, action, reward, next_state, done)
state = next_state
episode_reward += reward

if done:
    state = env.reset()
    all_rewards.append(episode_reward)
    episode_reward = 0

if len(replay_buffer) > replay_initial:
    loss = compute_td_loss(model, batch_size, gamma, replay_buffer)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append(loss.data.cpu().numpy())

if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
    print('#Frame: %d, preparing replay buffer' % frame_idx)
    # print(episode_reward)

if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
    print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses)))
    print('Last-10 average reward: %f' % np.mean(all_rewards[-10:]))
    # print(episode_reward)
    plot(frame_idx, all_rewards, losses)
    torch.save(model.state_dict(), 'newdqnModel.pt')
    # torch.save(model, 'entiremodel.pt')
    # model.save_model('ndqnModel.tar', model)
loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
optimizer.zero_grad()
loss.backward()
optimizer.step()
losses.append((frame_idx, loss.data.cpu().numpy()))

if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
    print('#Frame: %d, preparing replay buffer' % frame_idx)

if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
    print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
    print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])

if frame_idx % 50000 == 0:
    target_model.copy_from(model)

if frame_idx % 1000 == 0:
    # PART 4.1
    print("frame: ", frame_idx, " - saving new model")
    torch.save(model.state_dict(), "new_model.pth")
    # PART 4.2
    np.save('losses-itr.npy', losses)
    np.save('rewards-itr.npy', all_rewards)

# PART 4.2 again if loop finishes.
np.save('losses.npy', losses)
np.save('rewards.npy', all_rewards)
# data = np.load('losses.npy')
# print(data)
# print(np.shape(next_state))
# print(reward)
replay_buffer.push(state, action, reward, next_state, done)
state = next_state
episode_reward += reward

if done:
    state = env.reset()
    all_rewards.append(episode_reward)
    # print(episode_reward)
    episode_reward = 0

if len(replay_buffer) > replay_initial:
    loss = compute_td_loss(model, batch_size, gamma, replay_buffer)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append(loss.data.cpu().numpy())

if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
    print('#Frame: %d, preparing replay buffer' % frame_idx)

if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
    print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses)))
    print('Last-10 average reward: %f' % np.mean(all_rewards[-10:]))
    pkl.dump(losses, open("loss.pkl", "wb"))
    pkl.dump(all_rewards, open("reward.pkl", "wb"))
    torch.save(model.state_dict(), 'model_trained.pt')
state = next_state
episode_reward += reward

if done:
    state = env.reset()
    all_rewards.append((frame_idx, episode_reward))
    episode_reward = 0
    savetxt('rewards.csv', all_rewards, delimiter=',')
    savetxt('losses.csv', losses, delimiter=',')

if len(replay_buffer) > replay_initial:
    # We write the loss function
    loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append((frame_idx, loss.data.cpu().numpy()))

if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
    print('#Frame: %d, preparing replay buffer' % frame_idx)

if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
    print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses, 0)[1]))
    print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])

if frame_idx % 50000 == 0:
    target_model.copy_from(model)  # update!
    torch.save(model.state_dict(), 'modelsave.pth')
episode_reward += reward

if done:
    state = env.reset()
    all_rewards.append(episode_reward)
    episode_reward = 0

if len(replay_buffer) > replay_initial:
    loss = compute_td_loss(model, batch_size, gamma, replay_buffer)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append(loss.data.cpu().numpy())

if frame_idx % 10000 == 0 and len(replay_buffer) <= replay_initial:
    print('#Frame: %d, preparing replay buffer' % frame_idx)

if frame_idx % 10000 == 0 and len(replay_buffer) > replay_initial:
    print('#Frame: %d, Loss: %f' % (frame_idx, np.mean(losses)))
    print('Last-10 average reward: %f' % np.mean(all_rewards[-10:]))
    loss_list.append(np.mean(losses))
    reward_list.append(np.mean(all_rewards[-10:]))
    sio.savemat('Results.mat', {'reward_list': reward_list, 'loss_list': loss_list})
    torch.save(model.state_dict(), 'trained_model.pth')
env_id = "PongNoFrameskip-v4" env = make_atari(env_id) env = wrap_deepmind(env) env = wrap_pytorch(env) num_frames = 1000000 batch_size = 32 gamma = 0.99 replay_initial = 10000 replay_buffer = ReplayBuffer(100000) t_replay_buffer = ReplayBuffer(100000) model = QLearner(env, num_frames, batch_size, gamma, replay_buffer) target_model = QLearner(env, num_frames, batch_size, gamma, t_replay_buffer) target_model.load_state_dict(model.state_dict()) optimizer = optim.Adam(model.parameters(), lr=0.00001) if USE_CUDA: model = model.cuda() target_model = target_model.cuda() epsilon_start = 1.0 epsilon_final = 0.01 epsilon_decay = 30000 epsilon_by_frame = lambda frame_idx: epsilon_final + ( epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay) losses = [] all_rewards = [] episode_reward = 0
print('Frame: %d' % frame_idx)
print('Loss: %f' % np.mean(losses, 0)[1])
print('Last-10 average reward: %f' % np.mean(all_rewards[-10:], 0)[1])
print('\n\n')

reported_avg_reward = np.mean(all_rewards[-10:], 0)[1]

progress = open('progress_lr0001_epsilon50k.txt', 'a')
progress.write('Frame: %d\n' % frame_idx)
progress.write('Loss: %f\n' % np.mean(losses, 0)[1])
progress.write('Last-10 average reward: %f\n' % np.mean(all_rewards[-10:], 0)[1])
progress.write('Epsilon: %f\n' % epsilon)
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
progress.write('Time: ' + current_time)
progress.write("\n\n")
progress.close()

# EVERY 50K FRAMES, UPDATE TARGET NETWORK WITH THE CURRENT EXPERIMENTAL ONE
if frame_idx % 50000 == 0:
    target_model.copy_from(model)
    # target_model.eval()
    file_out = file_name
    file_out = re.sub(r'\.pth', '', file_out)
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    file_out = file_out + "_" + current_time + "_" + "epsilon_" + str(epsilon) + "_frame_" + str(frame_idx) + ".pth"
    torch.save(model.state_dict(), file_out)

# ESSENTIALLY, BACKPROPAGATE EVERY 10K FRAMES AND UPDATE THE COMPARISON MODEL AFTER 5 UPDATES
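target_model.copy_from(model) is used throughout these loops, but copy_from is not a torch.nn.Module method; it is presumably a small helper defined on the QLearner class. A minimal sketch of such a method (only the name and call site come from the excerpts; the body is an assumption):

class QLearner(nn.Module):
    # ... network definition as elsewhere in these excerpts ...

    def copy_from(self, source):
        # Hard update: overwrite this (target) network's weights with the
        # current weights of the online network.
        self.load_state_dict(source.state_dict())

This is the standard hard target-network update for DQN: the target network stays frozen between syncs, so the bootstrapped targets used in compute_td_loss change only every target_update frames rather than after every gradient step.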