epsilon_decay = 30000  # used in epsilon_by_frame below
epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

losses = []
all_rewards = []
episode_reward = 0

state = env.reset()  # initial state
for frame_idx in range(1, num_frames + 1):  # plays until player or model gets score 21
    #print("Frame: " + str(frame_idx))  # uncomment to look at frames
    epsilon = epsilon_by_frame(frame_idx)
    action = model.act(state, epsilon)  # will write this function

    next_state, reward, done, _ = env.step(action)  # get next state
    replay_buffer.push(state, action, reward, next_state, done)  # push action results to buffer

    state = next_state
    episode_reward += reward

    if done:  # reset game and record the episode reward
        state = env.reset()
        all_rewards.append((frame_idx, episode_reward))
        episode_reward = 0

    if len(replay_buffer) > replay_initial:  # start training once the buffer holds enough transitions
        loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
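# The loop above calls model.act(state, epsilon), which is written later. As a
# reference, a minimal sketch of the usual epsilon-greedy act() for a DQN model is
# shown below. This is an assumption about its typical shape, not the repo's exact
# code: `self` is taken to be the DQN nn.Module, storing self.num_actions, with
# forward() returning one Q-value per action.
import random
import numpy as np
import torch

def act(self, state, epsilon):
    if random.random() > epsilon:
        # Exploit: pick the action with the highest predicted Q-value
        state_t = torch.FloatTensor(np.float32(state)).unsqueeze(0)  # add a batch dimension
        q_values = self.forward(state_t)
        action = int(q_values.max(1)[1].item())
    else:
        # Explore: pick a random action with probability epsilon
        action = random.randrange(self.num_actions)
    return action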
losses = []
all_rewards = []
episode_reward = 0

state = env.reset()
for frame_idx in range(1, num_frames + 1):
    # QUESTION: Why is num_frames > replay_buffer capacity? replay_buffer should
    # overfill because num_frames is larger, so it will keep adding. Does it
    # automatically expand when you push? I think it does expand, using the numpy
    # expand_dims function. (See the sketch after this loop.)
    #print("Frame: " + str(frame_idx))
    epsilon = epsilon_by_frame(frame_idx)  # get the epsilon value
    action = model.act(state, epsilon)  # this is where the act function is used

    next_state, reward, done, _ = env.step(action)  # look at the next state to see if it gives us a reward
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    if done:  # the game is over
        state = env.reset()
        all_rewards.append((frame_idx, episode_reward))  # record the reward for that game
        episode_reward = 0  # reset
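# On the QUESTION in the loop above: in the usual DQN replay buffer (an assumption
# here, since the repo's own class is defined elsewhere), the buffer does NOT keep
# expanding. It is a fixed-capacity deque, so once it is full the oldest transitions
# are evicted as new ones are pushed; np.expand_dims only adds a batch dimension to
# each stored state so that sample() can concatenate them. A minimal sketch:
from collections import deque
import random
import numpy as np

class ReplayBuffer(object):
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # old entries are dropped automatically when full

    def push(self, state, action, reward, next_state, done):
        state = np.expand_dims(state, 0)        # shape (1, ...) so states can be concatenated later
        next_state = np.expand_dims(next_state, 0)
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(next_state), done

    def __len__(self):
        return len(self.buffer)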
frame_list = random.sample(range(2000, num_frames), 8000)
frame_list.sort()

hiddenLayers = []
state_list = []
action_list = []
reward_frame_list = []
accumulated_reward = []  #####
frame_order = []

epsilon = -1
for frame_idx in range(1, num_frames + 1):
    action = model.act(state, epsilon)
    next_state, reward, done, _ = env.step(action)

    if (frame_idx in frame_list) or (frame_idx < 2000):
        hiddenTensor = model.get_hidden_layer(state)
        temp = hiddenTensor.data.cpu().numpy()
        hiddenLayers.append(temp[0])
        #hiddenLayers.append(hiddenTensor.data.cpu().numpy())
        #hiddenLayers = np.concatenate((hiddenLayers, hiddenTensor.data.cpu().numpy()), axis=0)
        state_list.append(state.squeeze(0))
        action_list.append(action)
        reward_frame_list.append(reward)
        accumulated_reward.append(episode_reward)  #####
        frame_order.append(frame_idx)
        #env.env.ale.saveScreenPNG('test_image.png')
if USE_CUDA:
    model = model.cuda()
    print("Using cuda")

model.load_state_dict(torch.load(pthname, map_location='cpu'))

env.seed(1)
state = env.reset()
done = False
games_won = 0

while not done:
    if use_gui:
        env.render()
    action = model.act(state, 0)  # epsilon = 0: always take the greedy action
    state, reward, done, _ = env.step(action)
    if reward != 0:
        print(reward)
        if reward == 1:
            games_won += 1

print("Games Won: {}".format(games_won))

try:
    sys.exit(0)
except:
    pass
model = model.cuda()
target_model = target_model.cuda()
print("Using cuda")

# Negative exponential decay: start by exploring, then exploit more as frame_idx grows
epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

losses = []
all_rewards = []
episode_reward = 0

state = env.reset()  # Initial state
best_mean_reward = float('-inf')

for frame_idx in range(starting_frame, num_frames + 1):  # Each frame of the run
    epsilon = epsilon_by_frame(frame_idx)  # Epsilon decreases as more frames are played
    action = model.act(state, epsilon)  # If rand < epsilon, explore; else take the action with max Q-value. action: int

    next_state, reward, done, _ = env.step(action)  # Env info after the action. next_state: image array, reward: float, done: bool
    replay_buffer.push(state, action, reward, next_state, done)  # Save the transition to the buffer (every frame)

    state = next_state  # Move to the next state
    episode_reward += reward  # Accumulate reward until the episode ends

    if done:  # Episode finished
        state = env.reset()  # Restart the game
        all_rewards.append((frame_idx, episode_reward))  # Store episode_reward with the frame it ended on
        episode_reward = 0

    if len(replay_buffer) > replay_initial:  # Once there are enough transitions in replay_buffer (10000)
        loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
        optimizer.zero_grad()  # Reset gradients before backpropagating this mini-batch
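# compute_td_loss is defined elsewhere in the repo. For reference, a minimal sketch
# of what a TD loss with a separate target network typically computes is below; the
# actual function may differ (e.g. Double DQN action selection, a Huber loss, or
# moving the tensors to the GPU when USE_CUDA is set):
import numpy as np
import torch

def compute_td_loss(model, target_model, batch_size, gamma, replay_buffer):
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state = torch.FloatTensor(np.float32(state))
    next_state = torch.FloatTensor(np.float32(next_state))
    action = torch.LongTensor(action)
    reward = torch.FloatTensor(reward)
    done = torch.FloatTensor(done)

    # Q(s, a) for the actions that were actually taken
    q_value = model(state).gather(1, action.unsqueeze(1)).squeeze(1)

    # Bootstrapped target from the frozen target network; (1 - done) zeroes out
    # the bootstrap term on terminal transitions
    next_q_value = target_model(next_state).max(1)[0]
    expected_q_value = reward + gamma * next_q_value * (1 - done)

    # Mean squared TD error; detach so no gradient flows into the target
    loss = (q_value - expected_q_value.detach()).pow(2).mean()
    return loss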
epsilon_final = 0.01  # decay toward this value from epsilon_start
epsilon_decay = 30000
epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

losses = []
all_rewards = []
episode_reward = 0

state = env.reset()
for frame_idx in range(1, num_frames + 1):  # play until one side reaches 21 points
    #print("Frame: " + str(frame_idx))
    epsilon = epsilon_by_frame(frame_idx)
    action = model.act(state, epsilon)  # I write this function

    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    if done:  # game over, record the episode reward
        state = env.reset()
        all_rewards.append((frame_idx, episode_reward))
        episode_reward = 0

    if len(replay_buffer) > replay_initial:  # only train once the buffer holds enough transitions
        loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
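# Quick sanity check of the decay schedule above (assuming epsilon_start = 1.0,
# its usual starting value; it is set elsewhere in the script):
#   epsilon_by_frame(1)      ~= 1.00
#   epsilon_by_frame(30000)  ~= 0.01 + 0.99 * exp(-1) ~= 0.374
#   epsilon_by_frame(90000)  ~= 0.01 + 0.99 * exp(-3) ~= 0.059
# i.e. exploration is mostly annealed after roughly 3 * epsilon_decay frames.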