for i in range(3):
    next_frame, _, _, _ = env.step(0)  # Take an arbitrary action
    frame_list.append(transform(next_frame))
current_state = torch.cat(frame_list, dim=0).to(device)  # Stack the frames into shape (C, H, W); unsqueeze below adds the batch dimension to give (N, C, H, W)

# Obtain action, log probability, and value estimate for the initial state
# Move the outputs to cpu to save memory
action, log_prob, ex_val = actor_critic(current_state.unsqueeze(dim=0))
action = action.squeeze().cpu()
log_prob = log_prob.squeeze().cpu()
ex_val = ex_val.squeeze().cpu()

# Store the first state and value estimate in memory
memory.set_initial_state(current_state.clone().detach().cpu(),
                         initial_ex_val_est=ex_val)

for t in count():
    # Interact with the environment
    next_frame, reward, done, _ = env.step(action.item())
    running_reward += reward

    # Pop the oldest frame, append the new frame, and stack to form the next state
    frame_list.pop(0)
    frame_list.append(transform(next_frame))
    next_state = torch.cat(frame_list, dim=0).to(device)  # Stack the frames

    # Obtain action, log probability, and value estimate for the next state in a single forward pass
    # Move the outputs to cpu to save memory
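# A minimal sketch of what `transform` above is assumed to do: convert a raw
# RGB Atari frame into a single grayscale channel so that the stacked frames
# form a compact (C, H, W) state. The 84x84 size and the exact steps are
# assumptions in the style of the standard DQN preprocessing, not taken from
# this file.
import torchvision.transforms as T

transform = T.Compose([
    T.ToPILImage(),       # raw frame arrives as an (H, W, C) uint8 array
    T.Grayscale(),        # collapse RGB to one channel
    T.Resize((84, 84)),   # downsample to the standard DQN input size
    T.ToTensor(),         # float tensor of shape (1, 84, 84) with values in [0, 1]
])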
# TODO: Change the code below
# Estimate the value of the initial state
ex_val = value_net_ex(
    torch.tensor([current_state], dtype=torch.float32,
                 device=device)).squeeze()  # squeeze the batch dimension
in_val = value_net_in(
    torch.tensor([np.concatenate((current_state, [i_episode]), axis=0)],
                 dtype=torch.float32,
                 device=device)).squeeze()  # provide i_episode as an additional input

# Store the first state and both value estimates in memory
memory.set_initial_state(current_state,
                         initial_ex_val_est=ex_val,
                         initial_in_val_est=in_val)

# Obtain the hash code of the current state
current_state_hash = simhash.hash(current_state)

for t in count():
    # Sample an action given the current state
    action, log_prob = policy_net(
        torch.tensor([current_state], dtype=torch.float32, device=device))
    log_prob = log_prob.squeeze()

    # Interact with the environment
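# A minimal sketch of the SimHash assumed above (after Tang et al., 2017,
# "#Exploration"): project the flattened state through a fixed random Gaussian
# matrix and keep only the signs, yielding a k-bit code that similar states
# are likely to share. The class name, k, and seed are assumptions. The code
# can index a visit-count table; a common intrinsic reward is then
# beta / sqrt(n(hash)).
import numpy as np

class SimHash:
    def __init__(self, state_dim, k=32, seed=0):
        rng = np.random.default_rng(seed)
        self.A = rng.standard_normal((k, state_dim))  # fixed random projection

    def hash(self, state):
        # Return a hashable k-bit code for the (flattened) state
        bits = (self.A @ np.asarray(state, dtype=np.float64).ravel()) > 0
        return tuple(bits.astype(int).tolist())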
load_checkpoint(ckpt_dir, i_epoch, layer_sizes, input_size, device=device)

# To record episode stats
episode_durations = []
episode_rewards = []

for i_episode in range(batch_size):
    # Keep track of the running reward
    running_reward = 0

    # Initialize the environment and state
    current_state = env.reset()

    # Store the first state in memory
    memory.set_initial_state(current_state)

    for t in count():
        # Make sure that the policy net is in training mode
        policy_net.train()

        # Sample an action given the current state
        action, log_prob = policy_net(
            torch.tensor([current_state], device=device))
        log_prob = log_prob.squeeze()

        # Interact with the environment
        next_state, reward, done, _ = env.step(action.item())
        running_reward += reward

        # Render this episode
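# A minimal sketch of the policy_net interface assumed above: the forward pass
# produces action logits, samples from the resulting categorical distribution,
# and returns both the sampled action and its log-probability so the policy
# gradient update can weight it by the return. The class name, layer sizes,
# and architecture are placeholders, not taken from this file.
import torch
import torch.nn as nn
from torch.distributions import Categorical

class PolicyNet(nn.Module):
    def __init__(self, input_size, n_actions, hidden_size=128):
        super().__init__()
        self.body = nn.Sequential(
            nn.Linear(input_size, hidden_size), nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        )

    def forward(self, state):
        # state: (N, input_size) -> sampled action and its log-probability, each (N,)
        dist = Categorical(logits=self.body(state))
        action = dist.sample()
        return action, dist.log_prob(action)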