Example #1
        for i in range(3):
            next_frame, _, _, _ = env.step(0)  # Take an arbitrary action
            frame_list.append(transform(next_frame))

        current_state = torch.cat(frame_list, dim=0).to(
            device)  # Stack the frames along the channel dimension; after unsqueeze the input shape is (N, C, H, W)

        # Obtain action, log probability, and value estimate for the initial state
        # Move the outputs to cpu to save memory
        action, log_prob, ex_val = actor_critic(current_state.unsqueeze(dim=0))
        action = action.squeeze().cpu()
        log_prob = log_prob.squeeze().cpu()
        ex_val = ex_val.squeeze().cpu()

        # Store the first state and value estimate in memory
        memory.set_initial_state(current_state.clone().detach().cpu(),
                                 initial_ex_val_est=ex_val)

        for t in count():

            # Interact with the environment
            next_frame, reward, done, _ = env.step(action.item())
            running_reward += reward

            # Drop the oldest frame, append the new frame, and stack to form the next state
            frame_list.pop(0)
            frame_list.append(transform(next_frame))
            next_state = torch.cat(frame_list,
                                   dim=0).to(device)  # Stack the images

            # Obtain the action, log probability, and value estimate for the next state in a single forward pass
            # Move the outputs to cpu to save memory
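
The `transform` applied to each raw frame in Example #1 is not defined in these snippets. Below is a minimal sketch, assuming a typical Atari-style preprocessing pipeline built with torchvision; the grayscale conversion and the 84x84 output size are illustrative choices, not taken from the source.

import torchvision.transforms as T

# Hypothetical preprocessing for raw frames returned by env.step(), assumed to
# be HxWxC uint8 arrays. Each call yields a (1, 84, 84) float tensor in [0, 1],
# so concatenating several of them along dim=0 produces the stacked state
# that actor_critic receives after unsqueeze.
transform = T.Compose([
    T.ToPILImage(),
    T.Grayscale(num_output_channels=1),
    T.Resize((84, 84)),
    T.ToTensor(),
])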
Example #2
        # TODO: Change the code below

        # Estimate the extrinsic and intrinsic values of the initial state
        ex_val = value_net_ex(
            torch.tensor([current_state], dtype=torch.float32,
                         device=device)).squeeze()  # squeeze out the batch dimension
        in_val = value_net_in(
            torch.tensor(
                [np.concatenate((current_state, [i_episode]), axis=0)],
                dtype=torch.float32,
                device=device)).squeeze()  # append i_episode as an extra input feature

        # Store the first state and value estimate in memory
        memory.set_initial_state(current_state,
                                 initial_ex_val_est=ex_val,
                                 initial_in_val_est=in_val)

        # Obtain current state hash code
        current_state_hash = simhash.hash(current_state)

        for t in count():

            # Sample an action given the current state
            action, log_prob = policy_net(
                torch.tensor([current_state],
                             dtype=torch.float32,
                             device=device))
            log_prob = log_prob.squeeze()

            # Interact with the environment
            next_state, reward, done, _ = env.step(action.item())

        load_checkpoint(ckpt_dir, i_epoch, layer_sizes, input_size, device=device)

    # To record episode stats
    episode_durations = []
    episode_rewards = []

    for i_episode in range(batch_size):

        # Keep track of the running reward
        running_reward = 0

        # Initialize the environment and state
        current_state = env.reset()

        # Store the first state and value estimate in memory
        memory.set_initial_state(current_state)

        for t in count():
            # Make sure that the policy net is in training mode
            policy_net.train()

            # Sample an action given the current state
            action, log_prob = policy_net(
                torch.tensor([current_state], dtype=torch.float32, device=device))
            log_prob = log_prob.squeeze()

            # Interact with the environment
            next_state, reward, done, _ = env.step(action.item())
            running_reward += reward

            # Render this episode
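
Neither the `simhash` object used in Example #2 nor its `hash` method appears in these snippets. The sketch below shows one common way to implement it, assuming the standard SimHash discretization from count-based exploration (the sign of a fixed random projection) and a flat NumPy state vector; the class layout, hash length k, and seed are illustrative only.

import numpy as np

class SimHash:
    """Hypothetical stand-in for the `simhash.hash(current_state)` call above."""

    def __init__(self, state_dim, k=32, seed=0):
        rng = np.random.default_rng(seed)
        # Fixed random projection; sign(A @ state) yields a k-bit binary code
        self.A = rng.standard_normal((k, state_dim))

    def hash(self, state):
        bits = np.sign(self.A @ np.asarray(state, dtype=np.float64))
        return tuple(bits.astype(np.int8))  # hashable key for a visit-count table

States that map to the same code share a visit count, which is the usual basis for an intrinsic reward bonus; presumably this is what the intrinsic value head value_net_in in Example #2 is trained to predict.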