Example #1
def generate_demo_video():
    env = StarIntrudersEnvironment(screen_size=512)
    state = env.reset()
    done = False
    print('Playing sample game with environment {}'.format(env))
    filename = 'game-StarIntruders-{}.mp4'.format(int(time.time()))
    vid = imutil.Video(filename, framerate=8)
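    # Each vid(...) call below appends one frame; vid.finish() finalizes the MP4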

    rewards = []
    action = env.action_space.sample()
    for t in range(200):
        if done:
            print('Finished episode with {} total reward after {} timesteps'.format(sum(rewards), len(rewards)))
            rewards = []
            state = env.reset()
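        # With probability 0.2, sample a new random action; otherwise keep repeating the previous one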
        if np.random.random() < .2:
            action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        minimap_ftr, screen_ftr, minimap_rgb, screen_rgb = state
        caption = 't={} Reward: {:.2f}'.format(t, reward)
        vid(screen_rgb, normalize=False, caption=caption, resize_to=(512,512))
        rewards.append(reward)
        print('Timestep t={} took action {} got reward {}'.format(len(rewards), action, reward))
    vid.finish()
    print('Finished episode with {} total reward after {} timesteps'.format(sum(rewards), len(rewards)))
Example #2
 def __init__(self,
              sc2_env,
              game_number,
              env_name,
              action_names,
              tensor_reward_key,
              replay_dimension=256):
     time_string = "{}".format(int(time.time()))
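     # The JSON metadata, MP4 video, and saliency (.expl) outputs share a timestamped basename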
     self.json_pathname = os.path.join(
         REPLAY_DIR_PATH,
         "game_" + time_string + "_" + str(replay_dimension) + ".json")
     self.video_pathname = os.path.join(
         REPLAY_DIR_PATH,
         "game_" + time_string + "_" + str(replay_dimension) + ".mp4")
     self.saliency_pathname = os.path.join(
         REPLAY_DIR_PATH,
         "game_" + time_string + "_" + str(replay_dimension) + ".expl")
     self.sc2_env = sc2_env
     self.game_clock_tick = 0
     self.frames = []
     self.action_names = action_names
     self.video = imutil.Video(filename=self.video_pathname)
     self.decision_point_number = 1
     self.tensor_reward_key = tensor_reward_key
     self.explanation_points_array = []
Example #3
File: main.py  Project: rahamor/scm-gan
def generate_trajectory_video(datasource):
    print("Writing example video of datasource {} to file".format(datasource))
    filename = 'example_trajectory.mp4'
    vid = imutil.Video(filename, framerate=10)
    states, rewards, dones, infos = datasource.get_trajectories(batch_size=1)
    for state in states[0]:
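        # Each state is stored channels-first (C, H, W); transpose to (H, W, C) for the video writer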
        img = state.transpose(1, 2, 0)
        vid.write_frame(img, resize_to=(256, 256))
    vid.finish()
Example #4
 def __init__(self, render=True, video_filename=None, verbose=False, num_players=2):
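     # Writing a video requires rendering to be enabled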
     if video_filename:
         render = True
     self.render = render
     self.num_players = num_players
     self.sc2env = make_sc2env(num_players, render=render)
     self.video = None
     if video_filename:
         self.video = imutil.Video(filename=video_filename)
     self.verbose = verbose
     self.action_space = self.get_action_space()
Example #5
    def __init__(self, sc2_env, game_number, env_name, action_component_names,
                 replay_dimension=256):
        time_string = "{}".format(int(time.time()))

        self.game_number = game_number
        basename = "game_" + str(self.game_number) + "_" + time_string + "_" + str(replay_dimension)
        self.json_pathname = os.path.join(REPLAY_DIR_PATH, basename + ".json")
        self.video_pathname = os.path.join(REPLAY_DIR_PATH, basename + ".mp4")
        self.saliency_pathname = os.path.join(REPLAY_DIR_PATH, basename + ".expl")

        self.sc2_env = sc2_env
        #self.game_clock_tick = 0
        self.frames = [] 
        self.action_component_names = action_component_names
        self.video = imutil.Video(filename=self.video_pathname, framerate=25)
        self.decision_point_number = 1
        self.explanation_points_array = []
        self.current_wave_number = 0
        self.jpg_number = 0
Example #6
File: main.py  Project: rahamor/scm-gan
def visualize_forward_simulation(datasource,
                                 encoder,
                                 decoder,
                                 transition,
                                 reward_pred,
                                 train_iter=0,
                                 timesteps=60,
                                 num_factors=16):
    start_time = time.time()
    print('Starting trajectory simulation for {} frames'.format(timesteps))
    states, rewards, dones, actions = datasource.get_trajectories(
        batch_size=1, timesteps=timesteps, random_start=False)
    states = torch.Tensor(states).cuda()
    num_actions = datasource.binary_input_channels
    num_rewards = datasource.scalar_output_channels
    # rgb_states = torch.Tensor(rgb_states.transpose(0, 1, 4, 2, 3)).cuda()
    # We begin *at* state t=2, then we simulate from t=2 until t=timesteps
    # Encoder input is t=0, t=1, t=2 to produce t=1
    z = encoder(states[:, :3])
    z = transition(z, torch.eye(num_actions)[actions[:, 1]].cuda())
    z = z.detach()  # detach() returns a new tensor; reassign so the rollout does not backprop into the encoder

    ftr_vid = imutil.Video('simulation_ftr_iter_{:06d}.mp4'.format(train_iter),
                           framerate=3)

    # First: replay in simulation the true trajectory
    caption = 'Real'
    simulate_trajectory_from_actions(z.clone(),
                                     decoder,
                                     reward_pred,
                                     transition,
                                     states,
                                     rewards,
                                     dones,
                                     actions,
                                     ftr_vid,
                                     caption_tag=caption,
                                     num_rewards=num_rewards,
                                     num_actions=num_actions)

    ftr_vid.finish()
    print('Finished trajectory simulation in {:.02f}s'.format(time.time() -
                                                              start_time))
Example #7
def generate_demo_video():
    env = ZerglingDefenseEnvironment()
    state = env.reset()
    done = False
    print('Playing sample game with environment {}'.format(env))
    filename = 'game-ZerglingDefense-{}.mp4'.format(int(time.time()))
    vid = imutil.Video(filename, framerate=8)

    # This function will run on each *rendered frame* of the game, including
    # frames in-between the agent's actions
    def video_write_frame(state, reward, done, info):
        minimap_ftr, screen_ftr, minimap_rgb, screen_rgb = state
        vid(screen_rgb, normalize=False)

    rewards = []
    for i in range(100):
        if done:
            break
        action = env.action_space.sample()
        state, reward, done, info = env.step(
            action, animation_callback=video_write_frame)
        video_write_frame(state, reward, done, info)

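        # Save per-step snapshots: the padded feature-layer grid, feature channel 10, and the RGB screen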
        imutil.show(state[1],
                    img_padding=8,
                    filename='ftr_{:06d}.png'.format(i),
                    resize_to=(600, 800))
        imutil.show(state[1][10],
                    filename='ftr10_{:06d}.png'.format(i),
                    resize_to=(512, 512))
        imutil.show(state[3],
                    filename='rgb_{:06d}.png'.format(i),
                    resize_to=(512, 512))
        rewards.append(reward)
        print('Timestep t={} took action {} got reward {}'.format(
            len(rewards), action, reward))
    vid.finish()
    print('Finished episode with {} total reward after {} timesteps'.format(
        sum(rewards), len(rewards)))
Example #8
File: recorder.py  Project: khlam/sc2env
 def __init__(self,
              sc2_env,
              game_number,
              env_name,
              tensor_action_key,
              tensor_reward_key,
              replay_dimension=256):
     #self.json_filename = env_name + "_" +  str(game_number) + ".json"
     #self.video_filename = env_name + "_" +  str(game_number) + ".mp4"
     time_string = "{}".format(int(time.time()))
     self.json_filename = "game_" + time_string + "_" + str(
         replay_dimension) + ".json"
     self.video_filename = "game_" + time_string + "_" + str(
         replay_dimension) + ".mp4"
     self.sc2_env = sc2_env
     self.game_clock_tick = 0
     self.frames = []
     self.action_names = [
         'Top_Left', 'Top_Right', 'Bottom_Left', 'Bottom_Right'
     ]
     self.video = imutil.Video(filename=self.video_filename)
     self.decision_point_number = 1
     self.tensor_action_key = tensor_action_key
     self.tensor_reward_key = tensor_reward_key
Example #9
def play_episode(env, agent, episode_num=0, video=False):
    start_time = time.time()
    print('Starting episode {}...'.format(episode_num))
    state = env.reset()
    done = False
    cumulative_reward = 0
    if video:
        vid = imutil.Video('training_episode_{:04d}.mp4'.format(episode_num))
    for t in range(MAX_STEPS):
        if done:
            break
        action = agent.step(state)
        state, reward, done, info = env.step(action)
        caption = 't={} reward={}'.format(t, reward)
        if video:
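            # state[3] is the RGB screen frame, matching the state tuple unpacked in Example #1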
            vid.write_frame(state[3], normalize=False)
        agent.update(reward)
        cumulative_reward += reward
    if video:
        vid.finish()
    print('Finished episode ({} actions) in {:.3f} sec total reward {}'.format(
        t,
        time.time() - start_time, cumulative_reward))
    return cumulative_reward
Example #10
        t_states.append(new_states)
        for i in range(batch_size):
            states[i] = new_states[i]
        t_rewards.append(rewards)
        t_dones.append(dones)
        t_actions.append(actions)
    # Reshape to (batch_size, timesteps, ...)
    s, r, d, i = [np.swapaxes(t, 0, 1) for t in (t_states, t_rewards, t_dones, t_actions)]
    return s, r, d, i



if __name__ == '__main__':
    import imutil
    batches = 10
    timesteps = 100
    batch_size = 1
    print('Simulation time benchmark: Centipede')
    vid = imutil.Video('centipede.mp4', framerate=5)
    start_time = time.time()
    for _ in range(batches):
        print('Simulating {} timesteps batch size {}...'.format(timesteps, batch_size))
        states, rewards, dones, actions = get_trajectories(batch_size, timesteps=timesteps)
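        # Write one captioned frame per timestep of the first game in the batch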
        for state, action, reward in zip(states[0], actions[0], rewards[0]):
            caption = "Prev. Action {} Prev Reward {}".format(action, reward)
            vid.write_frame(state.transpose(1,2,0), img_padding=8, resize_to=(512,512), caption=caption)
    duration = time.time() - start_time
    print('Finished simulating {} games for {} timesteps in {:.3f} sec'.format(
        batches * batch_size, timesteps * batches, duration))
    vid.finish()
Example #11
def visualize_bptt(z,
                   transition,
                   reward_predictor,
                   decoder,
                   rgb_decoder,
                   num_actions,
                   vid=None):
    z.retain_grad()
    actions = []
    zees = []
    if vid is None:
        vid = imutil.Video(filename='excitation_bptt_{}.mp4'.format(
            int(time.time())),
                           framerate=10)
    for t in range(30):
        a = onehot(1) if t == 0 else onehot(3)
        a.requires_grad = True
        a.retain_grad()

        actions.append(a)  # Keep track of previous actions

        z = transition(z, a)
        z.retain_grad()
        zees.append(z)

        r, rmap = reward_predictor(z, visualize=True)
        r.retain_grad()

        caption = 'Neural Simulation: expected r = {:.2f} {:.2f}'.format(
            r[0, 0], r[0, 1])
        vid.write_frame(rgb_decoder(decoder(z))[0],
                        resize_to=(512, 512),
                        caption=caption)
        rewards = rmap[0].sum(dim=0)
        rewards = torch.clamp(rewards * 128 + 128, 0, 255)
        #imutil.show(rewards, resize_to=(256, 256), normalize=False, save=False)
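        # When a large reward is predicted, hold the frame and backpropagate through time to localize its cause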
        if r.sum().abs() > 0.8:
            print('Expected reward of {:.2f} at time t+{}'.format(r.sum(), t))
            for _ in range(20):
                vid.write_frame(rgb_decoder(decoder(z))[0],
                                resize_to=(512, 512),
                                caption=caption)
            localized_expected_reward = (rmap *
                                         (rmap.abs() == rmap.abs().max()).type(
                                             torch.cuda.FloatTensor)).sum()
            localized_expected_reward.backward(retain_graph=True)
            print([at * at.grad for at in actions])
            '''
            for z in zees[::-1] + zees:
                caption = 'Plan for reward R={:.2f} at time t+{}'.format(r.sum(), t)
                mask = (z.grad.abs() / (.001 + z.grad.abs().max())) ** 0.5
                img = rgb_decoder(decoder(z * mask))[0]
                for _ in range(4):
                    vid.write_frame(img, resize_to=(512,512), img_padding=8, caption=caption)
            '''
            for z in zees[::-1]:
                caption = 'Causal Backtrack, reward R={:.2f} at time t+{}'.format(
                    r.sum(), t)
                mask = (z.grad.abs() / (.001 + z.grad.abs().max()))
                img1 = decoder(z * mask)[0].sum(dim=0)
                for _ in range(4):
                    vid.write_frame(img1,
                                    resize_to=(512, 512),
                                    img_padding=8,
                                    caption=caption)
            break
    return True
Example #12
def get_trajectories(batch_size=32, timesteps=10, policy='random', random_start=False, training=False):
    envs = MultiEnvironment([Env() for _ in range(batch_size)])
    t_states, t_rewards, t_dones, t_actions = [], [], [], []
    # Initial actions/stats
    actions = np.random.randint(envs.action_space.n, size=(batch_size,))
    for t in range(timesteps):
        states, rewards, dones, _ = envs.step(actions)
        rewards = [rewards]
        actions = np.random.randint(envs.action_space.n, size=(batch_size,))
        t_states.append(states)
        t_rewards.append(rewards)
        t_dones.append(dones)
        t_actions.append(actions)
    # Reshape to (batch_size, timesteps, ...)
    states = np.swapaxes(t_states, 0, 1)
    rewards = np.swapaxes(t_rewards, 0, 1)
    dones = np.swapaxes(t_dones, 0, 1)
    actions = np.swapaxes(t_actions, 0, 1)
    return states, rewards, dones, actions


if __name__ == '__main__':
    states, rewards, dones, actions = get_trajectories(batch_size=1, timesteps=100)
    import imutil
    vid = imutil.Video('gameoflife.mp4', framerate=5)
    for state, action, reward in zip(states[0], actions[0], rewards[0]):
        pixels = np.transpose(state, (1, 2, 0))
        caption = "Prev. Action {} Prev Reward {}".format(action, reward)
        vid.write_frame(pixels, img_padding=8, resize_to=(512,512), caption=caption)
    vid.finish()
Example #13
        if not ret:
            break
        yield frame[:,:,::-1]





fig = plt.figure(figsize=(6.4, 6.4))
ax = fig.add_subplot(111, projection='3d')
ax.view_init(10, 0)
fig.tight_layout(rect=[0, 0.01, 1, 0.99])

fa = face_alignment.FaceAlignment(face_alignment.LandmarksType._3D)

vid = imutil.Video('output_{}.mp4'.format(int(time.time())))

for img in read_images():
    start_time = time.time()
    preds = fa.get_landmarks(img)[0]
    print('Timing: {:.02f} seconds for one frame'.format(time.time() - start_time))
    # import pdb; pdb.set_trace()  # debugging breakpoint, disabled so the loop runs unattended
    #ax.set_xlim3d(-500, 500)
    #ax.set_ylim3d(-500, 500)
    #ax.set_zlim3d(-200, 200)
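    # Scatter the 3D landmarks, then show the input frame and the matplotlib render side by side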
    ax.scatter(preds[:, 2], preds[:, 0], -preds[:, 1])
    left = imutil.get_pixels(img, 640, 640)
    right = imutil.get_pixels(plt, 640, 640)
    pixels = np.concatenate([left, right], axis=1)
    vid.write_frame(pixels)
    imutil.show(pixels, save=False)
Example #14
def generate_images_batched(latents, max_batch_size=16):
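    # Yield generated images in chunks of at most max_batch_size latents at a time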
    i = 0
    while i < len(latents):
        for img in generate_images(latents[i:i+max_batch_size]):
            yield img
        i += max_batch_size


# Generate latent vectors.
latents = np.random.RandomState(1000).randn(1000, *Gs.input_shapes[0][1:]) # 1000 random latents
latents = latents[[477, 56, 83, 887, 583, 391, 86, 340, 341, 415]] # hand-picked top-10
images = generate_images(latents)
# Save images as JPEG files.
for idx in range(images.shape[0]):
    PIL.Image.fromarray(images[idx], 'RGB').save('img%d.jpg' % idx)


# Generate latent videos
latent_start = latents[6]
latent_end = latents[9]
FRAMES = 120
latent_interp = []
for i in range(FRAMES):
    theta = i / FRAMES
    # Interpolate linearly from latent_start (theta=0) toward latent_end (theta=1)
    latent_interp.append((1 - theta) * latent_start + theta * latent_end)

vid = imutil.Video('interpolated_face.mp4')
for img in generate_images_batched(np.array(latent_interp)):
    vid.write_frame(img)
vid.finish()
Example #15
        if policy == 'random':
            actions = np.random.randint(envs.action_space.n,
                                        size=(batch_size, ))
        if policy == 'repeat':
            actions = [i % envs.action_space.n for i in range(batch_size)]
        t_states.append(states)
        t_rewards.append(rewards)
        t_dones.append(dones)
        t_actions.append(actions)
    # Reshape to (batch_size, timesteps, ...)
    states = np.swapaxes(t_states, 0, 1)
    rewards = np.swapaxes(t_rewards, 0, 1)
    dones = np.swapaxes(t_dones, 0, 1)
    actions = np.swapaxes(t_actions, 0, 1)
    return states, rewards, dones, actions


if __name__ == '__main__':
    states, rewards, dones, actions = get_trajectories(batch_size=1,
                                                       timesteps=100)
    import imutil
    vid = imutil.Video('gridworld.mp4', framerate=5)
    for state, action, reward in zip(states[0], actions[0], rewards[0]):
        pixels = np.transpose(state, (1, 2, 0))
        caption = "Prev. Action {} Prev Reward {}".format(action, reward)
        vid.write_frame(pixels,
                        img_padding=8,
                        resize_to=(512, 512),
                        caption=caption)
    vid.finish()
Example #16
        if policy == 'random':
            actions = np.random.randint(envs.action_space.n,
                                        size=(batch_size, ))
        if policy == 'repeat':
            actions = [i % envs.action_space.n for i in range(batch_size)]
        states, rewards, dones, _ = envs.step(actions)
        t_states.append(states)
        t_rewards.append(rewards)
        t_dones.append(dones)
        t_actions.append(actions)
    # Reshape to (batch_size, timesteps, ...)
    states = np.swapaxes(t_states, 0, 1)
    rewards = np.swapaxes(t_rewards, 0, 1)
    dones = np.swapaxes(t_dones, 0, 1)
    actions = np.swapaxes(t_actions, 0, 1)
    return states, rewards, dones, actions


if __name__ == '__main__':
    states, rewards, dones, actions = get_trajectories(batch_size=1,
                                                       timesteps=200)
    import imutil
    vid = imutil.Video('roomba1.mp4', framerate=10)
    for state, action, reward in zip(states[0], actions[0], rewards[0]):
        pixels = np.transpose(state, (1, 2, 0))
        caption = "Action {} Reward {}".format(action, reward)
        vid.write_frame(pixels,
                        img_padding=8,
                        resize_to=(512, 512),
                        caption=caption)
    vid.finish()
Example #17
        if policy == 'random':
            actions = np.random.randint(envs.action_space.n,
                                        size=(batch_size, ))
        if policy == 'repeat':
            actions = [i % envs.action_space.n for i in range(batch_size)]
        states, rewards, dones, _ = envs.step(actions)
        t_states.append(states)
        t_rewards.append(rewards)
        t_dones.append(dones)
        t_actions.append(actions)
    # Reshape to (batch_size, timesteps, ...)
    states = np.swapaxes(t_states, 0, 1)
    rewards = np.swapaxes(t_rewards, 0, 1)
    dones = np.swapaxes(t_dones, 0, 1)
    actions = np.swapaxes(t_actions, 0, 1)
    return states, rewards, dones, actions


if __name__ == '__main__':
    states, rewards, dones, actions = get_trajectories(batch_size=1,
                                                       timesteps=100)
    import imutil
    vid = imutil.Video('realpong.mp4', framerate=5)
    for state, action, reward in zip(states[0], actions[0], rewards[0]):
        pixels = np.transpose(state, (1, 2, 0))
        caption = "Prev. Action {} Prev Reward {}".format(action, reward)
        vid.write_frame(pixels,
                        img_padding=8,
                        resize_to=(512, 512),
                        caption=caption)
    vid.finish()
Example #18
File: main.py  Project: rahamor/scm-gan
def visualize_reconstruction(datasource,
                             encoder,
                             decoder,
                             transition,
                             reward_predictor,
                             train_iter=0):
    num_actions = datasource.binary_input_channels
    num_rewards = datasource.scalar_output_channels
    timesteps = 45
    batch_size = 1
    states, rewards, dones, actions = datasource.get_trajectories(
        batch_size, timesteps, random_start=False)
    states = torch.Tensor(states).cuda()
    rewards = torch.Tensor(rewards).cuda()
    actions = torch.LongTensor(actions).cuda()
    offsets = [1, 3]
    print('Generating videos for offsets {}'.format(offsets))
    for offset in offsets:
        vid_rgb = imutil.Video('prediction_{:02}_iter_{:06d}.mp4'.format(
            offset, train_iter),
                               framerate=3)
        #vid_aleatoric = imutil.Video('anomaly_detection_{:02}_iter_{:06d}.mp4'.format(offset, train_iter), framerate=3)
        vid_reward = imutil.Video(
            'reward_prediction_{:02}_iter_{:06d}.mp4'.format(
                offset, train_iter),
            framerate=3)
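        # For each timestep: encode recent frames, step the transition model forward 'offset' times, then compare the prediction with ground truth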
        for t in range(3, timesteps - offset):
            # Encode frames t-2, t-1, t to produce state at t-1
            # Then step forward once to produce state at t
            z = encoder(states[:, t - 2:t + 1])
            z = transition(z, torch.eye(num_actions)[actions[:, t - 1]].cuda())

            # Now step forward *offset* times to produce state at t+offset
            for t_i in range(t, t + offset):
                onehot_a = torch.eye(num_actions)[actions[:, t_i]].cuda()
                z = transition(z, onehot_a)

            # Our prediction of the world from 'offset' steps back
            predicted_features = decoder(z)
            predicted_features = torch.sigmoid(predicted_features)
            predicted_rgb = predicted_features
            predicted_reward, reward_map = reward_predictor(z, visualize=True)

            # The ground truth
            actual_features = states[:, t + offset]
            actual_rgb = convert_ndim_image_to_rgb(actual_features)

            # Difference between actual and predicted outcomes is "surprise"
            surprise_map = torch.clamp(
                (actual_features - predicted_features)**2, 0, 1)

            #caption = "t={} surprise (aleatoric): {:.03f}".format(t, surprise_map.sum())
            #pixels = composite_aleatoric_surprise_image(actual_rgb, surprise_map, z)
            #vid_aleatoric.write_frame(pixels, normalize=False, img_padding=8, caption=caption)

            caption = "Left: True t={} Right: Predicted t+{}, Pred. R: {}".format(
                t, offset, format_reward_vector(predicted_reward[0]))
            pixels = composite_feature_rgb_image(actual_features, actual_rgb,
                                                 predicted_features,
                                                 predicted_rgb)
            vid_rgb.write_frame(pixels,
                                normalize=False,
                                img_padding=8,
                                caption=caption)

            caption = "t={} fwd={}, Pred. R: {}".format(
                t, offset, format_reward_vector(predicted_reward[0]))
            reward_pixels = composite_rgb_reward_factor_image(
                predicted_rgb, reward_map, z, num_rewards=num_rewards)
            vid_reward.write_frame(reward_pixels,
                                   normalize=False,
                                   caption=caption)

        vid_rgb.finish()
        #vid_aleatoric.finish()
        vid_reward.finish()
    print('Finished generating forward-prediction videos')
Example #19
        dones_batch), np.array(actions_batch)


def convert_frame(state):
    return state.transpose((2, 0, 1)).copy()


if __name__ == '__main__':
    start_time = time.time()

    env = make_env()
    simulate_to_replay_buffer(1)

    env = make_env()
    batch_size = 8
    vid = imutil.Video('minipacman.mp4', framerate=5)
    states, rewards, dones, actions = get_trajectories(batch_size,
                                                       random_start=False,
                                                       timesteps=100)
    i = 0
    for state, reward, done, action in zip(states[0], rewards[0], dones[0],
                                           actions[0]):
        caption = "t={} Prev. Action {} Prev Reward {} Done {}".format(
            i, action, reward, done)
        vid.write_frame(state.transpose(1, 2, 0),
                        img_padding=8,
                        resize_to=(512, 512),
                        caption=caption)
        print('state {}, {}'.format(state.mean(), caption))
        i += 1
    duration = time.time() - start_time
Example #20
File: main.py  Project: rahamor/scm-gan
def play(latent_dim, datasource, num_actions, num_rewards, encoder, decoder,
         reward_predictor, discriminator, transition):

    # Initialize environment
    env = datasource.make_env(screen_size=512)

    # No-op through the first 3 frames for initial state estimation
    state = env.reset()
    no_op = 3
    s_0, _ = datasource.convert_frame(state)
    state, reward, done, info = env.step(no_op)
    s_1, _ = datasource.convert_frame(state)
    state, reward, done, info = env.step(no_op)
    s_2, _ = datasource.convert_frame(state)
    state_list = [s_0, s_1, s_2]
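    # Keep a sliding window of the three most recent feature frames as encoder input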

    # Estimate initial state (given t=0,1,2 estimate state at t=2)
    states = torch.Tensor(state_list).cuda().unsqueeze(0)
    z = encoder(states)
    z = transition(z, onehot(no_op, num_actions))

    cumulative_reward = 0
    filename = 'SimpleRolloutAgent-{}.mp4'.format(int(time.time()))
    vid = imutil.Video(filename, framerate=10)
    t = 2
    cumulative_negative_reward = 0
    cumulative_positive_reward = 0
    while not done:
        z = z.detach()
        # In simulation, compute all possible futures to select the best action
        rewards = []
        for a in range(num_actions):
            z_a = transition(z, onehot(a, num_actions))
            # Roll out rollout_depth=12 simulated steps into the future
            r_a = compute_rollout_reward(z_a,
                                         transition,
                                         reward_predictor,
                                         num_actions,
                                         a,
                                         rollout_depth=12,
                                         rollout_policy='noop')
            rewards.append(r_a)
            #print('Expected reward from taking action {} is {:.03f}'.format(a, r_a))
        max_r = max(rewards)
        max_a = int(np.argmax(rewards))

        # Take the best action, in real life
        new_state, new_reward, done, info = env.step(max_a)

        if len(info) > 1:
            positive_reward = sum(v for v in info.values() if v > 0)
            negative_reward = sum(v for v in info.values() if v < 0)
        else:
            positive_reward = max(0, new_reward)
            negative_reward = min(0, new_reward)

        cumulative_positive_reward += positive_reward
        cumulative_negative_reward -= negative_reward
        cumulative_reward += new_reward

        # Re-estimate state
        ftr_state, rgb_state = datasource.convert_frame(new_state)
        print('t={} curr. r={:.02f} future r: {:.02f} {:.02f} {:.02f} {:.02f}'.
              format(t, cumulative_reward, rewards[0], rewards[1], rewards[2],
                     rewards[3]))
        caption = 'Negative Reward: {}    Positive Reward: {}'.format(
            int(cumulative_negative_reward), int(cumulative_positive_reward))
        print(caption)
        vid.write_frame(rgb_state, resize_to=(512, 512), caption=caption)

        state_list = state_list[1:] + [ftr_state]
        z = encoder(torch.Tensor(state_list).cuda().unsqueeze(0))
        z = transition(z, onehot(max_a, num_actions))
        t += 1
        if t > 300:
            print('Ending evaluation due to time limit')
            break
    vid.finish()
    msg = 'Finished at t={} with cumulative reward {}'.format(
        t, cumulative_reward)
    with open('evaluation_metrics_{}.txt'.format(int(time.time())), 'w') as fp:
        fp.write(msg + '\n')
    print(msg)