Example #1
def main():
    print("###############################################################")
    print("#################### VIZDOOM LEARNER START ####################")
    print("###############################################################")

    save_path = os.path.join(args.save_dir, "a2c")
    num_updates = int(args.num_frames) // args.num_steps // args.num_processes
    reward_name = ""
    if args.roe:
        reward_name = "_event"
    scenario_name = args.config_path.split("/")[1].split(".")[0]
    print("############### " + scenario_name + " ###############")
    log_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.agent_id) + ".log"
    log_event_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.agent_id) + ".eventlog"
    log_event_reward_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.agent_id) + ".eventrewardlog"
    start_updates = 0
    start_step = 0
    best_final_rewards = -1000000.0

    os.environ['OMP_NUM_THREADS'] = '1'

    global envs
    es = [
        make_env(i, args.config_path, visual=args.visual, bots=args.bots)
        for i in range(args.num_processes)
    ]
    envs = VecEnv([es[i] for i in range(args.num_processes)])

    obs_shape = envs.observation_space_shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.resume:
        actor_critic = torch.load(
            os.path.join(save_path, log_file_name + ".pt"))
        filename = glob.glob(os.path.join(args.log_dir, log_file_name))[0]
        if args.roe:
            pass  # TODO: Load event buffer
        with open(filename) as file:
            lines = file.readlines()
            start_updates = int(lines[-1].strip().split(",")[0])
            start_step = int(lines[-1].strip().split(",")[1])
            num_updates += start_updates
    else:
        if not args.debug:
            try:
                os.makedirs(args.log_dir)
            except OSError:
                files = glob.glob(os.path.join(args.log_dir, log_file_name))
                for f in files:
                    os.remove(f)
                with open(log_file_name, "w") as myfile:
                    myfile.write("")
                files = glob.glob(
                    os.path.join(args.log_dir, log_event_file_name))
                for f in files:
                    os.remove(f)
                with open(log_event_file_name, "w") as myfile:
                    myfile.write("")
                files = glob.glob(
                    os.path.join(args.log_dir, log_event_reward_file_name))
                for f in files:
                    os.remove(f)
                with open(log_event_reward_file_name, "w") as myfile:
                    myfile.write("")
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)

    action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs
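        # Note: with num_stack > 1, current_obs holds the last num_stack
        # frames concatenated along the channel axis; each call shifts that
        # window left by one frame and writes the newest frame into the tail.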

    obs = envs.reset()
    update_current_obs(obs)
    last_game_vars = []
    for i in range(args.num_processes):
        last_game_vars.append(np.zeros(args.num_events))

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    episode_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    final_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    episode_events = torch.zeros([args.num_processes, args.num_events])
    final_events = torch.zeros([args.num_processes, args.num_events])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    # Create event buffer
    if args.qd:
        event_buffer = EventBufferSQLProxy(args.num_events, args.capacity,
                                           args.exp_id, args.agent_id)
    elif not args.resume:
        event_buffer = EventBuffer(args.num_events, args.capacity)
    else:
        event_buffer = pickle.load(
            open(log_file_name + "_event_buffer_temp.p", "rb"))

    event_episode_rewards = []

    start = time.time()
    for j in np.arange(start_updates, num_updates):
        for step in range(args.num_steps):

            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            obs, reward, done, info, events = envs.step(cpu_actions)
            intrinsic_reward = []

            # Fix broken scenario rewards: upscale sparse ones, binarize deadly_corridor
            for i in range(len(reward)):
                if scenario_name in ["deathmatch", "my_way_home"]:
                    reward[i] *= 100
                if scenario_name == "deadly_corridor":
                    reward[i] = 1 if events[i][2] >= 1 else 0

            for e in events:
                if args.roe:
                    intrinsic_reward.append(event_buffer.intrinsic_reward(e))
                else:
                    r = reward[len(intrinsic_reward)]
                    intrinsic_reward.append(r)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            intrinsic_reward = torch.from_numpy(
                np.expand_dims(np.stack(intrinsic_reward), 1)).float()
            #events = torch.from_numpy(np.expand_dims(np.stack(events), args.num_events)).float()
            events = torch.from_numpy(events).float()
            episode_rewards += reward
            episode_intrinsic_rewards += intrinsic_reward
            episode_events += events

            # Event stats
            event_rewards = []
            for ei in range(0, args.num_events):
                ev = np.zeros(args.num_events)
                ev[ei] = 1
                er = event_buffer.intrinsic_reward(ev)
                event_rewards.append(er)

            event_episode_rewards.append(event_rewards)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_intrinsic_rewards *= masks
            final_events *= masks
            final_rewards += (1 - masks) * episode_rewards
            final_intrinsic_rewards += (1 - masks) * episode_intrinsic_rewards
            final_events += (1 - masks) * episode_events

            for i in range(args.num_processes):
                if done[i]:
                    event_buffer.record_events(np.copy(
                        final_events[i].numpy()),
                                               frame=j * args.num_steps)

            episode_rewards *= masks
            episode_intrinsic_rewards *= masks
            episode_events *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)

            rollouts.insert(step, current_obs, action.data, value.data,
                            intrinsic_reward, masks)

        final_episode_reward = np.mean(event_episode_rewards, axis=0)
        event_episode_rewards = []

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(
                -1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.actions.view(-1, action_shape)))

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)
        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()

        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

        optimizer.step()

        rollouts.observations[0].copy_(rollouts.observations[-1])

        if final_rewards.mean() > best_final_rewards and not args.debug:
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            best_final_rewards = final_rewards.mean()
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(
                save_model,
                os.path.join(save_path,
                             log_file_name.split(".log")[0] + ".pt"))

        if j % args.save_interval == 0 and args.save_dir != "" and not args.debug:
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, log_file_name + "_temp.pt"))
            if isinstance(event_buffer, EventBuffer):
                pickle.dump(event_buffer,
                            open(log_file_name + "_event_buffer_temp.p", "wb"))

        if j % args.log_interval == 0:

            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            log = "Updates {}, num timesteps {}, FPS {}, mean/max reward {:.5f}/{:.5f}, mean/max intrinsic reward {:.5f}/{:.5f}"\
                .format(j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            final_rewards.mean(),
                            final_rewards.max(),
                            final_intrinsic_rewards.mean(),
                            final_intrinsic_rewards.max()
                        )
            log_to_file = "{}, {}, {:.5f}, {:.5f}, {:.5f}, {:.5f}\n" \
                .format(j, total_num_steps,
                        final_rewards.mean(),
                        final_rewards.std(),
                        final_intrinsic_rewards.mean(),
                        final_intrinsic_rewards.std())
            log_to_event_file = ','.join(
                map(str,
                    event_buffer.get_event_mean().tolist())) + "\n"
            log_to_event_reward_file = ','.join(
                map(str,
                    event_buffer.get_event_rewards().tolist())) + "\n"
            print(log)
            print(log_to_event_file)

            # Save to files
            with open(log_file_name, "a") as myfile:
                myfile.write(log_to_file)
            with open(log_event_file_name, "a") as myfile:
                myfile.write(str(total_num_steps) + "," + log_to_event_file)
            with open(log_event_reward_file_name, "a") as myfile:
                myfile.write(
                    str(total_num_steps) + "," + log_to_event_reward_file)

    envs.close()
    time.sleep(5)
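
The reward bookkeeping above relies on a mask trick: masks is 0.0 for processes whose episode just ended and 1.0 otherwise, so a handful of tensor operations simultaneously freezes, copies, and resets the per-process accumulators. Below is a minimal, self-contained sketch of that pattern; num_processes and the function name step_bookkeeping are illustrative, not taken from the original modules.

import torch

num_processes = 4
episode_rewards = torch.zeros(num_processes, 1)  # running return of the current episode
final_rewards = torch.zeros(num_processes, 1)    # return of the last finished episode

def step_bookkeeping(reward, done):
    # reward: (num_processes, 1) float tensor; done: per-process booleans.
    episode_rewards.add_(reward)
    masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done])
    final_rewards.mul_(masks)                          # keep old value where still running
    final_rewards.add_((1 - masks) * episode_rewards)  # copy the return where done
    episode_rewards.mul_(masks)                        # reset the accumulator where done
    return masks

The same masks are later broadcast over current_obs so that finished environments start their next episode from a zeroed frame stack.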
Example #2
def main():
    print("###############################################################")
    print("#################### VISDOOM LEARNER START ####################")
    print("###############################################################")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    global envs
    envs = VecEnv(
        [make_env(i, args.config_path) for i in range(args.num_processes)],
        logging=True,
        log_dir=args.log_dir)

    obs_shape = envs.observation_space_shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.algo == 'a2c' or args.algo == 'acktr':
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)
    elif args.algo == 'a2t':
        source_models = []
        files = glob.glob(os.path.join(args.source_models_path, '*.pt'))
        for file in files:
            print('Loading model:', file)
            source_models.append(torch.load(file))
        actor_critic = A2TPolicy(obs_shape[0], envs.action_space_shape,
                                 source_models)
    elif args.algo == 'resnet':
        # args.num_stack = 3
        actor_critic = ResnetPolicy(obs_shape[0], envs.action_space_shape)

    action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c' or args.algo == 'resnet':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'a2t':
        a2t_params = [p for p in actor_critic.parameters() if p.requires_grad]
        optimizer = optim.RMSprop(a2t_params,
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):  # num_updates is resolved from module scope
        for step in range(args.num_steps):
            # Sample actions
            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            # print ('Actions:', cpu_actions, 'Rewards:', reward)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, action.data, value.data, reward,
                            masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(
                -1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr', 'a2t', 'resnet']:
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c' or args.algo == 'resnet':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)
            elif args.algo == 'a2t':
                nn.utils.clip_grad_norm(a2t_params, args.max_grad_norm)

            optimizer.step()

        rollouts.observations[0].copy_(rollouts.observations[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        -dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, 'VizDoom', args.algo)
            except IOError:
                pass
    envs.close()
    time.sleep(5)
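
Each update above is a standard A2C step: the advantage is the bootstrapped return minus the value prediction, the critic minimizes the squared advantage, and the actor follows the policy gradient with the advantage detached from the graph. A condensed sketch of just the loss, with illustrative coefficient values (the real ones come from args):

import torch

value_loss_coef, entropy_coef = 0.5, 0.01

def a2c_loss(returns, values, action_log_probs, dist_entropy):
    # returns, values, action_log_probs: (num_steps, num_processes, 1) tensors.
    advantages = returns - values
    value_loss = advantages.pow(2).mean()  # critic: squared advantage
    # Detach so the policy gradient does not backpropagate into the critic.
    action_loss = -(advantages.detach() * action_log_probs).mean()
    # Subtracting entropy rewards more exploratory policies.
    return value_loss * value_loss_coef + action_loss - dist_entropy * entropy_coef

Here advantages.detach() plays the role of Variable(advantages.data) in the pre-0.4 PyTorch API these examples use.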
Example #3
def main():
    print("###############################################################")
    print("#################### VIZDOOM LEARNER START ####################")
    print("###############################################################")

    save_path = os.path.join(args.save_dir, str(args.exp_id))
    log_path = os.path.join(args.log_dir, str(args.exp_id))
    num_updates = int(args.num_frames) // args.num_steps // args.num_processes
    reward_name = ""
    if args.roe:
        reward_name = "_event"
    scenario_name = args.config_path.split("/")[1].split(".")[0]
    print("############### " + scenario_name + " ###############")
    log_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.exp_id) + "_" + str(args.agent_id) + ".log"
    #log_event_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(args.exp_id) + "_" + str(args.agent_id) + ".eventlog"
    #log_event_reward_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(args.exp_id) + "_" + str(args.agent_id) + ".eventrewardlog"
    start_updates = 0
    start_step = 0
    best_final_rewards = -1000000.0

    os.environ['OMP_NUM_THREADS'] = '1'

    cig = "cig" in args.config_path
    global envs
    es = [
        make_env(i, args.config_path, visual=args.visual, cig=cig)
        for i in range(args.num_processes)
    ]
    envs = VecEnv([es[i] for i in range(args.num_processes)])

    obs_shape = envs.observation_space_shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.resume:
        actor_critic = torch.load(
            os.path.join(save_path, f"{args.agent_id}.pt"))
        filename = glob.glob(os.path.join(log_path, log_file_name))[0]
        with open(filename) as file:
            lines = file.readlines()
            start_updates = int(lines[-1].strip().split(",")[0])
            start_step = int(lines[-1].strip().split(",")[1])
            num_updates += start_updates
    else:
        try:
            os.makedirs(save_path)
        except OSError:
            pass
        try:
            os.makedirs(log_path)
        except OSError:
            files = glob.glob(os.path.join(args.log_dir, log_file_name))
            for f in files:
                os.remove(f)
            #with open(log_file_name, "w") as myfile:
            #    myfile.write("")
            #files = glob.glob(os.path.join(args.log_dir, log_event_file_name))
            #for f in files:
            #    os.remove(f)
            #with open(log_event_file_name, "w") as myfile:
            #    myfile.write("")
            #files = glob.glob(os.path.join(args.log_dir, log_event_reward_file_name))
            #for f in files:
            #    os.remove(f)
            #with open(log_event_reward_file_name, "w") as myfile:
            #    myfile.write("")
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)

    action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    last_game_vars = []
    for i in range(args.num_processes):
        last_game_vars.append(np.zeros(args.num_events))

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    episode_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    final_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    episode_events = torch.zeros([args.num_processes, args.num_events])
    final_events = torch.zeros([args.num_processes, args.num_events])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    def mean_distance_to_nearest_neighbor(elite_events):
        # Mean, over all elites, of each elite's distance to its nearest
        # other elite; a spread measure for the archive.
        d = []
        nearest = None
        for a in range(len(elite_events)):
            for b in range(len(elite_events)):
                if a != b:
                    elite_a = elite_events[a]
                    elite_b = elite_events[b]
                    dist = np.linalg.norm(elite_a - elite_b)
                    if nearest is None or dist < nearest:
                        nearest = dist
            if nearest is not None:
                d.append(nearest)
            nearest = None
        return np.mean(d)

    def distance_to_nearest_neighbor(elite_events, events):
        # Euclidean distance from events to the closest elite in the archive.
        nearest = None
        for elite_a in elite_events:
            dist = np.linalg.norm(elite_a - events)
            if nearest is None or dist < nearest:
                nearest = dist
        return nearest

    def add_to_archive(frame, episode_length):
        #print("Final rewards: ", final_rewards.numpy())
        fitness = final_rewards.numpy().mean()
        #print("raw: ", final_events.numpy())
        behavior = final_events.numpy().mean(axis=0)
        #print("Fitness:", fitness)
        #print("Behavior:", behavior)
        neighbors = event_buffer.get_neighbors(behavior, args.niche_divs,
                                               episode_length)

        add = len(neighbors) == 0
        for neighbor in neighbors:
            if fitness > neighbor.fitness:
                add = True
            else:
                add = False
                break

        if add:
            if len(neighbors) > 0:
                event_buffer.remove_elites(neighbors)
                #print(f"- Removing elites {[neighbor.elite_id for neighbor in neighbors]}")
            for neighbor in neighbors:
                try:
                    #print(f"- Deleting model {neighbor.elite_id}")
                    os.remove(
                        os.path.join(save_path, f"{neighbor.elite_id}.pt"))
                    #print("Successfully deleted model with id : ", neighbor.elite_id)
                except OSError:
                    print("Error while deleting model with id : ",
                          neighbor.elite_id)
            name = str(uuid.uuid1())
            #print("Adding elite")
            event_buffer.add_elite(name, behavior, fitness, frame,
                                   episode_length)
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, f"{name}.pt"))

    # Create event buffer
    event_buffer = EventBufferSQLProxy(args.num_events,
                                       args.capacity,
                                       args.exp_id,
                                       args.agent_id,
                                       qd=args.qd,
                                       per_step=args.per_step)

    event_episode_rewards = []

    episode_finished = np.zeros(args.num_processes)

    start = time.time()
    for j in np.arange(start_updates, num_updates):
        for step in range(args.num_steps):

            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            obs, reward, done, info, events = envs.step(cpu_actions)
            intrinsic_reward = []

            # Fix broken scenario rewards: upscale sparse ones, binarize deadly_corridor
            for i in range(len(reward)):
                if scenario_name in ["deathmatch", "my_way_home"]:
                    reward[i] *= 100
                if scenario_name == "deadly_corridor":
                    reward[i] = 1 if events[i][2] >= 1 else 0

            for e in events:
                if args.roe:
                    ir = event_buffer.intrinsic_reward(e)
                    if args.per_step:
                        # Presumably normalizing by the episode timeout in tics.
                        ir = ir / 4200
                    intrinsic_reward.append(ir)
                else:
                    r = reward[len(intrinsic_reward)]
                    intrinsic_reward.append(r)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            intrinsic_reward = torch.from_numpy(
                np.expand_dims(np.stack(intrinsic_reward), 1)).float()
            #events = torch.from_numpy(np.expand_dims(np.stack(events), args.num_events)).float()
            events = torch.from_numpy(events).float()
            episode_rewards += reward
            episode_intrinsic_rewards += intrinsic_reward
            episode_events += events

            # Event stats
            '''
            event_rewards = []
            for ei in range(0,args.num_events):
                ev = np.zeros(args.num_events)
                ev[ei] = 1
                er = event_buffer.intrinsic_reward(ev)
                if args.per_step:
                    er = er / 4200
                event_rewards.append(er)

            event_episode_rewards.append(event_rewards)
            '''

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_intrinsic_rewards *= masks
            final_events *= masks
            final_rewards += (1 - masks) * episode_rewards
            final_intrinsic_rewards += (1 - masks) * episode_intrinsic_rewards
            final_events += (1 - masks) * episode_events

            for i in range(args.num_processes):
                if done[i]:
                    #event_buffer.record_events(np.copy(final_events[i].numpy()), frame=j*args.num_steps*args.num_processes)
                    episode_length = (step +
                                      j * args.num_steps) - episode_finished[i]
                    episode_finished[i] = episode_length + episode_finished[i]
                    add_to_archive(
                        step * args.num_processes +
                        j * args.num_steps * args.num_processes,
                        episode_length)

            episode_rewards *= masks
            episode_intrinsic_rewards *= masks
            episode_events *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)

            rollouts.insert(step, current_obs, action.data, value.data,
                            intrinsic_reward, masks)

        #final_episode_reward = np.mean(event_episode_rewards, axis=0)
        #event_episode_rewards = []

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(
                -1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.actions.view(-1, action_shape)))

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)
        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()

        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

        optimizer.step()

        rollouts.observations[0].copy_(rollouts.observations[-1])

        if j % args.log_interval == 0:

            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            log = "Updates {}, num timesteps {}, FPS {}, mean/max reward {:.5f}/{:.5f}, mean/max intrinsic reward {:.5f}/{:.5f}"\
                .format(j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            final_rewards.mean(),
                            final_rewards.max(),
                            final_intrinsic_rewards.mean(),
                            final_intrinsic_rewards.max()
                        )

            log_to_file = "{}, {}, {:.5f}, {:.5f}, {:.5f}, {:.5f}\n" \
                .format(j, total_num_steps,
                        final_rewards.mean(),
                        final_rewards.std(),
                        final_intrinsic_rewards.mean(),
                        final_intrinsic_rewards.std())

            with open(os.path.join(log_path, log_file_name), "a") as myfile:
                myfile.write(log_to_file)

            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, f"{args.agent_id}.pt"))

            print(log)

    envs.close()
    time.sleep(5)
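
The add_to_archive helper in this example applies a MAP-Elites-style acceptance rule: the mean event counts of a finished episode serve as the behavior descriptor, and the current policy is stored as a new elite only if its fitness strictly beats every existing elite in the same niche, which are then evicted. A standalone sketch of that rule, where Elite is a hypothetical stand-in for the records returned by event_buffer.get_neighbors:

from dataclasses import dataclass

@dataclass
class Elite:
    elite_id: str
    fitness: float

def should_add(candidate_fitness, neighbors):
    # Empty niche: always add. Non-empty niche: require strict improvement
    # over every neighbor, mirroring the add/break loop in add_to_archive.
    return all(candidate_fitness > n.fitness for n in neighbors)

For example, should_add(5.0, [Elite("a", 3.0), Elite("b", 4.5)]) is True (both neighbors would be evicted), while should_add(5.0, [Elite("a", 6.0)]) is False.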