Example #1
def env_rand_gen_and_save(env_name,
                          num_attr_N=11,
                          num_attr_E=4,
                          T=10,
                          graphid=1,
                          numNodes=30,
                          numEdges=100,
                          numRoot=4,
                          numGoals=6,
                          history=3):
    env = Environment(num_attr_N=num_attr_N,
                      num_attr_E=num_attr_E,
                      T=T,
                      graphid=graphid,
                      numNodes=numNodes,
                      numEdges=numEdges,
                      numRoot=numRoot,
                      numGoals=numGoals,
                      history=history)
    env.randomDAG()
    path = osp.join(settings.get_env_data_dir(), "{}.pkl".format(env_name))
    print("env path is ", path)
    # if fp.isExist(path):
    #     raise ValueError("Env with such name already exists.")
    fp.save_pkl(env, path)
    print(env_name + " has been saved.")
    return env
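
A minimal usage sketch for the helper above, assuming the module's existing imports (Environment, settings, fp, os.path as osp); the environment name is illustrative, not taken from the source:

# Hypothetical call: build a random 30-node DAG and persist it under
# settings.get_env_data_dir() as "test_env.pkl" via fp.save_pkl.
env = env_rand_gen_and_save("test_env", numNodes=30, numEdges=100, numGoals=6)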
Example #2
def env_rand_gen(env_name,
                 num_attr_N=11,
                 num_attr_E=4,
                 T=10,
                 graphid=1,
                 numNodes=30,
                 numEdges=100,
                 numRoot=5,
                 numGoals=6,
                 history=3):
    env = dag.Environment(num_attr_N=num_attr_N,
                          num_attr_E=num_attr_E,
                          T=T,
                          graphid=graphid,
                          numNodes=numNodes,
                          numEdges=numEdges,
                          numRoot=numRoot,
                          numGoals=numGoals,
                          history=history)
    # env.randomDAG()
    env.load_graph()
    path = os.getcwd() + "/env_data/" + env_name + ".pkl"
    print("env path is ", path)
    fp.save_pkl(env, path)
    print(env_name + " has been saved.")
    return env
Example #3
def run(load_env, env_name, n_processes):
    """ Run the double-oracle algorithm. """
    # Create initial policies.
    fp.save_pkl(
        uniform_str_init.act_att,
        osp.join(settings.get_attacker_strategy_dir(), "att_str_epoch1.pkl"))
    fp.save_pkl(
        uniform_str_init.act_def,
        osp.join(settings.get_defender_strategy_dir(), "def_str_epoch1.pkl"))

    game = initialize(load_env=load_env,
                      env_name=env_name,
                      n_processes=n_processes)
    _run(game.env,
         game,
         meta_method_name=FLAGS.meta_method,
         n_processes=n_processes)
Example #4
def _train(policy_save_path, opponent, writer):
    env = GridWorldSoccer()
    env = MultiToSingleAgentWrapper(env=env,
                                    agent_id=1,
                                    opponents={2: opponent})

    save_path = osp.join(settings.get_run_dir(),
                         osp.basename(policy_save_path))
    save_path = save_path[:-4]  # Remove ".pkl".

    trainer = Trainer(policy_ctor=DQN)
    best_response, _, replay_buffer, _ = trainer.run(
        env=env, name=osp.basename(policy_save_path), writer=writer)

    # Save data to results folder for QMixture.
    torch.save(best_response, f"{save_path}.pkl", pickle_module=dill)
    fp.save_pkl(replay_buffer, f"{save_path}.replay_buffer.pkl")

    return best_response, replay_buffer
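
A hedged sketch of reading these artifacts back later. It mirrors the save calls above; the path stem is illustrative, and fp.load_pkl is the loader already used elsewhere in this codebase (see initialize below):

import dill
import torch

# Hypothetical reload of a best response and replay buffer saved by _train.
save_path = osp.join(settings.get_run_dir(), "dqn_vs_opponent")  # example stem
best_response = torch.load(f"{save_path}.pkl", pickle_module=dill)
replay_buffer = fp.load_pkl(f"{save_path}.replay_buffer.pkl")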
Example #5
def initialize(load_env=None, env_name=None, n_processes: int = 1):
    logger.info("=======================================================")
    logger.info("=======Begin Initialization and first epoch============")
    logger.info("=======================================================")

    # Create Environment
    if isinstance(load_env, str):
        path = osp.join(settings.get_env_data_dir(), "{}.pkl".format(load_env))
        if not fp.isExist(path):
            raise ValueError("The env being loaded does not exist.")
        env = fp.load_pkl(path)
    else:
        # env is created and saved.
        env = dag.env_rand_gen_and_save(env_name)

    # save graph copy
    env.save_graph_copy()
    env.save_mask_copy()  # TODO: change transfer

    # create players and point to their env
    env.create_players()
    env.create_action_space()

    # Log the root nodes and OR edges.
    roots = env.get_Roots()
    logger.info(f"Root Nodes: {roots}")
    ed = env.get_ORedges()
    logger.info(f"Or edges: {ed}")

    # initialize game data
    game = empirical_game.EmpiricalGame(env)
    game.env.defender.set_env_belong_to(game.env)
    game.env.attacker.set_env_belong_to(game.env)

    # These calls appear redundant with the game.env calls above.
    env.defender.set_env_belong_to(env)
    env.attacker.set_env_belong_to(env)

    # The uniform epoch-1 strategies were saved ahead of time (see run()).
    logger.info("Epoch 1")
    epoch = 1
    epoch_dir = osp.join(settings.get_results_dir(), f"epoch_{epoch}")
    writer = SummaryWriter(logdir=epoch_dir)

    act_att = 'att_str_epoch1.pkl'
    act_def = 'def_str_epoch1.pkl'

    game.add_att_str(act_att)
    game.add_def_str(act_def)

    logger.info('Begin simulation for uniform strategy.')
    aReward, dReward = simulation.simulate_profile(
        env=game.env,
        game=game,
        nn_att=act_att,
        nn_def=act_def,
        n_episodes=game.num_episodes,
        n_processes=n_processes,
        save_dir=epoch_dir,
        summary_writer=writer)
    logger.info('Done simulation for uniform strategy.')

    game.init_payoffmatrix(dReward, aReward)
    ne = {
        0: np.array([1], dtype=np.float32),
        1: np.array([1], dtype=np.float32),
    }
    game.add_nasheq(epoch, ne)

    # save a copy of game data
    game_path = osp.join(settings.get_run_dir(), "game.pkl")
    fp.save_pkl(game, game_path)

    sys.stdout.flush()
    return game
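
For reference, a hedged sketch of the two ways initialize can be driven, following the isinstance(load_env, str) branch above (the environment name is illustrative):

# Load an existing pickled environment from settings.get_env_data_dir():
game = initialize(load_env="test_env", env_name=None, n_processes=1)

# Or generate and save a fresh random DAG environment instead:
game = initialize(load_env=None, env_name="test_env", n_processes=1)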
Example #6
def _run(env,
         game,
         meta_method_name,
         epoch: int = 1,
         game_path: str = None,
         n_processes: int = 1):
    assert n_processes > 0, "Invalid number of processors."
    if game_path is None:
        game_path = osp.join(settings.get_run_dir(), "game.pkl")

    logger.info("=======================================================")
    logger.info("===============Begin Running DO-EGTA===================")
    logger.info("=======================================================")

    proc = psutil.Process(os.getpid())
    result_dir = settings.get_run_dir()

    selector = meta_method_selector(meta_method_name)

    count = 80
    while count != 0:
        mem0 = proc.memory_info().rss

        # Fix opponent strategy.
        mix_str_def, mix_str_att = selector.sample(game, epoch)

        # Save mixed strategies.
        # with open(osp.join(result_dir, f"mix_defender.{epoch}.pkl"), "wb") as outfile:
        #     pickle.dump(mix_str_def, outfile)
        # with open(osp.join(result_dir, f"mix_attacker.{epoch}.pkl"), "wb") as outfile:
        #     pickle.dump(mix_str_att, outfile)
        # with open(osp.join(result_dir, f"payoff_defender.{epoch}.pkl"), "wb") as outfile:
        #     pickle.dump(game.payoffmatrix_def, outfile)
        # with open(osp.join(result_dir, f"payoff_attacker.{epoch}.pkl"), "wb") as outfile:
        #     pickle.dump(game.payoffmatrix_att, outfile)

        # Equilibrium pay-off.
        aPayoff, dPayoff = util.payoff_mixed_NE(game, epoch)
        game.att_payoff.append(aPayoff)
        game.def_payoff.append(dPayoff)

        # increase epoch
        epoch += 1
        logger.info("Epoch " + str(epoch))
        epoch_dir = osp.join(result_dir, f"epoch_{epoch}")

        # Summary writer for each epoch.
        writer = SummaryWriter(logdir=epoch_dir)

        # train and save RL agents

        # Train new best-response policies.
        if n_processes > 1:
            logger.info("Begining training attacker and defender in parallel.")
            time_training = time.time()
            job_queue = multiprocessing.SimpleQueue()
            result_queue = multiprocessing.SimpleQueue()

            attacker_trainer = LearnerWorker(job_queue, result_queue, 1,
                                             mix_str_def, epoch)
            defender_trainer = LearnerWorker(job_queue, result_queue, 0,
                                             mix_str_att, epoch)

            attacker_trainer.start()
            defender_trainer.start()

            # Submit training jobs on our game.
            for _ in range(2):
                job_queue.put(CloudpickleWrapper(game))
            # Send sentinel values to tell processes to cleanly shutdown (1 per worker).
            for _ in range(2):
                job_queue.put(None)

            attacker_trainer.join()
            defender_trainer.join()

            # Collect the results. The workers may finish in either order, so
            # reorder them to put the defender (flag 0) first, then the attacker.
            results = []
            for _ in range(2):
                results += [result_queue.get()]
            if results[0][0]:  # The first result came from the attacker; swap.
                results = results[::-1]

            # Process results into expected variables for non-distributed.
            a_BD = results[1][1]
            d_BD = results[0][1]

            logger.info("Done training attacker and defender.")
            logger.info(f"Defender training report: \n{results[0][2]}")
            logger.info(f"Attacker training report: \n{results[1][2]}")
            time_training = time.time() - time_training

        else:
            logger.info("Begin training attacker......")
            time_train_attacker = time.time()
            a_BD, report = training.train(game, 1, mix_str_def, epoch, writer)
            time_train_attacker = time.time() - time_train_attacker
            logger.info(f"\n{report}")
            logger.info("Attacker training done......")

            logger.info("Begin training defender......")
            time_train_defender = time.time()
            d_BD, report = training.train(game, 0, mix_str_att, epoch, writer)
            time_train_defender = time.time() - time_train_defender
            logger.info(f"\n{report}")
            logger.info("Defender training done......")

        mem1 = proc.memory_info().rss

        game.att_BD_list.append(a_BD)
        game.def_BD_list.append(d_BD)

        mem2 = proc.memory_info().rss

        game.add_att_str("att_str_epoch" + str(epoch) + ".pkl")
        game.add_def_str("def_str_epoch" + str(epoch) + ".pkl")

        # simulate and extend the payoff matrix.
        time_extend_game = time.time()
        game = simulation.simulate_expanded_game(game=game,
                                                 n_processes=n_processes,
                                                 save_dir=epoch_dir,
                                                 summary_writer=writer)
        time_extend_game = time.time() - time_extend_game
        mem3 = proc.memory_info().rss

        # find nash equilibrium using gambit analysis
        time_gambit = time.time()
        payoffmatrix_def = game.payoffmatrix_def
        payoffmatrix_att = game.payoffmatrix_att
        logger.info("Begin Gambit analysis.")
        nash_att, nash_def = ga.do_gambit_analysis(payoffmatrix_def,
                                                   payoffmatrix_att)
        ga.add_new_NE(game, nash_att, nash_def, epoch)
        game.env.attacker.nn_att = None
        game.env.defender.nn_def = None
        fp.save_pkl(game, game_path)
        time_gambit = time.time() - time_gambit

        logger.info("RESULTS:")
        logger.info('  - a_BD_list: {}'.format(game.att_BD_list))
        logger.info('  - aPayoff: {}'.format(game.att_payoff))
        logger.info('  - d_BD_list: {}'.format(game.def_BD_list))
        logger.info('  - dPayoff: {}'.format(game.def_payoff))
        logger.info("MEM: {}, {}, {}.".format(
            (mem1 - mem0) / mem0, (mem2 - mem0) / mem0, (mem3 - mem0) / mem0))
        logger.info("TIME: ")
        if n_processes == 1:
            logger.info(f"  - Training attacker: {time_train_attacker}")
            logger.info(f"  - Training defender: {time_train_defender}")
        else:
            logger.info(f"  - Training: {time_training}")
        logger.info(f"  - Extend game: {time_extend_game}")
        logger.info(f"  - Gambit: {time_gambit}")
        logger.info("Round_" + str(epoch) + " has done and game was saved.")
        logger.info("=======================================================")

        count -= 1
        sys.stdout.flush()  # TODO: make sure this is correct.

    logger.info("END: " + str(epoch))
    os._exit(os.EX_OK)
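
The "extend the payoff matrix" step above grows the empirical game by one strategy per player each epoch. A toy numpy sketch of that growth pattern (illustrative only; the real extension happens inside simulation.simulate_expanded_game):

import numpy as np

# Each DO-EGTA epoch adds one new best response per player, so a payoff
# matrix grows by one row and one column (here 2x2 -> 3x3, zeros as stand-ins).
payoff = np.zeros((2, 2))
payoff = np.vstack([payoff, np.zeros((1, 2))])  # one new strategy (row)
payoff = np.hstack([payoff, np.zeros((3, 1))])  # one new strategy (column)
print(payoff.shape)  # (3, 3)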
Example #7
    def run(self, env, name, writer, **network_kwargs):
        """ Train a deepq model.

        :param env: Environment.
        :param name: Name of the training run, to save data separately.
        :param writer: SummaryWriter for logging metrics.
        """
        time_init = time.time()

        # Create the new agent that we are going to train to best respond.
        best_responder = self.policy_ctor()

        # Set-up experience replay buffer.
        replay_buffer = ReplayBuffer(self.buffer_size)
        assert not self.prioritized_replay, "Prioritized replay is not implemented in PyTorch recreation."

        # Create exploration schedule.
        exploration = LinearSchedule(schedule_timesteps=int(
            self.exploration_fraction * self.total_timesteps),
                                     initial_p=self.exploration_initial_eps,
                                     final_p=self.exploration_final_eps)

        # Set-up training variables.
        mean_rewards = []
        episode_rewards = [0.0]
        saved_mean_reward = None

        # Begin episode.
        obs = env.reset()
        reset = True

        # Establish temporary directory to hold checkpoints of our agent from throughout training.
        # We do this so we can return the best version of our agent throughout training.
        temp_dir = tempfile.TemporaryDirectory()
        best_model_path = osp.join(temp_dir.name, "model.pytorch")

        # Time metrics.
        time_init = time.time() - time_init
        t_transitions = []
        t_actions = []
        t_steps = []
        t_samples = []
        t_updates = []
        n_updates = 0.0

        # Environment training loop.
        time_training = time.time()
        for t in range(self.total_timesteps):
            time_transition = time.time()

            # Check termination conditions.
            if self.callback is not None and self.callback(
                    locals(), globals()):
                break

            # Collect meta-data agent may need to compute action.
            time_action = time.time()
            action_kwargs = {}

            # Update exploration strategy.
            if self.param_noise:
                update_eps = 0.0
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps=exploration.value(t).
                # See Appendix C.1 in `Parameter Space Noise for Exploration`, Plappert et al., 2017.
                update_param_noise_threshold = -1.0 * np.log(
                    1.0 - exploration.value(t) +
                    exploration.value(t) / float(env.action_space.n))
                action_kwargs["reset"] = reset
                action_kwargs[
                    "update_param_noise_threshold"] = update_param_noise_threshold
                action_kwargs["update_param_noise_scale"] = True

            else:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.0

            # Step agent.
            writer.add_scalar(f"{name}/epsilon", update_eps, t)
            action = best_responder.act(observation=np.array(obs)[None],
                                        stochastic=True,
                                        update_eps=update_eps,
                                        mask=None,
                                        training_attacker=False,
                                        **action_kwargs)[0]
            t_actions += [time.time() - time_action]

            # Step environment.
            time_step = time.time()
            new_obs, reward, done, _ = env.step(action)
            t_steps += [time.time() - time_step]

            # Store transition data.
            replay_buffer.add(obs, action, reward, new_obs, float(done))
            obs = new_obs
            episode_rewards[-1] += reward

            # If the environment finished, reset the environment and sample from opponent's meta-strategy.
            if done:
                obs = env.reset()
                # Log the environment reset.
                episode_rewards.append(0.0)
                reset = True

            # Periodically train our policy.
            if (t > self.learning_starts) and (t % self.train_freq == 0):
                n_updates += 1.0
                time_sample = time.time()
                # Collect batch (b) of experiences.
                b_o, b_a, b_r, b_op, b_d = replay_buffer.sample(
                    self.batch_size)
                b_weights = np.ones_like(b_r)

                t_samples += [time.time() - time_sample]

                time_update = time.time()
                best_responder.update(observations=b_o,
                                      actions=b_a,
                                      rewards=b_r,
                                      next_observations=b_op,
                                      done_mask=b_d,
                                      importance_weights=b_weights,
                                      summary_writer=writer,
                                      mask=None,
                                      training_attacker=False,
                                      t=t)
                t_updates += [time.time() - time_update]

            # Periodically update target network.
            if (t > self.learning_starts) and (
                    t % self.target_network_update_freq == 0):
                best_responder.update_target_network()

            # Record results.
            n_episodes = len(episode_rewards)
            if t > self.learning_starts:
                mean_100ep_reward = round(np.mean(episode_rewards[-251:-1]), 1)
                mean_rewards.append(mean_100ep_reward)
                writer.add_scalar(f"{name}/mean_reward",
                                  np.nan_to_num(mean_100ep_reward), t)

            # Periodically save a snapshot of our best-responder.
            if (self.checkpoint_freq
                    is not None) and (t > self.learning_starts) and (
                        n_episodes > 100) and (t % self.checkpoint_freq == 0):
                # Save checkpoints of only the best-performing model we have encountered.
                if (saved_mean_reward is None) or (mean_100ep_reward >
                                                   saved_mean_reward):
                    torch.save(best_responder,
                               best_model_path,
                               pickle_module=dill)
                    saved_mean_reward = mean_100ep_reward

            t_transitions += [time.time() - time_transition]

        # Load the best-performing encountered policy as our resulting best-responder.
        BD = None
        if osp.exists(best_model_path):
            best_responder = torch.load(best_model_path)
            BD = saved_mean_reward if saved_mean_reward is not None else mean_100ep_reward

        # Clean-up temporary directory.
        temp_dir.cleanup()

        # Save data to generate learning curves.
        data_path = osp.join(settings.get_run_dir(),
                             f"mean_rewards.{name}.pkl")
        fp.save_pkl(mean_rewards, data_path)

        # Log timing statistics.
        # We put this together into a string to send back to have the main process print.
        # This is to prevent potential multiprocessing errors.
        report = ""
        report += "  - n_transitions: {}\n".format(len(t_transitions))
        report += "  - n_updates: {}\n".format(len(t_updates))
        report += "  - t_init: {}\n".format(time_init)
        report += "  - t_transitions: {}\n".format(np.mean(t_transitions))
        report += "  - t_actions: {}\n".format(np.mean(t_actions))
        report += "  - t_steps: {}\n".format(np.mean(t_steps))
        report += "  - t_samples: {}\n".format(np.mean(t_samples))
        report += "  - t_updates: {}\n".format(np.mean(t_updates))

        return best_responder, BD, replay_buffer, report
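
The epsilon used above comes from exploration.value(t). A minimal re-implementation of that schedule, assuming baselines-style LinearSchedule semantics (linear interpolation over schedule_timesteps, then constant at final_p):

def linear_schedule_value(t, schedule_timesteps, initial_p, final_p):
    # Fraction of the schedule completed, clipped at 1.0 once it has elapsed.
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

# e.g. annealing epsilon from 1.0 down to 0.02 over the first 10,000 steps:
assert linear_schedule_value(0, 10_000, 1.0, 0.02) == 1.0
assert abs(linear_schedule_value(5_000, 10_000, 1.0, 0.02) - 0.51) < 1e-12
assert abs(linear_schedule_value(20_000, 10_000, 1.0, 0.02) - 0.02) < 1e-12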
Example #8
    def learn_multi_nets(self, env, epoch, writer, **network_kwargs):
        """ Train a deepq model.

        :param env: Environment.
        :param epoch: Current EGTA epoch. This is only used for saving results.
        :param writer: SummaryWriter for logging metrics.
        """
        time_init = time.time()
        # If the training flag is 1 we're training the attacker, or the defender if the flag is 0.
        training_attacker = env.training_flag
        assert training_attacker == 0 or training_attacker == 1, f"Invalid training flag: {training_attacker}."

        log_prefix = "attacker" if training_attacker else "defender"

        # Select parameters based off attacker/defender.
        n_actions = env.act_dim_att(
        ) if training_attacker else env.act_dim_def()
        observation_space = env.obs_dim_att(
        ) if training_attacker else env.obs_dim_def()

        # Create the new agent that we are going to train to best respond.
        best_responder = self.get_new_policy(locals_=locals(),
                                             globals_=globals())

        # Set-up experience replay buffer.
        replay_buffer = ReplayBuffer(self.buffer_size)
        assert not self.prioritized_replay, "Prioritized replay is not implemented in PyTorch recreation."

        # Create exploration schedule.
        exploration = LinearSchedule(schedule_timesteps=int(
            self.exploration_fraction * self.total_timesteps),
                                     initial_p=self.exploration_initial_eps,
                                     final_p=self.exploration_final_eps)

        # Set-up training variables.
        mean_rewards = []
        episode_rewards = [0.0]
        saved_mean_reward = None

        # Begin episode.
        obs = env.reset_everything_with_return()
        reset = True

        # Sample our initial opponent's strategy.
        opponent_sampler = OpponentSampler(
            env=env, opponent_identity=0 if training_attacker else 1)
        opponent_sampler.sample()

        # Establish temporary directory to hold checkpoints of our agent from throughout training.
        # We do this so we can return the best version of our agent throughout training.
        temp_dir = tempfile.TemporaryDirectory()
        best_model_path = osp.join(temp_dir.name, "model.pytorch")

        # Time metrics.
        time_init = time.time() - time_init
        t_transitions = []
        t_actions = []
        t_steps = []
        t_samples = []
        t_updates = []
        n_updates = 0.0

        # Reward Shaping
        temp_buffer = []

        # Environment training loop.
        time_training = time.time()
        for t in range(self.total_timesteps):
            time_transition = time.time()

            # Check termination conditions.
            if self.callback is not None and self.callback(
                    locals(), globals()):
                break

            # Collect meta-data agent may need to compute action.
            time_action = time.time()
            action_kwargs = {}

            # Update exploration strategy.
            if self.param_noise:
                update_eps = 0.0
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps=exploration.value(t).
                # See Appendix C.1 in `Parameter Space Noise for Exploration`, Plappert et al., 2017.
                update_param_noise_threshold = -1.0 * np.log(
                    1.0 - exploration.value(t) +
                    exploration.value(t) / float(env.action_space.n))
                action_kwargs["reset"] = reset
                action_kwargs[
                    "update_param_noise_threshold"] = update_param_noise_threshold
                action_kwargs["update_param_noise_scale"] = True

            else:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.0

            # If we are the attacker, apply a mask to our action space.
            if training_attacker:
                mask = mask_generator_att(env, np.array(obs)[None])
            else:
                mask = None

            # Step agent.
            writer.add_scalar(f"{log_prefix}/epsilon", update_eps, t)
            action = best_responder.act(observation=np.array(obs)[None],
                                        stochastic=True,
                                        update_eps=update_eps,
                                        mask=mask,
                                        training_attacker=training_attacker,
                                        **action_kwargs)[0]
            t_actions += [time.time() - time_action]

            # Step environment.
            time_step = time.time()
            new_obs, reward, done = env.step(action)
            t_steps += [time.time() - time_step]

            # Store transition data.
            # Reward shaping
            if self.reward_shaping:
                pass_flag = False
                if training_attacker == 0:
                    rewards_shaping = env.rewards()
                    if rewards_shaping['pass_flag']:
                        for transition in temp_buffer:
                            obs0, action0, rew0, new_obs0, done0 = transition
                            rew_new = rewards_shaping[str(action0)].v
                            episode_rewards[-1] += rew_new
                            replay_buffer.add(obs0, action0, rew_new, new_obs0,
                                              done0)
                        temp_buffer = []
                        env.reset_reward_shaping()
                        pass_flag = True
                elif training_attacker == 1:
                    rewards_shaping = env.rewards()
                    if rewards_shaping['pass_flag']:
                        for transition in temp_buffer:
                            obs1, action1, rew1, new_obs1, done1 = transition
                            rew_new = rewards_shaping[str(action1)].v
                            episode_rewards[-1] += rew_new
                            replay_buffer.add(obs1, action1, rew_new, new_obs1,
                                              done1)
                        temp_buffer = []
                        env.reset_reward_shaping()
                        pass_flag = True

                if pass_flag:
                    episode_rewards[-1] += reward
                    replay_buffer.add(obs, action, reward, new_obs,
                                      float(done))
                else:
                    temp_buffer.append(
                        (obs, action, reward, new_obs, float(done)))

                obs = new_obs

                if done:
                    obs = env.reset_everything_with_return()
                    episode_rewards.append(0.0)
                    reset = True
                    # Sample a new strategy from the meta-strategy solver.
                    opponent_sampler.sample()

            # No reward shaping.
            else:
                replay_buffer.add(obs, action, reward, new_obs, float(done))
                obs = new_obs
                episode_rewards[-1] += reward

                # If the environment finished, reset the environment and sample from opponent's meta-strategy.
                if done:
                    obs = env.reset_everything_with_return()
                    opponent_sampler.sample()

                    # Log the environment reset.
                    episode_rewards.append(0.0)
                    reset = True

            # Periodically train our policy.
            if (t > self.learning_starts) and (t % self.train_freq == 0):
                n_updates += 1.0
                time_sample = time.time()
                # Collect batch (b) of experiences.
                b_o, b_a, b_r, b_op, b_d = replay_buffer.sample(
                    self.batch_size)
                b_weights = np.ones_like(b_r)

                # Generate action masks.
                if training_attacker:
                    b_mask = mask_generator_att(env, b_op)
                else:
                    b_mask = None

                t_samples += [time.time() - time_sample]

                time_update = time.time()
                best_responder.update(observations=b_o,
                                      actions=b_a,
                                      rewards=b_r,
                                      next_observations=b_op,
                                      done_mask=b_d,
                                      importance_weights=b_weights,
                                      mask=b_mask,
                                      training_attacker=training_attacker,
                                      summary_writer=writer,
                                      t=t)
                t_updates += [time.time() - time_update]

            # Periodically update target network.
            if (t > self.learning_starts) and (
                    t % self.target_network_update_freq == 0):
                best_responder.update_target_network()

            # Record results.
            n_episodes = len(episode_rewards)
            if t > self.learning_starts:
                mean_100ep_reward = round(np.mean(episode_rewards[-251:-1]), 1)
                mean_rewards.append(mean_100ep_reward)
                writer.add_scalar(f"{log_prefix}/mean_reward",
                                  np.nan_to_num(mean_100ep_reward), t)

            # Periodically save a snapshot of our best-responder.
            if (self.checkpoint_freq
                    is not None) and (t > self.learning_starts) and (
                        n_episodes > 100) and (t % self.checkpoint_freq == 0):
                # Save checkpoints of only the best-performing model we have encountered.
                if (saved_mean_reward is None) or (mean_100ep_reward >
                                                   saved_mean_reward):
                    torch.save(best_responder,
                               best_model_path,
                               pickle_module=dill)
                    saved_mean_reward = mean_100ep_reward

            t_transitions += [time.time() - time_transition]

        # Load the best-performing encountered policy as our resulting best-responder.
        BD = None
        if osp.exists(best_model_path):
            best_responder = torch.load(best_model_path)
            BD = saved_mean_reward if saved_mean_reward is not None else mean_100ep_reward

        # Clean-up temporary directory.
        temp_dir.cleanup()

        # Save data to generate learning curves.
        name = "attacker" if training_attacker else "defender"
        data_path = osp.join(settings.get_run_dir(),
                             f"mean_rewards.{name}.{epoch}.pkl")
        fp.save_pkl(mean_rewards, data_path)

        # Log timing statistics.
        # We put this together into a string to send back to have the main process print.
        # This is to prevent potential multiprocessing errors.
        report = ""
        report += "  - n_transitions: {}\n".format(len(t_transitions))
        report += "  - n_updates: {}\n".format(len(t_updates))
        report += "  - t_init: {}\n".format(time_init)
        report += "  - t_transitions: {}\n".format(np.mean(t_transitions))
        report += "  - t_actions: {}\n".format(np.mean(t_actions))
        report += "  - t_steps: {}\n".format(np.mean(t_steps))
        report += "  - t_samples: {}\n".format(np.mean(t_samples))
        report += "  - t_updates: {}\n".format(np.mean(t_updates))

        return best_responder, BD, replay_buffer, report
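
A toy, self-contained illustration of the reward-shaping flush in the loop above: transitions are buffered in temp_buffer until the environment raises pass_flag, then each one is re-added with a shaped reward looked up by its action. The data and the Shaped wrapper are made up; the real env.rewards() entries just need a .v attribute:

from collections import namedtuple

Shaped = namedtuple("Shaped", "v")  # stand-in for the environment's reward objects

temp_buffer = [((0,), 3, 0.0, (1,), 0.0), ((1,), 7, 0.0, (2,), 1.0)]
rewards_shaping = {"pass_flag": True, "3": Shaped(0.5), "7": Shaped(-0.2)}

if rewards_shaping["pass_flag"]:
    for obs0, action0, _rew0, new_obs0, done0 in temp_buffer:
        rew_new = rewards_shaping[str(action0)].v
        print(obs0, action0, rew_new, new_obs0, done0)  # would be replay_buffer.add(...)
    temp_buffer = []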