Example No. 1
    def run(self):
        # Reparse the flags for this process.
        FLAGS = flags.FLAGS
        FLAGS(sys.argv)
        # Reload gin configurations for this process.
        gin_files = [
            osp.join(settings.SRC_DIR, "configs", f"{x}.gin")
            for x in FLAGS.config_files
        ]
        gin.parse_config_files_and_bindings(config_files=gin_files,
                                            bindings=FLAGS.config_overrides,
                                            skip_unknown=False)

        for job in iter(self.job_queue.get, None):
            game = job()
            writer = SummaryWriter(
                logdir=osp.join(settings.get_run_dir(), f"epoch_{self.epoch}"))

            best_deviation, report = training.train(
                game=game,
                identity=self.is_attacker,
                opponent_mix_str=self.opponent_mixed_strategy,
                epoch=self.epoch,
                writer=writer)

            self.result_queue.put((self.is_attacker, best_deviation, report))
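
The worker above pulls jobs with iter(self.job_queue.get, None), which stops once a None sentinel arrives on the queue. A minimal, self-contained sketch of that driver/worker protocol, using illustrative names rather than the repository's classes:

import multiprocessing


def _square_worker(job_queue, result_queue):
    # iter(queue.get, None) yields jobs until the None sentinel is received.
    for job in iter(job_queue.get, None):
        result_queue.put(job * job)


if __name__ == "__main__":
    job_queue = multiprocessing.SimpleQueue()
    result_queue = multiprocessing.SimpleQueue()
    worker = multiprocessing.Process(target=_square_worker,
                                     args=(job_queue, result_queue))
    worker.start()
    for job in [1, 2, 3]:
        job_queue.put(job)
    job_queue.put(None)  # Sentinel: tell the worker to shut down cleanly.
    results = [result_queue.get() for _ in range(3)]
    worker.join()
    print(results)  # [1, 4, 9]
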
def _get_gambit_nash_path():
    """ Get path to Gambit's payoff matrix.

    :return: Filepath.
    :rtype: str
    """
    gambit_path = osp.join(settings.get_run_dir(), "nash.txt")
    return gambit_path
Example No. 3
def test_dqn_cartpole():
    """ Test DQN.

    References:
     - https://github.com/google/dopamine/blob/master/dopamine/agents/dqn/configs/dqn_cartpole.gin
    """
    from attackgraph.rl.dqn import DQN
    import wandb

    flags.DEFINE_string("run_name", "test_dqn_cartpole", "")
    FLAGS = flags.FLAGS
    FLAGS(sys.argv)

    wandb.init(
        project="test_dqn_cartpole",
        dir=settings.get_run_dir(),
        resume=False)

    def _policy_factory(*args, **kwargs):
        """ Generate new policy. """
        return DQN(
            is_attacker=True,
            state_dim=4,
            hidden_sizes=[8, 4],
            action_dim=2,
            lr=0.0001)

    env = gym.make("CartPole-v0")
    env.seed(0)
    env = EnvToAttackGraph(env)

    trainer = Learner(
        seed=0,
        # Policy.
        get_new_policy=_policy_factory,
        exploration_fraction=0.4,
        exploration_final_eps=0.01,
        # Time.
        total_timesteps=400000,
        learning_starts=500,
        train_freq=4,
        target_network_update_freq=10000,
        gamma=0.9,
        # Replay buffer.
        batch_size=512,
        buffer_size=1000000)
    trainer.learn_multi_nets(env, epoch=0)
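
For context on the flags.DEFINE_string / FLAGS(sys.argv) pattern used above, a minimal absl.flags round trip looks like this (the flag names and values are only illustrative):

from absl import flags

flags.DEFINE_string("run_name", "default_run", "Name of this run.")
flags.DEFINE_list("config_files", [], "Gin configuration files to load.")

FLAGS = flags.FLAGS
# Passing an argv-style list parses the flags; the first entry is the program name.
FLAGS(["program", "--run_name=my_test_run", "--config_files=dqn,env"])
print(FLAGS.run_name)      # my_test_run
print(FLAGS.config_files)  # ['dqn', 'env']
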
Example No. 4
    def run(self):
        # Because we are "spawning" the process instead of "forking" the process, we need to
        # reimport the run's configurations.
        # Reparse the flags for this process.
        FLAGS = flags.FLAGS
        FLAGS(sys.argv)
        # Reload gin configurations for this process.
        gin_files = [
            osp.join(settings.SRC_DIR, "configs", f"{x}.gin")
            for x in FLAGS.config_files
        ]
        gin.parse_config_files_and_bindings(config_files=gin_files,
                                            bindings=FLAGS.config_overrides,
                                            skip_unknown=False)

        policy_name = "attacker" if self.train_attacker else "defender"

        for job in iter(self.job_queue.get, None):
            # The game we're given has no policies and has not been initialized.
            game, opponent = job
            game = game()  # Unpickle game.

            # Register the opponent we will be playing as the opponent's only policy.
            if self.train_attacker:
                game.add_def_str(opponent)
            else:
                game.add_att_str(opponent)

            # The opponent sampling is done from the result directory, so we need
            # to copy any model we use into the policy set.
            if self.train_attacker:
                opponent_dir = settings.get_defender_strategy_dir()
            else:
                opponent_dir = settings.get_attacker_strategy_dir()
            new_filepath = osp.join(opponent_dir, osp.basename(opponent))
            shutil.copyfile(src=opponent, dst=new_filepath)

            save_path = osp.join(settings.get_run_dir(),
                                 osp.basename(opponent))
            save_path = save_path[:-4]  # Remove ".pkl".
            training.train(game=game,
                           identity=int(self.train_attacker),
                           opponent_mix_str=np.array([1.0]),
                           epoch=osp.basename(opponent),
                           writer=SummaryWriter(logdir=save_path),
                           save_path=osp.join(save_path, f"{policy_name}.pkl"))
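
The gin reparse at the top of these workers mirrors what the main process does: configuration files plus string bindings are applied to gin-configurable functions. A minimal sketch assuming only that gin-config is installed (make_optimizer is illustrative, not part of the repository):

import gin


@gin.configurable
def make_optimizer(lr=gin.REQUIRED, momentum=0.9):
    return {"lr": lr, "momentum": momentum}


# Bindings play the same role as FLAGS.config_overrides above; config_files
# would normally list the .gin files assembled from FLAGS.config_files.
gin.parse_config_files_and_bindings(config_files=[],
                                    bindings=["make_optimizer.lr = 1e-4"],
                                    skip_unknown=False)
print(make_optimizer())  # {'lr': 0.0001, 'momentum': 0.9}
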
Example No. 5
def _train(policy_save_path, opponent, writer):
    env = GridWorldSoccer()
    env = MultiToSingleAgentWrapper(env=env,
                                    agent_id=1,
                                    opponents={2: opponent})

    save_path = osp.join(settings.get_run_dir(),
                         osp.basename(policy_save_path))
    save_path = save_path[:-4]  # Remove ".pkl".

    trainer = Trainer(policy_ctor=DQN)
    best_response, _, replay_buffer, _ = trainer.run(
        env=env, name=osp.basename(policy_save_path), writer=writer)

    # Save data to results folder for QMixture.
    torch.save(best_response, f"{save_path}.pkl", pickle_module=dill)
    fp.save_pkl(replay_buffer, f"{save_path}.replay_buffer.pkl")

    return best_response, replay_buffer
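
torch.save(..., pickle_module=dill) above serializes the full policy object rather than just its weights, and fp.save_pkl is a repository helper; a plain dill round trip of auxiliary data looks like this (the path and contents are illustrative):

import dill

replay_data = {"transitions": [(0, 1, 0.5, 1, False)]}
with open("/tmp/replay_buffer.pkl", "wb") as outfile:
    dill.dump(replay_data, outfile)
with open("/tmp/replay_buffer.pkl", "rb") as infile:
    restored = dill.load(infile)
assert restored == replay_data
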
def _train_classifier(classifier, buffer_paths, mixture, env,
                      test_split: float, training_attacker: bool):
    """ Train an opponent classifier. """
    # Load all the replay buffers and merge/split them.
    logger.info(f"Loading replay buffers from: ")
    labels = []
    replay_buffers = []
    for buffer_i, path in enumerate(buffer_paths):
        logger.info(f"  - {path}")
        replay_buffers += [fp.load_pkl(path)]
        labels += [np.ones([len(replay_buffers[-1])]) * buffer_i]
    replay_buffer = merge_replay_buffers(replay_buffers)
    # We only want the state.
    replay_buffer = [x[0] for x in replay_buffer._storage]
    replay_buffer = np.array(replay_buffer)
    labels = np.ravel(labels)

    assert replay_buffer.shape[0] == labels.shape[0]

    # Shuffle the data.
    new_indices = np.random.permutation(len(labels))
    replay_buffer = replay_buffer[new_indices]
    labels = labels[new_indices]

    # Train/test split.
    n_test_data = int(len(labels) * test_split)

    # Train the opponent classifier.
    classifier = supervised_learning(net=classifier,
                                     train_X=replay_buffer[:-n_test_data],
                                     train_Y=labels[:-n_test_data],
                                     test_X=replay_buffer[-n_test_data:],
                                     test_Y=labels[-n_test_data:],
                                     criterion=gin.REQUIRED,
                                     n_epochs=gin.REQUIRED,
                                     eval_freq=gin.REQUIRED,
                                     batch_size=gin.REQUIRED,
                                     log_dir=settings.get_run_dir())
    return classifier
Example No. 7
def main(argv):
    """ Run evaluation script.

    :param argv: Command line arguments.
    """
    # Configure information displayed to terminal.
    np.set_printoptions(precision=2)
    warnings.filterwarnings("ignore")

    # Set-up the result directory.
    run_dir = settings.get_run_dir()
    if osp.exists(run_dir):
        print("Cannot resume previously saved run, overwriting data.")
    else:
        os.mkdir(run_dir)

    # Set-up logging.
    logger = logging.getLogger("attackgraph")
    logger.setLevel(logging.INFO)
    logger.propagate = False
    logger.handlers = []  # absl has a default handler that we need to remove.
    formatter = logging.Formatter(
        "%(asctime)s %(name)s %(levelname)s %(message)s")
    # Log to terminal.
    terminal_handler = logging.StreamHandler()
    terminal_handler.setFormatter(formatter)
    # Log to file.
    file_handler = logging.FileHandler(osp.join(run_dir, "out.log"))
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    # Debug output.
    debug_handler = logging.FileHandler(osp.join(run_dir, "debug.log"))
    debug_handler.setLevel(logging.DEBUG)
    debug_handler.setFormatter(formatter)
    # Register handlers.
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)
    logger.addHandler(debug_handler)

    logger.info(f"Saving results to: {run_dir}")

    # Set-up gin configuration.
    gin_files = [
        osp.join(settings.SRC_DIR, "configs", f"{x}.gin")
        for x in FLAGS.config_files
    ]
    gin.parse_config_files_and_bindings(config_files=gin_files,
                                        bindings=FLAGS.config_overrides,
                                        skip_unknown=False)

    # Save program flags.
    with open(osp.join(run_dir, "flags.txt"), "w") as flag_file:
        # We want only flags relevant to this module to be saved, no extra flags.
        # See: https://github.com/abseil/abseil-py/issues/92
        key_flags = FLAGS.get_key_flags_for_module(argv[0])
        key_flags = "\n".join(flag.serialize() for flag in key_flags)
        flag_file.write(key_flags)
    with open(osp.join(run_dir, "config.txt"), "w") as config_file:
        config_file.write(gin.config_str())

    # Properly restrict pytorch to not consume extra resources.
    #  - https://github.com/pytorch/pytorch/issues/975
    #  - https://github.com/ray-project/ray/issues/3609
    torch.set_num_threads(1)
    os.environ["OMP_NUM_THREADS"] = "1"

    evaluate_qmix([
        player2_policies.Player2v0(),
        player2_policies.Player2v1(),
        player2_policies.Player2v2(),
        player2_policies.Player2v3(),
        player2_policies.Player2v4()
    ])
Example No. 8
def evaluate_qmix(opponents: typing.List, mixture: typing.List):
    """ . """
    assert len(opponents) == len(mixture)
    name = "player1"
    env = GridWorldSoccer()

    # -------------------------------------------------------------------------
    # Train best-response to each pure-strategy opponent.
    logger.info("Training best-response against each pure-strategy.")
    best_responses = []
    replay_buffers = []
    best_response_paths = []
    for opponent_i, opponent in enumerate(opponents):
        logger.info(f"  - Training against opponent {opponent_i}")
        br_path = osp.join(settings.get_run_dir(),
                           f"v{opponent_i}.best_response.pkl")
        best_response_paths += [br_path]
        with gin.config_scope("pure"):
            response, replay_buffer = _train(
                br_path, opponent,
                SummaryWriter(logdir=osp.join(settings.get_run_dir(),
                                              f"br_vs_{opponent_i}")))
        best_responses += [response]
        replay_buffers += [replay_buffer]

    # -------------------------------------------------------------------------
    # Simulate the performance of QMixture.
    logger.info("Simulating the performance of the QMixture.")
    qmix = QMixture(mixture=mixture, q_funcs=best_responses)

    # Save policy, for future evaluation.
    qmix_path = osp.join(settings.get_run_dir(), "qmix.pkl")
    torch.save(qmix, qmix_path, pickle_module=dill)

    qmix_rewards = []
    mixed_reward = 0.0
    reward_std = 0.0
    for opponent_i, opponent in enumerate(opponents):
        rewards, _ = simulate_profile(env=env,
                                      nn_att=qmix,
                                      nn_def=opponent,
                                      n_episodes=250,
                                      save_dir=None,
                                      summary_writer=None,
                                      raw_rewards=True)

        logger.info(
            f"  - Opponent {opponent_i} vs. QMix: {np.mean(rewards)}, {np.std(rewards)}"
        )
        qmix_rewards += [rewards]
        mixed_reward += mixture[opponent_i] * np.mean(rewards)
        reward_std += mixture[opponent_i]**2 * np.std(rewards)**2
    reward_std = np.sqrt(reward_std)
    logger.info(
        f"Expected reward against mixture opponent: {mixed_reward}, {reward_std}"
    )
    dill.dump(
        mixed_reward,
        open(osp.join(settings.get_run_dir(), "qmix.simulated_reward.pkl"),
             "wb"))

    # -------------------------------------------------------------------------
    # Simulate the performance of QMixture with state frequencies.
    """
    logger.info("Simulating the performance of the QMixture with State-Frequency weighting.")
    qmix_statefreq = QMixtureStateFreq(mixture=mixture, q_funcs=best_responses, replay_buffers=replay_buffers)

    # Save policy, for future evaluation.
    qmix_statefreq_path = osp.join(settings.get_run_dir(), "qmix_statefreq.pkl")
    torch.save(qmix_statefreq, qmix_statefreq_path, pickle_module=dill)

    qmix_statefreq_rewards = []
    mixed_statefreq_reward = 0.0
    for opponent_i, opponent in enumerate(opponents):
        rewards, _ = simulate_profile(
            env=env,
            nn_att=qmix_statefreq,
            nn_def=opponent,
            n_episodes=250,
            save_dir=None,
            summary_writer=SummaryWriter(logdir=osp.join(settings.get_run_dir(), f"simulate_statefreq_vs_{opponent_i}")),
            raw_rewards=True)

        logger.info(f"  - Opponent {opponent_i}: {np.mean(rewards)}, {np.std(rewards)}")
        with open(osp.join(settings.get_run_dir(), f"qmix_statefreq.rewards_v{opponent_i}.pkl"), "wb") as outfile:
            dill.dump(rewards, outfile)
        qmix_statefreq_rewards += [rewards]
        mixed_statefreq_reward += mixture[opponent_i] * np.mean(rewards)
    logger.info(f"Expected reward against mixture opponent: {mixed_statefreq_reward}")
    dill.dump(mixed_reward, open(osp.join(settings.get_run_dir(), "qmix_statefreq.simulated_reward.pkl"), "wb"))
    """
    # -------------------------------------------------------------------------
    # Train best-response to opponent mixture.
    logger.info("Training a best-response against the mixture opponent.")
    mixture_br_path = osp.join(settings.get_run_dir(),
                               "mixture.best_response.pkl")
    opponent_agent = Agent(mixture=mixture, policies=opponents)

    with gin.config_scope("mix"):
        mixture_br, _ = _train(
            mixture_br_path, opponent_agent,
            SummaryWriter(
                logdir=osp.join(settings.get_run_dir(), "br_vs_mixture")))

    # -------------------------------------------------------------------------
    # Evaluate the mixture policy against the individual opponent strategies.
    logger.info(
        "Evaluating the best-response trained against mixture opponents on pure-strategy opponents."
    )

    mix_br_reward = 0.0
    reward_std = 0.0
    for opponent_i, opponent in enumerate(opponents):
        rewards, _ = simulate_profile(env=env,
                                      nn_att=mixture_br,
                                      nn_def=opponent,
                                      n_episodes=250,
                                      save_dir=None,
                                      summary_writer=None,
                                      raw_rewards=True)

        logger.info(
            f"  - Opponent {opponent_i} vs. MixtureBR: {np.mean(rewards)}, {np.std(rewards)}"
        )
        mix_br_reward += mixture[opponent_i] * np.mean(rewards)
        reward_std += mixture[opponent_i]**2 * np.std(rewards)**2
    reward_std = np.sqrt(reward_std)
    logger.info(
        f"Expected reward for mixture best-response: {mix_br_reward}, {reward_std}"
    )

    # -------------------------------------------------------------------------
    # Evaluate pure-strategy-best-response policies against all opponents (all pure strategy + mixture).
    logger.info(
        "Evaluating pure-strategy-best-response against all opponent policies."
    )

    response_rewards = {}
    response_std = {}
    for opponent_i, opponent in enumerate(opponents):
        for response_i, best_response in enumerate(best_responses):
            rewards, _ = simulate_profile(env=env,
                                          nn_att=best_response,
                                          nn_def=opponent,
                                          n_episodes=250,
                                          save_dir=None,
                                          summary_writer=None,
                                          raw_rewards=True)

            logger.info(
                f"  - Opponent {opponent_i} vs. Best-Response {response_i}: {np.mean(rewards)}, {np.std(rewards)}"
            )
            if response_i not in response_rewards:
                response_rewards[response_i] = 0.0
                response_std[response_i] = 0.0
            response_rewards[response_i] += mixture[opponent_i] * np.mean(
                rewards)
            response_std[response_i] += mixture[opponent_i]**2 * np.std(
                rewards)**2

    for key, value in response_rewards.items():
        logger.info(
            f"Expected reward of response {key} against mixture: {value}, {np.sqrt(response_std[key])}"
        )
    logger.info("Finished.")
Example No. 9
def initialize(load_env=None, env_name=None, n_processes: int = 1):
    logger.info("=======================================================")
    logger.info("=======Begin Initialization and first epoch============")
    logger.info("=======================================================")

    # Create Environment
    if isinstance(load_env, str):
        path = osp.join(settings.get_env_data_dir(), "{}.pkl".format(load_env))
        if not fp.isExist(path):
            raise ValueError("The env being loaded does not exist.")
        env = fp.load_pkl(path)
    else:
        # env is created and saved.
        env = dag.env_rand_gen_and_save(env_name)

    # save graph copy
    env.save_graph_copy()
    env.save_mask_copy()  # TODO: change transfer

    # create players and point to their env
    env.create_players()
    env.create_action_space()

    # Log the root nodes and OR edges.
    roots = env.get_Roots()
    logger.info(f"Root Nodes: {roots}")
    ed = env.get_ORedges()
    logger.info(f"Or edges: {ed}")

    # initialize game data
    game = empirical_game.EmpiricalGame(env)
    game.env.defender.set_env_belong_to(game.env)
    game.env.attacker.set_env_belong_to(game.env)

    # NOTE: these assignments appear redundant with the game.env assignments above.
    env.defender.set_env_belong_to(env)
    env.attacker.set_env_belong_to(env)

    # uniform strategy has been produced ahead of time
    logger.info("Epoch 1")
    epoch = 1
    epoch_dir = osp.join(settings.get_results_dir(), f"epoch_{epoch}")
    writer = SummaryWriter(logdir=epoch_dir)

    act_att = 'att_str_epoch1.pkl'
    act_def = 'def_str_epoch1.pkl'

    game.add_att_str(act_att)
    game.add_def_str(act_def)

    logger.info('Begin simulation for uniform strategy.')
    aReward, dReward = simulation.simulate_profile(
        env=game.env,
        game=game,
        nn_att=act_att,
        nn_def=act_def,
        n_episodes=game.num_episodes,
        n_processes=n_processes,
        save_dir=epoch_dir,
        summary_writer=writer)
    logger.info('Done simulation for uniform strategy.')

    game.init_payoffmatrix(dReward, aReward)
    ne = {}
    ne[0] = np.array([1], dtype=np.float32)
    ne[1] = np.array([1], dtype=np.float32)
    game.add_nasheq(epoch, ne)

    # save a copy of game data
    game_path = osp.join(settings.get_run_dir(), "game.pkl")
    fp.save_pkl(game, game_path)

    sys.stdout.flush()
    return game
Example No. 10
def _run(env,
         game,
         meta_method_name,
         epoch: int = 1,
         game_path: str = None,
         n_processes: int = 1):
    assert n_processes > 0, "Invalid number of processors."
    if game_path is None:
        game_path = osp.join(settings.get_run_dir(), "game.pkl")

    logger.info("=======================================================")
    logger.info("===============Begin Running DO-EGTA===================")
    logger.info("=======================================================")

    proc = psutil.Process(os.getpid())
    result_dir = settings.get_run_dir()

    selector = meta_method_selector(meta_method_name)

    count = 80
    while count != 0:
        mem0 = proc.memory_info().rss

        # Fix opponent strategy.
        mix_str_def, mix_str_att = selector.sample(game, epoch)

        # Save mixed strategies.
        # with open(osp.join(result_dir, f"mix_defender.{epoch}.pkl"), "wb") as outfile:
        #     pickle.dump(mix_str_def, outfile)
        # with open(osp.join(result_dir, f"mix_attacker.{epoch}.pkl"), "wb") as outfile:
        #     pickle.dump(mix_str_att, outfile)
        # with open(osp.join(result_dir, f"payoff_defender.{epoch}.pkl"), "wb") as outfile:
        #     pickle.dump(game.payoffmatrix_def, outfile)
        # with open(osp.join(result_dir, f"payoff_attacker.{epoch}.pkl"), "wb") as outfile:
        #     pickle.dump(game.payoffmatrix_att, outfile)

        # Equilibrium pay-off.
        aPayoff, dPayoff = util.payoff_mixed_NE(game, epoch)
        game.att_payoff.append(aPayoff)
        game.def_payoff.append(dPayoff)

        # increase epoch
        epoch += 1
        logger.info("Epoch " + str(epoch))
        epoch_dir = osp.join(result_dir, f"epoch_{epoch}")

        # Summary writer for each epoch.
        writer = SummaryWriter(logdir=epoch_dir)

        # train and save RL agents

        # Train new best-response policies.
        if n_processes > 1:
            logger.info("Begining training attacker and defender in parallel.")
            time_training = time.time()
            job_queue = multiprocessing.SimpleQueue()
            result_queue = multiprocessing.SimpleQueue()

            attacker_trainer = LearnerWorker(job_queue, result_queue, 1,
                                             mix_str_def, epoch)
            defender_trainer = LearnerWorker(job_queue, result_queue, 0,
                                             mix_str_att, epoch)

            attacker_trainer.start()
            defender_trainer.start()

            # Submit training jobs on our game.
            for _ in range(2):
                job_queue.put(CloudpickleWrapper(game))
            # Send sentinel values to tell processes to cleanly shutdown (1 per worker).
            for _ in range(2):
                job_queue.put(None)

            attacker_trainer.join()
            defender_trainer.join()

            # Collect and report results. We need to sort the results because they may appear in any order.
            results = []
            for _ in range(2):
                results += [result_queue.get()]
            # Put the defender's result first, then the attacker's.
            results = results if not results[0][0] else results[::-1]

            # Process results into expected variables for non-distributed.
            a_BD = results[1][1]
            d_BD = results[0][1]

            logger.info("Done training attacker and defender.")
            logger.info(f"Defender training report: \n{results[0][2]}")
            logger.info(f"Attacker training report: \n{results[1][2]}")
            time_training = time.time() - time_training

        else:
            logger.info("Begin training attacker......")
            time_train_attacker = time.time()
            a_BD, report = training.train(game, 1, mix_str_def, epoch, writer)
            time_train_attacker = time.time() - time_train_attacker
            logger.info(f"\n{report}")
            logger.info("Attacker training done......")

            logger.info("Begin training defender......")
            time_train_defender = time.time()
            d_BD, report = training.train(game, 0, mix_str_att, epoch, writer)
            time_train_defender = time.time() - time_train_defender
            logger.info(f"\n{report}")
            logger.info("Defender training done......")

        mem1 = proc.memory_info().rss

        game.att_BD_list.append(a_BD)
        game.def_BD_list.append(d_BD)

        mem2 = proc.memory_info().rss

        game.add_att_str("att_str_epoch" + str(epoch) + ".pkl")
        game.add_def_str("def_str_epoch" + str(epoch) + ".pkl")

        # simulate and extend the payoff matrix.
        time_extend_game = time.time()
        game = simulation.simulate_expanded_game(game=game,
                                                 n_processes=n_processes,
                                                 save_dir=epoch_dir,
                                                 summary_writer=writer)
        time_extend_game = time.time() - time_extend_game
        mem3 = proc.memory_info().rss

        # find nash equilibrium using gambit analysis
        time_gambit = time.time()
        payoffmatrix_def = game.payoffmatrix_def
        payoffmatrix_att = game.payoffmatrix_att
        logger.info("Begin Gambit analysis.")
        nash_att, nash_def = ga.do_gambit_analysis(payoffmatrix_def,
                                                   payoffmatrix_att)
        ga.add_new_NE(game, nash_att, nash_def, epoch)
        game.env.attacker.nn_att = None
        game.env.defender.nn_def = None
        fp.save_pkl(game, game_path)
        time_gambit = time.time() - time_gambit

        logger.info("RESULTS:")
        logger.info('  - a_BD_list: {}'.format(game.att_BD_list))
        logger.info('  - aPayoff: {}'.format(game.att_payoff))
        logger.info('  - d_BD_list: {}'.format(game.def_BD_list))
        logger.info('  - dPayoff: {}'.format(game.def_payoff))
        logger.info("MEM: {}, {}, {}.".format(
            (mem1 - mem0) / mem0, (mem2 - mem0) / mem0, (mem3 - mem0) / mem0))
        logger.info("TIME: ")
        if n_processes == 1:
            logger.info(f"  - Training attacker: {time_train_attacker}")
            logger.info(f"  - Training defender: {time_train_defender}")
        else:
            logger.info(f"  - Training: {time_training}")
        logger.info(f"  - Extend game: {time_extend_game}")
        logger.info(f"  - Gambit: {time_gambit}")
        logger.info("Round_" + str(epoch) + " has done and game was saved.")
        logger.info("=======================================================")

        count -= 1
        sys.stdout.flush()  # TODO: make sure this is correct.

    logger.info("END: " + str(epoch))
    os._exit(os.EX_OK)
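
A note on the result-ordering expression in the parallel branch above: each worker returns a tuple whose first element is is_attacker, and the list is reversed whenever the attacker's tuple happened to come off the queue first, so the defender's result always ends up at index 0. A tiny sketch with placeholder values:

# Placeholder results in the worker's format: (is_attacker, best_deviation, report).
results = [(True, 12.5, "attacker report"), (False, 8.0, "defender report")]
results = results if not results[0][0] else results[::-1]
assert results[0][0] is False  # Defender first.
assert results[1][0] is True   # Attacker second.
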
Example No. 11
    def run(self, env, name, writer, **network_kwargs):
        """ Train a deepq model.

        :param env: Environment.
        :param name: Name of the training run, used to save data separately.
        :param writer: SummaryWriter for logging metrics.
        """
        time_init = time.time()

        # Create the new agent that we are going to train to best respond.
        best_responder = self.policy_ctor()

        # Set-up experience replay buffer.
        replay_buffer = ReplayBuffer(self.buffer_size)
        assert not self.prioritized_replay, "Prioritized replay is not implemented in this PyTorch recreation."

        # Create exploration schedule.
        exploration = LinearSchedule(schedule_timesteps=int(
            self.exploration_fraction * self.total_timesteps),
                                     initial_p=self.exploration_initial_eps,
                                     final_p=self.exploration_final_eps)

        # Set-up training variables.
        mean_rewards = []
        episode_rewards = [0.0]
        saved_mean_reward = None

        # Begin episode.
        obs = env.reset()
        reset = True

        # Establish temporary directory to hold checkpoints of our agent from throughout training.
        # We do this so we can return the best version of our agent throughout training.
        temp_dir = tempfile.TemporaryDirectory()
        best_model_path = osp.join(temp_dir.name, "model.pytorch")

        # Time metrics.
        time_init = time.time() - time_init
        t_transitions = []
        t_actions = []
        t_steps = []
        t_samples = []
        t_updates = []
        n_updates = 0.0

        # Environment training loop.
        time_training = time.time()
        for t in range(self.total_timesteps):
            time_transition = time.time()

            # Check termination conditions.
            if self.callback is not None and self.callback(
                    locals(), globals()):
                break

            # Collect meta-data agent may need to compute action.
            time_action = time.time()
            action_kwargs = {}

            # Update exploration strategy.
            if self.param_noise:
                update_eps = 0.0
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps=exploration.value(t).
                # See Appendix C.1 in `Parameter Space Noise for Exploration`, Plappert et al., 2017.
                update_param_noise_threshold = -1.0 * np.log(
                    1.0 - exploration.value(t) +
                    exploration.value(t) / float(env.action_space.n))
                action_kwargs["reset"] = reset
                action_kwargs[
                    "update_param_noise_threshold"] = update_param_noise_threshold
                action_kwargs["update_param_noise_scale"] = True

            else:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.0

            # Step agent.
            writer.add_scalar(f"{name}/epsilon", update_eps, t)
            action = best_responder.act(observation=np.array(obs)[None],
                                        stochastic=True,
                                        update_eps=update_eps,
                                        mask=None,
                                        training_attacker=False,
                                        **action_kwargs)[0]
            t_actions += [time.time() - time_action]

            # Step environment.
            time_step = time.time()
            new_obs, reward, done, _ = env.step(action)
            t_steps += [time.time() - time_step]

            # Store transition data.
            replay_buffer.add(obs, action, reward, new_obs, float(done))
            obs = new_obs
            episode_rewards[-1] += reward

            # If the environment finished, reset the environment and sample from opponent's meta-strategy.
            if done:
                obs = env.reset()
                # Log the environment reset.
                episode_rewards.append(0.0)
                reset = True

            # Periodically train our policy.
            if (t > self.learning_starts) and (t % self.train_freq == 0):
                n_updates += 1.0
                time_sample = time.time()
                # Collect batch (b) of experiences.
                b_o, b_a, b_r, b_op, b_d = replay_buffer.sample(
                    self.batch_size)
                b_weights = np.ones_like(b_r)

                t_samples += [time.time() - time_sample]

                time_update = time.time()
                best_responder.update(observations=b_o,
                                      actions=b_a,
                                      rewards=b_r,
                                      next_observations=b_op,
                                      done_mask=b_d,
                                      importance_weights=b_weights,
                                      summary_writer=writer,
                                      mask=None,
                                      training_attacker=False,
                                      t=t)
                t_updates += [time.time() - time_update]

            # Periodically update target network.
            if (t > self.learning_starts) and (
                    t % self.target_network_update_freq == 0):
                best_responder.update_target_network()

            # Record results.
            n_episodes = len(episode_rewards)
            if t > self.learning_starts:
                # Note: despite the name, this averages up to the last 250 completed episodes.
                mean_100ep_reward = round(np.mean(episode_rewards[-251:-1]), 1)
                mean_rewards.append(mean_100ep_reward)
                writer.add_scalar(f"{name}/mean_reward",
                                  np.nan_to_num(mean_100ep_reward), t)

            # Periodically save a snapshot of our best-responder.
            if (self.checkpoint_freq
                    is not None) and (t > self.learning_starts) and (
                        n_episodes > 100) and (t % self.checkpoint_freq == 0):
                # Save checkpoints of only the best-performing model we have encountered.
                if (saved_mean_reward is None) or (mean_100ep_reward >
                                                   saved_mean_reward):
                    torch.save(best_responder,
                               best_model_path,
                               pickle_module=dill)
                    saved_mean_reward = mean_100ep_reward

            t_transitions += [time.time() - time_transition]

        # Load the best-performing encountered policy as our resulting best-responder.
        BD = None
        if osp.exists(best_model_path):
            best_responder = torch.load(best_model_path)
            BD = saved_mean_reward if saved_mean_reward is not None else mean_100ep_reward

        # Clean-up temporary directory.
        temp_dir.cleanup()

        # Save data to generate learning curves.
        data_path = osp.join(settings.get_run_dir(),
                             f"mean_rewards.{name}.pkl")
        fp.save_pkl(mean_rewards, data_path)

        # Log timing statistics.
        # We assemble these into a string that is returned for the main process to print,
        # which avoids potential multiprocessing logging issues.
        report = ""
        report += "  - n_transitions: {}\n".format(len(t_transitions))
        report += "  - n_updates: {}\n".format(len(t_updates))
        report += "  - t_init: {}\n".format(time_init)
        report += "  - t_transitions: {}\n".format(np.mean(t_transitions))
        report += "  - t_actions: {}\n".format(np.mean(t_actions))
        report += "  - t_steps: {}\n".format(np.mean(t_steps))
        report += "  - t_samples: {}\n".format(np.mean(t_samples))
        report += "  - t_updates: {}\n".format(np.mean(t_updates))

        return best_responder, BD, replay_buffer, report
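
The exploration schedule above linearly anneals epsilon from exploration_initial_eps to exploration_final_eps over exploration_fraction * total_timesteps steps. A stand-in sketch of that behavior (the repository's LinearSchedule may differ in details):

class LinearScheduleSketch:
    """ Linearly interpolate from initial_p to final_p over schedule_timesteps. """

    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)


exploration = LinearScheduleSketch(schedule_timesteps=100, final_p=0.01)
print(exploration.value(0))              # 1.0
print(round(exploration.value(50), 3))   # 0.505
print(round(exploration.value(500), 3))  # 0.01 (clamped once the schedule ends)
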
Example No. 12
    def learn_multi_nets(self, env, epoch, writer, **network_kwargs):
        """ Train a deepq model.

        :param env: Environment.
        :param epoch: Current EGTA epoch. This is only used for saving results.
        :param writer: SummaryWriter for logging metrics.
        """
        time_init = time.time()
        # If the training flag is 1 we're training the attacker, or the defender if the flag is 0.
        training_attacker = env.training_flag
        assert training_attacker == 0 or training_attacker == 1, f"Invalid training flag: {training_attacker}."

        log_prefix = "attacker" if training_attacker else "defender"

        # Select parameters based off attacker/defender.
        n_actions = env.act_dim_att() if training_attacker else env.act_dim_def()
        observation_space = env.obs_dim_att() if training_attacker else env.obs_dim_def()

        # Create the new agent that we are going to train to best respond.
        best_responder = self.get_new_policy(locals_=locals(),
                                             globals_=globals())

        # Set-up experience replay buffer.
        replay_buffer = ReplayBuffer(self.buffer_size)
        assert not self.prioritized_replay, "Prioritized replay is not implemented in this PyTorch recreation."

        # Create exploration schedule.
        exploration = LinearSchedule(schedule_timesteps=int(
            self.exploration_fraction * self.total_timesteps),
                                     initial_p=self.exploration_initial_eps,
                                     final_p=self.exploration_final_eps)

        # Set-up training variables.
        mean_rewards = []
        episode_rewards = [0.0]
        saved_mean_reward = None

        # Begin episode.
        obs = env.reset_everything_with_return()
        reset = True

        # Sample our initial opponent's strategy.
        opponent_sampler = OpponentSampler(
            env=env, opponent_identity=0 if training_attacker else 1)
        opponent_sampler.sample()

        # Establish temporary directory to hold checkpoints of our agent from throughout training.
        # We do this so we can return the best version of our agent throughout training.
        temp_dir = tempfile.TemporaryDirectory()
        best_model_path = osp.join(temp_dir.name, "model.pytorch")

        # Time metrics.
        time_init = time.time() - time_init
        t_transitions = []
        t_actions = []
        t_steps = []
        t_samples = []
        t_updates = []
        n_updates = 0.0

        # Reward Shaping
        temp_buffer = []

        # Environment training loop.
        time_training = time.time()
        for t in range(self.total_timesteps):
            time_transition = time.time()

            # Check termination conditions.
            if self.callback is not None and self.callback(
                    locals(), globals()):
                break

            # Collect meta-data agent may need to compute action.
            time_action = time.time()
            action_kwargs = {}

            # Update exploration strategy.
            if self.param_noise:
                update_eps = 0.0
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps=exploration.value(t).
                # See Appendix C.1 in `Parameter Space Noise for Exploration`, Plappert et al., 2017.
                update_param_noise_threshold = -1.0 * np.log(
                    1.0 - exploration.value(t) +
                    exploration.value(t) / float(env.action_space.n))
                action_kwargs["reset"] = reset
                action_kwargs[
                    "update_param_noise_threshold"] = update_param_noise_threshold
                action_kwargs["update_param_noise_scale"] = True

            else:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.0

            # If we are the attacker, apply a mask to our action space.
            if training_attacker:
                mask = mask_generator_att(env, np.array(obs)[None])
            else:
                mask = None

            # Step agent.
            writer.add_scalar(f"{log_prefix}/epsilon", update_eps, t)
            action = best_responder.act(observation=np.array(obs)[None],
                                        stochastic=True,
                                        update_eps=update_eps,
                                        mask=mask,
                                        training_attacker=training_attacker,
                                        **action_kwargs)[0]
            t_actions += [time.time() - time_action]

            # Step environment.
            time_step = time.time()
            new_obs, reward, done = env.step(action)
            t_steps += [time.time() - time_step]

            # Store transition data.
            # Reward shaping
            if self.reward_shaping:
                pass_flag = False
                if training_attacker == 0:
                    rewards_shaping = env.rewards()
                    if rewards_shaping['pass_flag']:
                        for transition in temp_buffer:
                            obs0, action0, rew0, new_obs0, done0 = transition
                            rew_new = rewards_shaping[str(action0)].v
                            episode_rewards[-1] += rew_new
                            replay_buffer.add(obs0, action0, rew_new, new_obs0,
                                              done0)
                        temp_buffer = []
                        env.reset_reward_shaping()
                        pass_flag = True
                elif training_attacker == 1:
                    rewards_shaping = env.rewards()
                    if rewards_shaping['pass_flag']:
                        for transition in temp_buffer:
                            obs1, action1, rew1, new_obs1, done1 = transition
                            rew_new = rewards_shaping[str(action1)].v
                            episode_rewards[-1] += rew_new
                            replay_buffer.add(obs1, action1, rew_new, new_obs1,
                                              done1)
                        temp_buffer = []
                        env.reset_reward_shaping()
                        pass_flag = True

                if pass_flag:
                    episode_rewards[-1] += reward
                    replay_buffer.add(obs, action, reward, new_obs,
                                      float(done))
                else:
                    temp_buffer.append(
                        (obs, action, reward, new_obs, float(done)))

                obs = new_obs

                if done:
                    obs = env.reset_everything_with_return()
                    episode_rewards.append(0.0)
                    reset = True
                    # Sample a new strategy from the meta-strategy solver.
                    opponent_sampler.sample()

            # No reward shaping.
            else:
                replay_buffer.add(obs, action, reward, new_obs, float(done))
                obs = new_obs
                episode_rewards[-1] += reward

                # If the environment finished, reset the environment and sample from opponent's meta-strategy.
                if done:
                    obs = env.reset_everything_with_return()
                    opponent_sampler.sample()

                    # Log the environment reset.
                    episode_rewards.append(0.0)
                    reset = True

            # Periodically train our policy.
            if (t > self.learning_starts) and (t % self.train_freq == 0):
                n_updates += 1.0
                time_sample = time.time()
                # Collect batch (b) of experiences.
                b_o, b_a, b_r, b_op, b_d = replay_buffer.sample(
                    self.batch_size)
                b_weights = np.ones_like(b_r)

                # Generate action masks.
                if training_attacker:
                    b_mask = mask_generator_att(env, b_op)
                else:
                    b_mask = None

                t_samples += [time.time() - time_sample]

                time_update = time.time()
                best_responder.update(observations=b_o,
                                      actions=b_a,
                                      rewards=b_r,
                                      next_observations=b_op,
                                      done_mask=b_d,
                                      importance_weights=b_weights,
                                      mask=b_mask,
                                      training_attacker=training_attacker,
                                      summary_writer=writer,
                                      t=t)
                t_updates += [time.time() - time_update]

            # Periodically update target network.
            if (t > self.learning_starts) and (
                    t % self.target_network_update_freq == 0):
                best_responder.update_target_network()

            # Record results.
            n_episodes = len(episode_rewards)
            if t > self.learning_starts:
                # Note: despite the name, this averages up to the last 250 completed episodes.
                mean_100ep_reward = round(np.mean(episode_rewards[-251:-1]), 1)
                mean_rewards.append(mean_100ep_reward)
                writer.add_scalar(f"{log_prefix}/mean_reward",
                                  np.nan_to_num(mean_100ep_reward), t)

            # Periodically save a snapshot of our best-responder.
            if (self.checkpoint_freq
                    is not None) and (t > self.learning_starts) and (
                        n_episodes > 100) and (t % self.checkpoint_freq == 0):
                # Save checkpoints of only the best-performing model we have encountered.
                if (saved_mean_reward is None) or (mean_100ep_reward >
                                                   saved_mean_reward):
                    torch.save(best_responder,
                               best_model_path,
                               pickle_module=dill)
                    saved_mean_reward = mean_100ep_reward

            t_transitions += [time.time() - time_transition]

        # Load the best-performing encountered policy as our resulting best-responder.
        BD = None
        if osp.exists(best_model_path):
            best_responder = torch.load(best_model_path)
            BD = saved_mean_reward if saved_mean_reward is not None else mean_100ep_reward

        # Clean-up temporary directory.
        temp_dir.cleanup()

        # Save data to generate learning curves.
        name = "attacker" if training_attacker else "defender"
        data_path = osp.join(settings.get_run_dir(),
                             f"mean_rewards.{name}.{epoch}.pkl")
        fp.save_pkl(mean_rewards, data_path)

        # Log timing statistics.
        # We assemble these into a string that is returned for the main process to print,
        # which avoids potential multiprocessing logging issues.
        report = ""
        report += "  - n_transitions: {}\n".format(len(t_transitions))
        report += "  - n_updates: {}\n".format(len(t_updates))
        report += "  - t_init: {}\n".format(time_init)
        report += "  - t_transitions: {}\n".format(np.mean(t_transitions))
        report += "  - t_actions: {}\n".format(np.mean(t_actions))
        report += "  - t_steps: {}\n".format(np.mean(t_steps))
        report += "  - t_samples: {}\n".format(np.mean(t_samples))
        report += "  - t_updates: {}\n".format(np.mean(t_updates))

        return best_responder, BD, replay_buffer, report
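
When training the attacker, the mask produced by mask_generator_att restricts which actions may be selected. A generic sketch of masked action selection over Q-values (the act/update signatures above are specific to the repository's DQN implementation):

import numpy as np


def masked_argmax(q_values, mask):
    """ Pick the highest-value action among those the mask marks as legal (1). """
    masked_q = np.where(mask.astype(bool), q_values, -np.inf)
    return int(np.argmax(masked_q))


q_values = np.array([0.2, 0.9, 0.5])
mask = np.array([1, 0, 1])  # Action 1 is illegal in this state.
print(masked_argmax(q_values, mask))  # 2
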