Example #1
    def init_static_policy_distribution_after_trainer_init_callback(trainer):
        trainer.storage_client = connect_storage_client()

        logger.info("Initializing trainer manager interface")
        trainer.manager_interface = LearnerManagerInterface(server_host=MANAGER_SEVER_HOST,
                                                            port=MANAGER_PORT,
                                                            worker_id=full_experiment_name,
                                                            storage_client=trainer.storage_client,
                                                            minio_bucket_name=BUCKET_NAME)

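        # Approximate a metanash via fictitious play over the latest payoff table from the manager;
        # the resulting probs drive opponent selection for the static policy below.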
        selection_probs, payoff_table, payoff_table_key = get_fp_metanash_for_latest_payoff_table(
            manager_interface=trainer.manager_interface,
            fp_iters=METANASH_FICTITIOUS_PLAY_ITERS,
            accepted_opponent_policy_class_names=ACCEPTED_OPPONENT_POLICY_CLASS_NAMES,
            accepted_opponent_model_config_keys=ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS,
            add_payoff_matrix_noise_std_dev=0.0,
            mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF
        )

        if selection_probs is None:
            assert payoff_table is None
            assert payoff_table_key is None
            print("Payoff table is empty so using random weights for static policy.")
        else:
            print(f"Payoff table loaded from {payoff_table_key}")
            print(f"Policy selection probs: {selection_probs}")

        payoff_table_dill_str = dill.dumps(payoff_table)
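
        # Each rollout worker's static policy gets the selection probs and a
        # deserialized copy of the payoff table.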
        def worker_set_static_policy_distribution(worker):
            worker.policy_map[STATIC_POLICY].static_policy_selection_probs = selection_probs
            worker.policy_map[STATIC_POLICY].payoff_table = dill.loads(payoff_table_dill_str)
            worker.policy_map[STATIC_POLICY].current_policy_key = None

        trainer.workers.foreach_worker(worker_set_static_policy_distribution)
Example #2
        def init_static_policy_distribution_after_trainer_init_callback(
                trainer):
            trainer.storage_client = connect_storage_client()

            logger.info("Initializing trainer manager interface")
            trainer.manager_interface = LearnerManagerInterface(
                server_host=MANAGER_SEVER_HOST,
                port=MANAGER_PORT,
                worker_id=full_experiment_name,
                storage_client=trainer.storage_client,
                minio_bucket_name=BUCKET_NAME)

            logger.info("Initializing trainer lock server interface")
            trainer.lock_server_interface = LockServerInterface(
                server_host=LOCK_SERVER_HOST,
                port=LOCK_SERVER_PORT,
                worker_id=full_experiment_name)

            orig_selection_probs, payoff_table, payoff_table_key = get_fp_metanash_for_latest_payoff_table(
                manager_interface=trainer.manager_interface,
                fp_iters=METANASH_FICTITIOUS_PLAY_ITERS,
                accepted_opponent_policy_class_names=
                ACCEPTED_OPPONENT_POLICY_CLASS_NAMES,
                accepted_opponent_model_config_keys=
                ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS,
                add_payoff_matrix_noise_std_dev=0.0,
                mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF)

            if orig_selection_probs is None:
                assert payoff_table is None
                assert payoff_table_key is None
                selection_probs = None
                print(
                    "Payoff table is empty so using random weights for static policy."
                )
            else:
                print(f"Payoff table loaded from {payoff_table_key}")
                print(f"Original Selection Probs: {orig_selection_probs}")

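                # Try to reserve one policy key through the lock server (prioritized by its
                # metanash prob) so this learner can leave that opponent out of its distribution.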
                policy_key_to_leave_out = get_unreserved_policy_key_with_priorities(
                    lock_server_interface=trainer.lock_server_interface,
                    policy_keys=payoff_table.get_ordered_keys_in_payoff_matrix(
                    ),
                    policy_priorities=orig_selection_probs)

                if policy_key_to_leave_out is None:
                    selection_probs = orig_selection_probs
                    print(
                        "No policy keys available to reserve so using unaltered selection probs"
                    )
                else:
                    chosen_policy_selection_prob = orig_selection_probs[
                        payoff_table.get_policy_spec_for_key(
                            policy_key_to_leave_out).get_payoff_matrix_index()]
                    print(
                        f"\n\nLeaving out {policy_key_to_leave_out}\n"
                        f"(Had selection prob of {chosen_policy_selection_prob})\n\n"
                    )

                    selection_probs = get_fp_metanash_for_payoff_table(
                        payoff_table=payoff_table,
                        fp_iters=METANASH_FICTITIOUS_PLAY_ITERS,
                        accepted_opponent_policy_class_names=
                        ACCEPTED_OPPONENT_POLICY_CLASS_NAMES,
                        accepted_opponent_model_config_keys=
                        ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS,
                        add_payoff_matrix_noise_std_dev=0.0,
                        leave_out_indexes=[
                            payoff_table.get_policy_spec_for_key(
                                policy_key_to_leave_out).
                            get_payoff_matrix_index()
                        ],
                        mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF)
                    print(f"Subset Selection Probs: {selection_probs}")

            if selection_probs is None:
                assert payoff_table is None
                assert payoff_table_key is None
                print(
                    "Payoff table is empty so using random weights for static policy."
                )
            else:
                print(f"Payoff table loaded from {payoff_table_key}")
                print(f"Policy selection probs: {selection_probs}")

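            # Ship the (possibly leave-one-out) selection probs and the dill-serialized
            # payoff table to the static policy on every rollout worker.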
            payoff_table_dill_str = dill.dumps(payoff_table)

            def worker_set_static_policy_distribution(worker):
                worker.policy_map[
                    STATIC_POLICY].static_policy_selection_probs = selection_probs
                worker.policy_map[STATIC_POLICY].payoff_table = dill.loads(
                    payoff_table_dill_str)
                worker.policy_map[STATIC_POLICY].current_policy_key = None

            trainer.workers.foreach_worker(
                worker_set_static_policy_distribution)
Example #3
def measure_exploitability_of_metanashes_as_they_become_available():
    logger = get_logger()

    storage_client = connect_storage_client()

    worker_id = f"Exploitability_Tracker_{gethostname()}_pid_{os.getpid()}_{datetime_str()}"

    manager_interface = ConsoleManagerInterface(
        server_host=MANAGER_SEVER_HOST,
        port=MANAGER_PORT,
        worker_id=worker_id,
        storage_client=storage_client,
        minio_bucket_name=BUCKET_NAME,
        minio_local_dir=DEFAULT_LOCAL_SAVE_PATH)

    logger.info(f"Started worker \'{worker_id}\'")

    # If you use ray for more than just this single example fn, you'll need to move ray.init to the top of your main()
    ray.init(address=os.getenv('RAY_HEAD_NODE'),
             ignore_reinit_error=True,
             local_mode=True)

    model_config_file_path, _ = maybe_download_object(
        storage_client=storage_client,
        bucket_name=BUCKET_NAME,
        object_name=MODEL_CONFIG_KEY,
        force_download=False)

    with open(model_config_file_path, 'r') as config_file:
        model_config = json.load(fp=config_file)

    example_env = PokerMultiAgentEnv(env_config=POKER_ENV_CONFIG)

    logger.info("\n\n\n\n\n__________________________________________\n"
                f"LAUNCHED FOR {POKER_GAME_VERSION}\n"
                f"__________________________________________\n\n\n\n\n")

    obs_space = example_env.observation_space
    act_space = example_env.action_space

    preprocessor = StrategoDictFlatteningPreprocessor(obs_space=obs_space)
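    # Build a CPU-only TF session and a standalone policy instance (no trainer);
    # it is reused below by measure_exploitability_nonlstm with different weights.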
    graph = tf.Graph()
    sess = tf.Session(config=tf.ConfigProto(device_count={'GPU': 0}),
                      graph=graph)

    def fetch_logits(policy):
        return {
            "behaviour_logits": policy.model.last_output(),
        }

    _policy_cls = POLICY_CLASS.with_updates(
        extra_action_fetches_fn=fetch_logits)

    with graph.as_default():
        with sess.as_default():
            policy = _policy_cls(obs_space=preprocessor.observation_space,
                                 action_space=act_space,
                                 config=with_common_config({
                                     'model':
                                     with_base_config(
                                         base_config=MODEL_DEFAULTS,
                                         extra_config=model_config),
                                     'env':
                                     POKER_ENV,
                                     'env_config':
                                     POKER_ENV_CONFIG,
                                     'custom_preprocessor':
                                     STRATEGO_PREPROCESSOR
                                 }))

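    # Pull a given policy's weights from object storage and load them into the standalone policy.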
    def set_policy_weights(weights_key):
        weights_file_path, _ = maybe_download_object(
            storage_client=storage_client,
            bucket_name=BUCKET_NAME,
            object_name=weights_key,
            force_download=False)
        policy.load_model_weights(weights_file_path)

    print("(Started Successfully)")

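    # Poll the manager; whenever a new payoff table appears, compute its metanash and measure exploitability.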
    last_payoff_table_key = None
    while True:
        payoff_table, payoff_table_key = manager_interface.get_latest_payoff_table(
            infinite_retry_on_error=True)
        if payoff_table_key == last_payoff_table_key:
            time.sleep(20)
            continue
        last_payoff_table_key = payoff_table_key

        metanash_probs, _, _ = get_fp_metanash_for_latest_payoff_table(
            manager_interface=manager_interface,
            fp_iters=20000,
            accepted_opponent_policy_class_names=[POLICY_CLASS_NAME],
            accepted_opponent_model_config_keys=[POKER_ENV_CONFIG],
            add_payoff_matrix_noise_std_dev=0.000,
            mix_with_uniform_dist_coeff=None,
            p_or_lower_rounds_to_zero=0.0)

        if metanash_probs is not None:
            policy_weights_keys = payoff_table.get_ordered_keys_in_payoff_matrix(
            )

            policy_dict = {
                key: prob
                for key, prob in zip(policy_weights_keys, metanash_probs)
            }

            exploitability = measure_exploitability_nonlstm(
                rllib_policy=policy,
                poker_game_version=POKER_GAME_VERSION,
                policy_mixture_dict=policy_dict,
                set_policy_weights_fn=set_policy_weights)
            print(f"Exploitability: {exploitability}")
 print(f"Starting generation {generation_index}")
 active_job_list = []
 active_job_policy_keys = []
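 # For generation 0, reserve a single job against the random policy (no metanash to sample from yet).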
 if generation_index == 0:
     random_policy_key = "random"
     vs_random_job_str = _job_str_for_policy_key(
         policy_key=random_policy_key)
     assert lock_server_interface.try_to_reserve_item(
         item_name=vs_random_job_str)
     active_job_list.append(vs_random_job_str)
     active_job_policy_keys.append(random_policy_key)
 else:
     # get metanash probs and make jobs for non-zero policies
     selection_probs, payoff_table, payoff_table_key = get_fp_metanash_for_latest_payoff_table(
         manager_interface=manager_interface,
         fp_iters=20000,
         add_payoff_matrix_noise_std_dev=0.0,
         mix_with_uniform_dist_coeff=None)
     for policy_key, prob in zip(
             payoff_table.get_ordered_keys_in_payoff_matrix(),
             selection_probs):
         if prob > 0:
             job_str = _job_str_for_policy_key(policy_key=policy_key)
             assert lock_server_interface.try_to_reserve_item(
                 item_name=job_str)
             active_job_list.append(job_str)
             active_job_policy_keys.append(policy_key)
 print(
     f"\n\n\nLaunched the following jobs for generation {generation_index}:"
 )
 for job in active_job_list:
Example #5
    base_experiment_name = f"{CLOUD_PREFIX}learner_{POKER_GAME_VERSION}_sac_arch1_hparam_search_multexp"
    full_experiment_name = f"{base_experiment_name}_{gethostname()}_pid_{os.getpid()}_{datetime_str()}"
    experiment_save_dir = os.path.join(DEFAULT_RESULTS_DIR,
                                       full_experiment_name)

    manager_interface = LearnerManagerInterface(server_host=MANAGER_SEVER_HOST,
                                                port=MANAGER_PORT,
                                                worker_id=full_experiment_name,
                                                storage_client=storage_client,
                                                minio_bucket_name=BUCKET_NAME)

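    # Approximate-Nash (fictitious play) selection probs over the latest payoff table from the manager.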
    selection_probs, _, _ = get_fp_metanash_for_latest_payoff_table(
        manager_interface=manager_interface,
        fp_iters=METANASH_FICTITIOUS_PLAY_ITERS,
        accepted_opponent_policy_class_names=
        ACCEPTED_OPPONENT_POLICY_CLASS_NAMES,
        accepted_opponent_model_config_keys=ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS,
        add_payoff_matrix_noise_std_dev=0.0,
        mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF)

    def init_static_policy_distribution_after_trainer_init_callback(trainer):
        trainer.storage_client = connect_storage_client()

        logger.info("Initializing trainer manager interface")
        trainer.manager_interface = LearnerManagerInterface(
            server_host=MANAGER_SEVER_HOST,
            port=MANAGER_PORT,
            worker_id=full_experiment_name,
            storage_client=trainer.storage_client,
            minio_bucket_name=BUCKET_NAME)
        payoff_table, payoff_table_key = trainer.manager_interface.get_latest_payoff_table(