Example No. 1
    def init_static_policy_distribution_after_trainer_init_callback(trainer):
        trainer.storage_client = connect_storage_client()

        logger.info("Initializing trainer manager interface")
        trainer.manager_interface = LearnerManagerInterface(server_host=MANAGER_SEVER_HOST,
                                                            port=MANAGER_PORT,
                                                            worker_id=full_experiment_name,
                                                            storage_client=trainer.storage_client,
                                                            minio_bucket_name=BUCKET_NAME)

        selection_probs, payoff_table, payoff_table_key = get_fp_metanash_for_latest_payoff_table(
            manager_interface=trainer.manager_interface,
            fp_iters=METANASH_FICTITIOUS_PLAY_ITERS,
            accepted_opponent_policy_class_names=ACCEPTED_OPPONENT_POLICY_CLASS_NAMES,
            accepted_opponent_model_config_keys=ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS,
            add_payoff_matrix_noise_std_dev=0.0,
            mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF
        )

        if selection_probs is None:
            assert payoff_table is None
            assert payoff_table_key is None
            print("Payoff table is empty so using random weights for static policy.")
        else:
            print(f"Payoff table loaded from {payoff_table_key}")
            print(f"Policy selection probs: {selection_probs}")

        payoff_table_dill_str = dill.dumps(payoff_table)
        def worker_set_static_policy_distribution(worker):
            worker.policy_map[STATIC_POLICY].static_policy_selection_probs = selection_probs
            worker.policy_map[STATIC_POLICY].payoff_table = dill.loads(payoff_table_dill_str)
            worker.policy_map[STATIC_POLICY].current_policy_key = None

        trainer.workers.foreach_worker(worker_set_static_policy_distribution)

        def _do_live_policy_checkpoint(trainer, training_iteration):
            local_train_policy = trainer.workers.local_worker(
            ).policy_map[TRAIN_POLICY]
            checkpoints_dir = os.path.join(experiment_save_dir,
                                           "policy_checkpoints")
            checkpoint_name = f"policy_{trainer.claimed_policy_num}_{datetime_str()}_iter_{training_iteration}.dill"
            checkpoint_save_path = os.path.join(checkpoints_dir,
                                                checkpoint_name)
            local_train_policy.save_model_weights(
                save_file_path=checkpoint_save_path,
                remove_scope_prefix=TRAIN_POLICY)
            policy_key = os.path.join(base_experiment_name,
                                      full_experiment_name,
                                      "policy_checkpoints", checkpoint_name)
            storage_client = connect_storage_client()
            upload_file(storage_client=storage_client,
                        bucket_name=BUCKET_NAME,
                        object_key=policy_key,
                        local_source_path=checkpoint_save_path)

            locks_checkpoint_name = f"dch_population_checkpoint_{datetime_str()}"

            ray_get_and_free(
                trainer.live_table_tracker.set_latest_key_for_claimed_policy.
                remote(
                    new_key=policy_key,
                    request_locks_checkpoint_with_name=locks_checkpoint_name))
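
Example No. 1 resolves the current population's metanash with fictitious play and mixes it with a uniform distribution (mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF). Below is a minimal, self-contained sketch of that idea for a symmetric zero-sum payoff matrix; it is only an illustration of the technique, not the repo's get_fp_metanash_for_latest_payoff_table.

import numpy as np

def fictitious_play_metanash(payoff_matrix, fp_iters=2000, mix_with_uniform_dist_coeff=0.0):
    # Approximate Nash for a symmetric zero-sum game via fictitious play.
    n = payoff_matrix.shape[0]
    best_response_counts = np.zeros(n)
    avg_strategy = np.ones(n) / n
    for t in range(1, fp_iters + 1):
        # best response against the opponent's empirical average strategy
        br = int(np.argmax(payoff_matrix @ avg_strategy))
        best_response_counts[br] += 1
        avg_strategy = best_response_counts / t
    selection_probs = best_response_counts / best_response_counts.sum()
    if mix_with_uniform_dist_coeff:
        c = mix_with_uniform_dist_coeff
        selection_probs = (1.0 - c) * selection_probs + c / n
    return selection_probs

# rock-paper-scissors: fictitious play converges toward the uniform metanash
rps_payoffs = np.array([[0., -1., 1.], [1., 0., -1.], [-1., 1., 0.]])
print(fictitious_play_metanash(rps_payoffs, fp_iters=5000, mix_with_uniform_dist_coeff=0.1))
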
Example No. 3
    def __init__(self, config):
        # Stop event for server handler threads to signal this thread that it's time to shutdown
        self._stop_event = threading.Event()
        self._grpc_port = config['grpc_port']
        self._grpc_server = grpc.server(
            futures.ThreadPoolExecutor(
                max_workers=config['num_thread_workers']))

        self._storage_client = connect_storage_client()

        self._root_save_dir = config['logs_and_payoff_table_save_key_prefix'] \
            .replace("DATETIMESTR", datetime_str()) \
            .replace("HOSTNAME", gethostname()) \
            .replace("PID", str(os.getpid()))
        self._root_save_dir = f"{CLOUD_PREFIX}{self._root_save_dir}"

        logger.info(f"root save key prefix is {self._root_save_dir}")

        self._payoff_table_save_dir = os.path.join(self._root_save_dir,
                                                   "payoff_tables")

        servicer = _PopulationServerServicerImpl(
            stop_event=self._stop_event,
            payoff_table_save_key_prefix_dir=self._payoff_table_save_dir,
            storage_client=self._storage_client,
            bucket_name=BUCKET_NAME,
            max_ping_interval_seconds_to_track_workers=config[
                'max_ping_interval_seconds_to_track_workers'],
            num_games_to_play_for_matchup_evals=config[
                'games_per_eval_matchup'],
            restore_from_payoff_table_key=config[
                'restore_from_payoff_table_key'])
        add_PopulationServerServicer_to_server(servicer=servicer,
                                               server=self._grpc_server)
        self._grpc_server.add_insecure_port(f'[::]:{self._grpc_port}')
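
The __init__ above only builds the server; a start/wait/stop loop normally follows. Here is a minimal, self-contained sketch of that lifecycle using the same public grpc and threading APIs (the timer below merely stands in for an RPC handler setting the stop event):

import threading
from concurrent import futures

import grpc

stop_event = threading.Event()
server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
port = server.add_insecure_port("[::]:0")  # 0 = let the OS pick a free port
server.start()
print(f"serving on port {port}")

# In the real service, a handler thread sets the stop event when shutdown is
# requested; here a timer simulates that after one second.
threading.Timer(1.0, stop_event.set).start()

stop_event.wait()
server.stop(grace=2.0).wait()  # drain in-flight RPCs, then shut down
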
Example No. 4
        def sample_new_static_policy_weights_for_each_worker_on_episode_start(
                params):
            policies = params['policy']

            static_policy = policies[STATIC_POLICY]

            if static_policy.static_policy_selection_probs is None:
                return

            selected_policy_index = np.random.choice(
                a=list(range(len(
                    static_policy.static_policy_selection_probs))),
                p=static_policy.static_policy_selection_probs)
            selected_policy_spec: PolicySpec = static_policy.payoff_table.get_policy_for_index(
                selected_policy_index)
            assert selected_policy_spec.class_name in ACCEPTED_OPPONENT_POLICY_CLASS_NAMES
            assert selected_policy_spec.config_key in ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS

            if static_policy.current_policy_key != selected_policy_spec.key:
                # print(f"sampled policy {selected_policy_spec.key} (loading weights)")
                storage_client = connect_storage_client()
                weights_local_path, _ = maybe_download_object(
                    storage_client=storage_client,
                    bucket_name=BUCKET_NAME,
                    object_name=selected_policy_spec.key,
                    force_download=False)
                static_policy.load_model_weights(
                    load_file_path=weights_local_path,
                    add_scope_prefix=STATIC_POLICY)
                static_policy.current_policy_key = selected_policy_spec.key
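
The callback above resamples an opponent every episode but only reloads weights when the sampled key changes. A self-contained sketch of that sample-and-cache pattern (the loader is a hypothetical stand-in for maybe_download_object plus load_model_weights):

import numpy as np

selection_probs = [0.5, 0.3, 0.2]              # e.g. a metanash over 3 policies
policy_keys = ["policy_0", "policy_1", "policy_2"]
current_policy_key = None

def load_weights_for_key(key):                 # hypothetical stand-in loader
    print(f"loading weights for {key}")

for episode in range(5):
    idx = np.random.choice(len(selection_probs), p=selection_probs)
    sampled_key = policy_keys[idx]
    if sampled_key != current_policy_key:      # skip redundant downloads/loads
        load_weights_for_key(sampled_key)
        current_policy_key = sampled_key
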
Example No. 5
    def init_static_policy_distribution_after_trainer_init_callback(trainer):
        trainer.storage_client = connect_storage_client()

        logger.info("Initializing trainer manager interface")
        trainer.manager_interface = LearnerManagerInterface(
            server_host=MANAGER_SEVER_HOST,
            port=MANAGER_PORT,
            worker_id=full_experiment_name,
            storage_client=trainer.storage_client,
            minio_bucket_name=BUCKET_NAME)
        payoff_table, payoff_table_key = trainer.manager_interface.get_latest_payoff_table()
        # selection_probs (checked below) is derived from this payoff table in the
        # full script; that step is not shown in this excerpt (compare Example No. 1,
        # which solves it with fictitious play).

        if selection_probs is None:
            assert payoff_table is None
            assert payoff_table_key is None
            print(
                "Payoff table is empty so using random weights for static policy."
            )
        else:
            print(f"Payoff table loaded from {payoff_table_key}")
            print(f"Policy selection probs: {selection_probs}")

        payoff_table_dill_str = dill.dumps(payoff_table)

        def worker_set_static_policy_distribution(worker):
            worker.policy_map[
                STATIC_POLICY].static_policy_selection_probs = selection_probs
            worker.policy_map[STATIC_POLICY].payoff_table = dill.loads(
                payoff_table_dill_str)
            worker.policy_map[STATIC_POLICY].current_policy_key = None

        trainer.workers.foreach_worker(worker_set_static_policy_distribution)

        def claim_new_active_policy_after_trainer_init_callback(trainer):
            def set_train_policy_warmup_target_entropy_proportion(worker):
                worker.policy_map[TRAIN_POLICY].set_target_entropy_proportion(
                    PIPELINE_WARMUP_ENTROPY_TARGET_PROPORTION)

            trainer.workers.foreach_worker(
                set_train_policy_warmup_target_entropy_proportion)

            trainer.storage_client = connect_storage_client()

            logger.info("Initializing trainer manager interface")
            trainer.manager_interface = LearnerManagerInterface(
                server_host=MANAGER_SERVER_HOST,
                port=MANAGER_PORT,
                worker_id=full_experiment_name,
                storage_client=trainer.storage_client,
                minio_bucket_name=BUCKET_NAME)

            trainer.live_table_tracker = LivePolicyPayoffTracker.remote(
                minio_endpoint=MINIO_ENDPOINT,
                minio_access_key=MINIO_ACCESS_KEY,
                minio_secret_key=MINIO_SECRET_KEY,
                minio_bucket=BUCKET_NAME,
                manager_host=MANAGER_SERVER_HOST,
                manager_port=MANAGER_PORT,
                lock_server_host=LOCK_SERVER_HOST,
                lock_server_port=LOCK_SERVER_PORT,
                worker_id=full_experiment_name,
                policy_class_name=TRAIN_POLICY_CLASS.__name__,
                policy_config_key=TRAIN_POLICY_MODEL_CONFIG_KEY,
                provide_payoff_barrier_sync=
                not PIPELINE_LIVE_PAYOFF_TABLE_CALC_IS_ASYNCHRONOUS)
            trainer.claimed_policy_num = ray_get_and_free(
                trainer.live_table_tracker.get_claimed_policy_num.remote())
            trainer.are_all_lower_policies_finished = False
            trainer.payoff_table_needs_update_started = False
            trainer.payoff_table = None
            _do_live_policy_checkpoint(trainer=trainer, training_iteration=0)

            if not PIPELINE_LIVE_PAYOFF_TABLE_CALC_IS_ASYNCHRONOUS:
                # wait for all other learners to also reach this point before continuing
                ray_get_and_free(trainer.live_table_tracker.
                                 wait_at_barrier_for_other_learners.remote())

            trainer.new_payoff_table_promise = trainer.live_table_tracker.get_live_payoff_table_dill_pickled.remote(
                first_wait_for_n_seconds=2)
            _process_new_live_payoff_table_result_if_ready(
                trainer=trainer, block_until_result_is_ready=True)

            if INIT_FROM_POPULATION:
                init_train_policy_weights_from_static_policy_distribution_after_trainer_init_callback(
                    trainer=trainer)
            else:
                print(
                    colored(
                        f"Policy {trainer.claimed_policy_num}: (Initializing train policy to random)",
                        "white"))
Example No. 7
        def init_static_policy_distribution_after_trainer_init_callback(
                trainer):
            trainer.storage_client = connect_storage_client()

            logger.info("Initializing trainer manager interface")
            trainer.manager_interface = LearnerManagerInterface(
                server_host=MANAGER_SEVER_HOST,
                port=MANAGER_PORT,
                worker_id=full_experiment_name,
                storage_client=trainer.storage_client,
                minio_bucket_name=BUCKET_NAME)

        def set_policy_weights(weights_key):
            print(f"weights are {weights_key}")
            storage_client = connect_storage_client()
            weights_file_path, _ = maybe_download_object(
                storage_client=storage_client,
                bucket_name=BUCKET_NAME,
                object_name=weights_key,
                force_download=False)
            print("got weights")
            local_exploit_rllib_policy.load_model_weights(
                weights_file_path, add_scope_prefix=STATIC_POLICY)

        def init_static_policy_distribution_after_trainer_init_callback(trainer):
            trainer.storage_client = connect_storage_client()

            logger.info("Initializing trainer manager interface")
            trainer.manager_interface = LearnerManagerInterface(server_host=MANAGER_SERVER_HOST,
                                                                port=MANAGER_PORT,
                                                                worker_id=full_experiment_name,
                                                                storage_client=trainer.storage_client,
                                                                minio_bucket_name=BUCKET_NAME)

            trainer.lock_server_interface = LockServerInterface(server_host=LOCK_SERVER_HOST,
                                                    port=LOCK_SERVER_PORT,
                                                    worker_id=f"rectified_psro_learner_{gethostname()}_pid_{os.getpid()}")

            payoff_table, payoff_table_key = trainer.manager_interface.get_latest_payoff_table(infinite_retry_on_error=True)
            if payoff_table is None:
                assert job_init_policy_key == 'random'
                assert payoff_table_key is None
                selection_probs = None
                print(colored(
                    f"Payoff table is empty so using random weights for static policy.", "white"))
            else:
                assert job_init_policy_key != 'random'
                policies_str = ""
                for policy_key in payoff_table.get_ordered_keys_in_payoff_matrix():
                    policies_str += f"{policy_key}\n"
                print(colored(
                    f"Payoff Table Policies: {colored(policies_str, 'white')}\n",
                    "white"))

                selection_probs = get_rectified_selection_probs_for_policy_key(payoff_table=payoff_table,
                                                                               policy_key=job_init_policy_key,
                                                                               fp_iters=METANASH_FICTITIOUS_PLAY_ITERS)
                print(colored(f"Rectified Policy selection probs: {selection_probs}", "white"))

            if selection_probs is None:
                assert payoff_table is None
                assert payoff_table_key is None
                print("Payoff table is empty so using random weights for static policy.")
            else:
                print(f"Payoff table loaded from {payoff_table_key}")
                print(f"Policy selection probs: {selection_probs}")

            payoff_table_dill_str = dill.dumps(payoff_table)

            def worker_set_static_policy_distribution(worker):
                worker.policy_map[STATIC_POLICY].static_policy_selection_probs = selection_probs
                worker.policy_map[STATIC_POLICY].payoff_table = dill.loads(payoff_table_dill_str)
                worker.policy_map[STATIC_POLICY].current_policy_key = None

            trainer.workers.foreach_worker(worker_set_static_policy_distribution)
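
A hedged sketch of the "rectified" idea behind get_rectified_selection_probs_for_policy_key (simplified: the real helper re-solves a metanash with fictitious play). In rectified PSRO each learner trains against the opponents its policy already ties or beats, so this sketch keeps metanash mass only where the chosen policy's expected payoff is non-negative and renormalizes.

import numpy as np

def rectified_selection_probs(payoff_matrix, metanash_probs, policy_index):
    # payoff_matrix[i, j]: expected payoff of policy i against policy j
    beats_or_ties = payoff_matrix[policy_index] >= 0.0
    probs = np.where(beats_or_ties, metanash_probs, 0.0)
    if probs.sum() == 0.0:
        # degenerate case: the policy beats no one, fall back to the full metanash
        return np.asarray(metanash_probs, dtype=float)
    return probs / probs.sum()

rps_payoffs = np.array([[0., -1., 1.], [1., 0., -1.], [-1., 1., 0.]])
print(rectified_selection_probs(rps_payoffs, metanash_probs=[1/3, 1/3, 1/3], policy_index=0))
# -> [0.5, 0.0, 0.5]: mass only on the opponents that policy 0 ties or beats
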
Example No. 10
    def init_train_policy_weights_from_static_policy_distribution_after_trainer_init_callback(trainer):

        storage_client = connect_storage_client()
        weights_local_path, _ = maybe_download_object(storage_client=storage_client,
                                                      bucket_name=BUCKET_NAME,
                                                      object_name="learner_leduc_poker_sac_arch1_psro_sequential_explore_coeff_0.0/learner_leduc_poker_sac_arch1_psro_sequential_explore_coeff_0.0_sage_pid_29557_11.47.05PM_May-20-2020/policy_submissions/12.00.49AM_May-21-2020_iter_2263.dill",
                                                      force_download=False)

        def worker_set_train_policy_weights(worker):
            train_policy = worker.policy_map[TRAIN_POLICY]
            train_policy.load_model_weights(load_file_path=weights_local_path,
                                            add_scope_prefix=TRAIN_POLICY)

        trainer.workers.foreach_worker(worker_set_train_policy_weights)

        def init_train_policy_weights_from_static_policy_distribution_after_trainer_init_callback(
                trainer):
            local_static_policy = trainer.workers.local_worker(
            ).policy_map[STATIC_POLICY]
            local_train_policy = trainer.workers.local_worker(
            ).policy_map[TRAIN_POLICY]
            if not hasattr(local_static_policy, 'static_policy_selection_probs') or \
                    local_static_policy.static_policy_selection_probs is None:
                print(
                    colored(
                        f"Policy {trainer.claimed_policy_num}: Payoff table is empty so Initializing train policy to random",
                        "white"))
                local_train_policy.init_tag = "init from random"
                return

            selected_policy_index = np.random.choice(
                a=list(
                    range(
                        len(local_static_policy.static_policy_selection_probs))
                ),
                p=local_static_policy.static_policy_selection_probs)
            selected_policy_spec: PolicySpec = local_static_policy.payoff_table.get_policy_for_index(
                selected_policy_index)
            local_train_policy.init_tag = f"full init from {selected_policy_spec.key}"

            # may not necessarily be true in all scripts
            assert selected_policy_spec.class_name == TRAIN_POLICY_CLASS.__name__
            assert selected_policy_spec.config_key == TRAIN_POLICY_MODEL_CONFIG_KEY
            storage_client = connect_storage_client()
            weights_local_path, _ = maybe_download_object(
                storage_client=storage_client,
                bucket_name=BUCKET_NAME,
                object_name=selected_policy_spec.key,
                force_download=False)

            print(
                colored(
                    f"Policy {trainer.claimed_policy_num}: Initializing train policy to {selected_policy_spec.key}",
                    "white"))

            # TODO: Here
            def worker_set_train_policy_weights(worker):
                train_policy = worker.policy_map[TRAIN_POLICY]
                train_policy.load_model_weights(
                    load_file_path=weights_local_path,
                    add_scope_prefix=TRAIN_POLICY)

            trainer.workers.foreach_worker(worker_set_train_policy_weights)
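
Checkpoints in these examples are written with remove_scope_prefix=... and read back with add_scope_prefix=..., so weights saved from the TRAIN_POLICY scope can later be loaded into the STATIC_POLICY scope. A hedged sketch of that naming convention, with a plain dict of arrays standing in for TF variables (not the repo's implementation):

import numpy as np

def strip_scope_prefix(weights, scope):
    return {name[len(scope) + 1:]: value        # drop the leading "scope/"
            for name, value in weights.items()}

def add_scope_prefix(weights, scope):
    return {f"{scope}/{name}": value for name, value in weights.items()}

train_scope_weights = {"train_policy/fc_1/kernel": np.zeros((4, 4))}
checkpoint = strip_scope_prefix(train_scope_weights, scope="train_policy")
static_scope_weights = add_scope_prefix(checkpoint, scope="static_policy")
assert list(static_scope_weights) == ["static_policy/fc_1/kernel"]
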
Example No. 12
    def __init__(
            self,
            cache_size=0,
            record_file_path=None,
            new_record_entry_every_n_seconds=DEFAULT_RECORD_ENTRY_INTERVAL_SECONDS,
            extra_data_keys=None):

        self.catalog = {}
        self.storage_client = connect_storage_client()
        self.bucket_name = BUCKET_NAME

        self.cache = OrderedDict()
        self.cache_size = cache_size

        self.record_file_path = record_file_path
        self.new_record_entry_every_n_seconds = new_record_entry_every_n_seconds
        self.start_time = time.time()
        self.last_record_entry_time = self.start_time
        self.extra_data_keys = extra_data_keys or []
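
A self-contained sketch of the OrderedDict-as-LRU-cache pattern implied by self.cache / self.cache_size above (illustrative only; the real class caches objects fetched from the storage bucket):

from collections import OrderedDict

cache, cache_size = OrderedDict(), 2

def cache_get(key):
    if key in cache:
        cache.move_to_end(key)              # mark as most recently used
        return cache[key]
    return None

def cache_put(key, value):
    cache[key] = value
    cache.move_to_end(key)
    if cache_size and len(cache) > cache_size:
        cache.popitem(last=False)           # evict the least recently used entry

cache_put("a", 1); cache_put("b", 2); cache_get("a"); cache_put("c", 3)
print(list(cache))                          # ['a', 'c'] -- 'b' was evicted
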
Example No. 13
    def __init__(self,
                 minio_bucket,
                 manager_host,
                 manager_port,
                 lock_server_host,
                 lock_server_port,
                 worker_id,
                 policy_class_name,
                 policy_config_key,
                 provide_payoff_barrier_sync=False):
        worker_id = f"live_pop_tracker_{worker_id[worker_id.find('pid'):]}"
        self._storage_client = connect_storage_client()
        self._minio_bucket = minio_bucket
        self._manager_interface = ConsoleManagerInterface(
            server_host=manager_host,
            port=manager_port,
            worker_id=worker_id,
            storage_client=self._storage_client,
            minio_bucket_name=self._minio_bucket)
        self._lock_interface = LockServerInterface(
            server_host=lock_server_host,
            port=lock_server_port,
            worker_id=worker_id)
        self._policy_class_name = policy_class_name
        self._policy_config_key = policy_config_key
        self._claimed_policy_num = None
        self._claim_new_active_policy()
        assert self._claimed_policy_num is not None

        self._locally_cached_matchup_results = {}

        self._provide_payoff_barrier_sync = provide_payoff_barrier_sync
        if self._provide_payoff_barrier_sync:
            self._wait_at_payoff_table_barrier_fn, self._leave_barrier_group_fn = self._lock_interface.join_barrier_group(
                barrier_name="pt_barrier",
                member_name=str(self._claimed_policy_num),
                grace_period_for_others_to_join_s=20.0)
        else:
            self._wait_at_payoff_table_barrier_fn = None
            self._leave_barrier_group_fn = None
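
The barrier group joined above makes every learner block at its wait function until all members arrive. Here is a local analogue using threading.Barrier (the real version synchronizes separate learner processes through the lock server rather than threads):

import threading

num_learners = 3
barrier = threading.Barrier(parties=num_learners)

def learner(policy_num):
    print(f"learner {policy_num}: checkpointed, waiting at payoff-table barrier")
    barrier.wait()                          # blocks until all learners arrive
    print(f"learner {policy_num}: everyone is ready, continuing")

threads = [threading.Thread(target=learner, args=(i,)) for i in range(num_learners)]
for t in threads:
    t.start()
for t in threads:
    t.join()
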
Example No. 14
def measure_exploitability_of_metanashes_as_they_become_available():
    logger = get_logger()

    storage_client = connect_storage_client()

    worker_id = f"Exploitability_Tracker_{gethostname()}_pid_{os.getpid()}_{datetime_str()}"

    manager_interface = ConsoleManagerInterface(
        server_host=MANAGER_SEVER_HOST,
        port=MANAGER_PORT,
        worker_id=worker_id,
        storage_client=storage_client,
        minio_bucket_name=BUCKET_NAME,
        minio_local_dir=DEFAULT_LOCAL_SAVE_PATH)

    logger.info(f"Started worker \'{worker_id}\'")

    # If you use ray for more than just this single example fn, you'll need to move ray.init to the top of your main()
    ray.init(address=os.getenv('RAY_HEAD_NODE'),
             ignore_reinit_error=True,
             local_mode=True)

    model_config_file_path, _ = maybe_download_object(
        storage_client=storage_client,
        bucket_name=BUCKET_NAME,
        object_name=MODEL_CONFIG_KEY,
        force_download=False)

    with open(model_config_file_path, 'r') as config_file:
        model_config = json.load(fp=config_file)

    example_env = PokerMultiAgentEnv(env_config=POKER_ENV_CONFIG)

    logger.info("\n\n\n\n\n__________________________________________\n"
                f"LAUNCHED FOR {POKER_GAME_VERSION}\n"
                f"__________________________________________\n\n\n\n\n")

    obs_space = example_env.observation_space
    act_space = example_env.action_space

    preprocessor = StrategoDictFlatteningPreprocessor(obs_space=obs_space)
    graph = tf.Graph()
    sess = tf.Session(config=tf.ConfigProto(device_count={'GPU': 0}),
                      graph=graph)

    def fetch_logits(policy):
        return {
            "behaviour_logits": policy.model.last_output(),
        }

    _policy_cls = POLICY_CLASS.with_updates(
        extra_action_fetches_fn=fetch_logits)

    with graph.as_default():
        with sess.as_default():
            policy = _policy_cls(obs_space=preprocessor.observation_space,
                                 action_space=act_space,
                                 config=with_common_config({
                                     'model':
                                     with_base_config(
                                         base_config=MODEL_DEFAULTS,
                                         extra_config=model_config),
                                     'env':
                                     POKER_ENV,
                                     'env_config':
                                     POKER_ENV_CONFIG,
                                     'custom_preprocessor':
                                     STRATEGO_PREPROCESSOR
                                 }))

    def set_policy_weights(weights_key):
        weights_file_path, _ = maybe_download_object(
            storage_client=storage_client,
            bucket_name=BUCKET_NAME,
            object_name=weights_key,
            force_download=False)
        policy.load_model_weights(weights_file_path)

    print("(Started Successfully)")

    last_payoff_table_key = None
    while True:
        payoff_table, payoff_table_key = manager_interface.get_latest_payoff_table(
            infinite_retry_on_error=True)
        if payoff_table_key == last_payoff_table_key:
            time.sleep(20)
            continue
        last_payoff_table_key = payoff_table_key

        metanash_probs, _, _ = get_fp_metanash_for_latest_payoff_table(
            manager_interface=manager_interface,
            fp_iters=20000,
            accepted_opponent_policy_class_names=[POLICY_CLASS_NAME],
            accepted_opponent_model_config_keys=[POKER_ENV_CONFIG],
            add_payoff_matrix_noise_std_dev=0.000,
            mix_with_uniform_dist_coeff=None,
            p_or_lower_rounds_to_zero=0.0)

        if metanash_probs is not None:
            policy_weights_keys = payoff_table.get_ordered_keys_in_payoff_matrix(
            )

            policy_dict = {
                key: prob
                for key, prob in zip(policy_weights_keys, metanash_probs)
            }

            exploitability = measure_exploitability_nonlstm(
                rllib_policy=policy,
                poker_game_version=POKER_GAME_VERSION,
                policy_mixture_dict=policy_dict,
                set_policy_weights_fn=set_policy_weights)
            print(f"Exploitability: {exploitability}")

        def init_static_policy_distribution_after_trainer_init_callback(
                trainer):
            trainer.storage_client = connect_storage_client()

            logger.info("Initializing trainer manager interface")
            trainer.manager_interface = LearnerManagerInterface(
                server_host=MANAGER_SEVER_HOST,
                port=MANAGER_PORT,
                worker_id=full_experiment_name,
                storage_client=trainer.storage_client,
                minio_bucket_name=BUCKET_NAME)

            logger.info("Initializing trainer lock server interface")
            trainer.lock_server_interface = LockServerInterface(
                server_host=LOCK_SERVER_HOST,
                port=LOCK_SERVER_PORT,
                worker_id=full_experiment_name)

            orig_selection_probs, payoff_table, payoff_table_key = get_fp_metanash_for_latest_payoff_table(
                manager_interface=trainer.manager_interface,
                fp_iters=METANASH_FICTITIOUS_PLAY_ITERS,
                accepted_opponent_policy_class_names=
                ACCEPTED_OPPONENT_POLICY_CLASS_NAMES,
                accepted_opponent_model_config_keys=
                ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS,
                add_payoff_matrix_noise_std_dev=0.0,
                mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF)

            if orig_selection_probs is None:
                assert payoff_table is None
                assert payoff_table_key is None
                selection_probs = None
                print(
                    "Payoff table is empty so using random weights for static policy."
                )
            else:
                print(f"Payoff table loaded from {payoff_table_key}")
                print(f"Original Selection Probs: {orig_selection_probs}")

                policy_key_to_leave_out = get_unreserved_policy_key_with_priorities(
                    lock_server_interface=trainer.lock_server_interface,
                    policy_keys=payoff_table.get_ordered_keys_in_payoff_matrix(
                    ),
                    policy_priorities=orig_selection_probs)

                if policy_key_to_leave_out is None:
                    selection_probs = orig_selection_probs
                    print(
                        "No policy keys available to reserve so using unaltered selection probs"
                    )
                else:
                    chosen_policy_selection_prob = orig_selection_probs[
                        payoff_table.get_policy_spec_for_key(
                            policy_key_to_leave_out).get_payoff_matrix_index()]
                    print(
                        f"\n\nLeaving out {policy_key_to_leave_out}\n"
                        f"(Had selection prob of ({chosen_policy_selection_prob})\n\n"
                    )

                    selection_probs = get_fp_metanash_for_payoff_table(
                        payoff_table=payoff_table,
                        fp_iters=METANASH_FICTITIOUS_PLAY_ITERS,
                        accepted_opponent_policy_class_names=
                        ACCEPTED_OPPONENT_POLICY_CLASS_NAMES,
                        accepted_opponent_model_config_keys=
                        ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS,
                        add_payoff_matrix_noise_std_dev=0.0,
                        leave_out_indexes=[
                            payoff_table.get_policy_spec_for_key(
                                policy_key_to_leave_out).
                            get_payoff_matrix_index()
                        ],
                        mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF)
                    print(f"Subset Selection Probs: {selection_probs}")

            if selection_probs is None:
                assert payoff_table is None
                assert payoff_table_key is None
                print(
                    "Payoff table is empty so using random weights for static policy."
                )
            else:
                print(f"Payoff table loaded from {payoff_table_key}")
                print(f"Policy selection probs: {selection_probs}")

            payoff_table_dill_str = dill.dumps(payoff_table)

            def worker_set_static_policy_distribution(worker):
                worker.policy_map[
                    STATIC_POLICY].static_policy_selection_probs = selection_probs
                worker.policy_map[STATIC_POLICY].payoff_table = dill.loads(
                    payoff_table_dill_str)
                worker.policy_map[STATIC_POLICY].current_policy_key = None

            trainer.workers.foreach_worker(
                worker_set_static_policy_distribution)
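
The leave-one-out branch above re-runs fictitious play with leave_out_indexes so the reserved policy receives no probability mass. As a hedged simplification of that intent (without re-solving the game), one can zero the reserved policy's probability and renormalize:

import numpy as np

def leave_one_out_probs(selection_probs, leave_out_index):
    probs = np.asarray(selection_probs, dtype=float).copy()
    probs[leave_out_index] = 0.0
    if probs.sum() == 0.0:
        raise ValueError("the left-out policy held all of the probability mass")
    return probs / probs.sum()

orig_selection_probs = [0.5, 0.3, 0.2]
print(leave_one_out_probs(orig_selection_probs, leave_out_index=1))  # [~0.714, 0.0, ~0.286]
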
Example No. 16
SUBMISSION_IMPROVEMENT_THRESHOLD_PER_TIMESTEPS = POKER_SUBMISSION_IMPROVEMENT_THRESHOLD_PER_TIMESTEPS
SUBMISSION_THRESHOLD_STEPS_START = POKER_SUBMISSION_THRESHOLD_STEPS_START
SUBMISSION_MIN_TIMESTEPS = POKER_SUBMISSION_MIN_TIMESTEPS
SUBMISSION_MAX_TIMESTEPS = POKER_SUBMISSION_MAX_TIMESTEPS
CLOUD_PREFIX = os.getenv("CLOUD_PREFIX", "")

if __name__ == "__main__":
    expected_payoff_matrix_size = 0

    while True:
        logging.basicConfig(level=logging.DEBUG)
        logger.info("\n\n\n\n\n__________________________________________\n"
                    f"LAUNCHED FOR {POKER_GAME_VERSION}\n"
                    f"__________________________________________\n\n\n\n\n")

        storage_client = connect_storage_client()

        size_checker = ConsoleManagerInterface(
            server_host=MANAGER_SEVER_HOST,
            port=MANAGER_PORT,
            worker_id=f"size_checker_{gethostname()}_pid_{os.getpid()}",
            storage_client=storage_client,
            minio_bucket_name=BUCKET_NAME)

        while True:
            current_payoff_matrix_size = size_checker.get_size_of_current_payoff_table(
            )
            if current_payoff_matrix_size < expected_payoff_matrix_size:
                logger.info(
                    f"waiting for payoff matrix to reach size {expected_payoff_matrix_size} (currently {current_payoff_matrix_size})..."
                )
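
The inner loop above is cut off by the example boundary; it normally sleeps and re-checks until the payoff matrix reaches the expected size. A minimal sketch of that completion (the helper name and poll interval are illustrative, not from the repo):

import time

def wait_for_payoff_matrix_size(size_checker, expected_size, poll_seconds=10):
    while True:
        current_size = size_checker.get_size_of_current_payoff_table()
        if current_size >= expected_size:
            return current_size
        print(f"waiting for payoff matrix to reach size {expected_size} "
              f"(currently {current_size})...")
        time.sleep(poll_seconds)
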
Example No. 17
def perform_eval_matchups_as_they_are_available(i):
    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.DEBUG)

    # if os.getenv("EVALUATOR_USE_GPU") == 'true':
    #     os.environ['CUDA_VISIBLE_DEVICES'] = str(i % len(''.join(i for i in os.environ['CUDA_VISIBLE_DEVICES'] if i.isdigit())))

    storage_client = connect_storage_client()

    worker_id = f"evaluator_{gethostname()}_pid_{os.getpid()}_{datetime_str()}"

    manager_interface = EvaluatorManagerInterface(
        server_host=MANAGER_SERVER_HOST,
        port=MANAGER_PORT,
        worker_id=worker_id,
        storage_client=storage_client,
        minio_bucket_name=BUCKET_NAME,
        minio_local_dir=DEFAULT_LOCAL_SAVE_PATH)

    logger.info(f"Started worker \'{worker_id}\'")

    env = ENV_CLASS(env_config=POKER_ENV_CONFIG)

    while True:
        matchup = manager_interface.get_eval_matchup(
            infinite_retry_on_error=True)
        if matchup is None:
            # no matchups available right now, wait a bit and try again
            time.sleep(WAIT_SECONDS_BEFORE_TRYING_AGAIN_IF_NO_MATCHUPS)
            continue

        logger.info(
            f"[{worker_id}] Evaluating Matchup:\n{pretty_print(matchup)}")

        as_policy: PolicySpec = matchup['as_policy']
        against_policy: PolicySpec = matchup['against_policy']
        num_games_to_play = matchup['num_games']

        get_as_policy_fn = make_get_policy_fn(
            model_weights_object_key=as_policy.key,
            model_config_object_key=as_policy.config_key,
            policy_name=as_policy.key,
            policy_class_name=as_policy.class_name,
            storage_client=storage_client,
            minio_bucket_name=BUCKET_NAME,
            download_lock=download_lock,
            manual_config=None)

        get_against_policy_fn = make_get_policy_fn(
            model_weights_object_key=against_policy.key,
            model_config_object_key=against_policy.config_key,
            policy_name=against_policy.key,
            policy_class_name=against_policy.class_name,
            storage_client=storage_client,
            minio_bucket_name=BUCKET_NAME,
            download_lock=download_lock,
            manual_config=None)

        as_policy_payoff, tie_percentage = eval_policy_matchup(
            get_policy_fn_a=get_as_policy_fn,
            get_policy_fn_b=get_against_policy_fn,
            env=env,
            stratego_env_config=POKER_ENV_CONFIG,
            games_per_matchup=num_games_to_play)

        logger.info(
            f"\n\nFinal Result for {as_policy.key}\nvs\n{against_policy.key}\n{as_policy_payoff}\n\n"
        )

        try:
            manager_interface.submit_eval_matchup_result(
                as_policy_key=as_policy.key,
                against_policy_key=against_policy.key,
                as_policy_avg_payoff=as_policy_payoff,
                games_played=num_games_to_play,
                infinite_retry_on_error=True)
        except FalseConfirmationError as err:
            logger.warning(
                f"[{worker_id}] Got False confirmation from manager:\n{err}")
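
A hedged sketch of the aggregation an eval_policy_matchup-style helper reports (the real function actually rolls the two policies out in the env; here per-game outcomes are passed in directly): the "as" policy's average payoff plus the tie rate.

def aggregate_matchup_results(as_policy_payoffs_per_game):
    games_played = len(as_policy_payoffs_per_game)
    as_policy_avg_payoff = sum(as_policy_payoffs_per_game) / games_played
    tie_percentage = sum(1 for p in as_policy_payoffs_per_game if p == 0) / games_played
    return as_policy_avg_payoff, tie_percentage

# e.g. 3 wins (+1), 1 loss (-1), and 1 tie (0) over 5 games:
print(aggregate_matchup_results([1, 1, 1, -1, 0]))  # (0.4, 0.2)
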
Example No. 18
        def stop_and_submit_if_not_improving_on_train_result_callback(params):
            trainer = params['trainer']
            result = params['result']
            result['stop_signal'] = False

            should_submit = False
            submit_reason = None

            if not hasattr(trainer, 'previous_threshold_check_reward'):
                trainer.previous_threshold_check_reward = -100.0
                trainer.next_threshold_check_timesteps = SUBMISSION_IMPROVEMENT_THRESHOLD_PER_TIMESTEPS + SUBMISSION_THRESHOLD_STEPS_START
                print(
                    f"first threshold check at {trainer.next_threshold_check_timesteps} timesteps"
                )

            if result['timesteps_total'] >= SUBMISSION_THRESHOLD_STEPS_START and \
                    SUBMISSION_IMPROVEMENT_THRESHOLD_PER_TIMESTEPS is not None and \
                    SUBMISSION_IMPROVEMENT_THRESHOLD_REWARD is not None:

                if result[
                        'timesteps_total'] >= trainer.next_threshold_check_timesteps:
                    trainer.next_threshold_check_timesteps = max(
                        trainer.next_threshold_check_timesteps +
                        SUBMISSION_IMPROVEMENT_THRESHOLD_PER_TIMESTEPS,
                        result['timesteps_total'] + 1)

                    target_reward = trainer.previous_threshold_check_reward + SUBMISSION_IMPROVEMENT_THRESHOLD_REWARD
                    result['target_reward'] = target_reward
                    measured_reward = result['policy_reward_mean'][
                        TRAIN_POLICY]
                    print(
                        f"{result['timesteps_total']} timesteps: {TRAIN_POLICY} reward: {measured_reward}, target reward: {target_reward}"
                    )

                    if measured_reward < target_reward and \
                            (SUBMISSION_MIN_TIMESTEPS is None or result['timesteps_total'] >= SUBMISSION_MIN_TIMESTEPS):
                        should_submit = True
                        submit_reason = f"plateaued at {measured_reward} reward"
                        print(
                            f"{result['timesteps_total']} timesteps: {TRAIN_POLICY} didn\'t reach target reward. Submitting policy."
                        )
                    else:
                        print(
                            f"next threshold check at {trainer.next_threshold_check_timesteps} timesteps"
                        )

                    trainer.previous_threshold_check_reward = measured_reward

            if SUBMISSION_MAX_TIMESTEPS is not None and result[
                    'timesteps_total'] >= SUBMISSION_MAX_TIMESTEPS:
                should_submit = True
                submit_reason = f"hit max timesteps of {SUBMISSION_MAX_TIMESTEPS}"
                print(f"Trainer hit max timesteps. Submitting policy.")

            if should_submit:
                assert submit_reason is not None
                result['stop_signal'] = True
                local_train_policy = trainer.workers.local_worker(
                ).policy_map[TRAIN_POLICY]

                tags = [
                    *SUBMISSION_POLICY_TAGS, submit_reason,
                    f"timesteps: {result['timesteps_total']}",
                    f"episodes: {result['episodes_total']}"
                ]
                if hasattr(local_train_policy, "init_tag"):
                    tags.append(local_train_policy.init_tag)

                checkpoints_dir = os.path.join(experiment_save_dir,
                                               "policy_submissions")
                checkpoint_name = f"{datetime_str()}_iter_{result['training_iteration']}.dill"
                checkpoint_save_path = os.path.join(checkpoints_dir,
                                                    checkpoint_name)
                local_train_policy.save_model_weights(
                    save_file_path=checkpoint_save_path,
                    remove_scope_prefix=TRAIN_POLICY)
                policy_key = os.path.join(base_experiment_name,
                                          full_experiment_name,
                                          "policy_submissions",
                                          checkpoint_name)
                storage_client = connect_storage_client()
                upload_file(storage_client=storage_client,
                            bucket_name=BUCKET_NAME,
                            object_key=policy_key,
                            local_source_path=checkpoint_save_path)
                trainer.manager_interface.submit_new_policy_for_population(
                    policy_weights_key=policy_key,
                    policy_config_key=TRAIN_POLICY_MODEL_CONFIG_KEY,
                    policy_class_name=TRAIN_POLICY_CLASS.__name__,
                    policy_tags=tags)
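
The submission condition above, restated as a pure function so the plateau test is easy to read in isolation (a hedged restatement, not the repo's code): submit when a due threshold check finds the reward has not improved by at least the configured margin since the previous check, provided any minimum-timesteps gate has been passed.

def should_submit_for_plateau(timesteps_total,
                              measured_reward,
                              previous_check_reward,
                              improvement_threshold_reward,
                              min_timesteps=None):
    target_reward = previous_check_reward + improvement_threshold_reward
    plateaued = measured_reward < target_reward
    past_min_timesteps = min_timesteps is None or timesteps_total >= min_timesteps
    return plateaued and past_min_timesteps

print(should_submit_for_plateau(timesteps_total=600_000, measured_reward=0.05,
                                previous_check_reward=0.04,
                                improvement_threshold_reward=0.03,
                                min_timesteps=500_000))  # True: improved by only 0.01
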
Example No. 19
        def submit_ocassionaly_on_train_result_callback(params):
            trainer = params['trainer']
            result = params['result']

            should_submit = False
            submit_reason = None

            if not hasattr(trainer, 'next_submit'):
                trainer.next_submit = SUBMISSION_IMPROVEMENT_THRESHOLD_PER_STEPS + SUBMISSION_THRESHOLD_STEPS_START

            if result['timesteps_total'] >= trainer.next_submit:
                trainer.next_submit = max(
                    trainer.next_submit +
                    SUBMISSION_IMPROVEMENT_THRESHOLD_PER_STEPS +
                    SUBMISSION_THRESHOLD_STEPS_START,
                    result['timesteps_total'] + 1)

                if SUBMISSION_MIN_STEPS is None or result[
                        'timesteps_total'] >= SUBMISSION_MIN_STEPS:
                    should_submit = True
                    submit_reason = f"periodic_checkpoint"
                    print(
                        colored(
                            f"{result['timesteps_total']} steps: {TRAIN_POLICY} didn\'t reach target reward. Submitting policy.",
                            "white"))
                else:
                    print(
                        colored(f"next submit at {trainer.next_submit} steps",
                                "white"))

            if should_submit:
                assert submit_reason is not None
                local_train_policy = trainer.workers.local_worker(
                ).policy_map[TRAIN_POLICY]

                tags = [
                    *SUBMISSION_POLICY_TAGS, submit_reason,
                    f"timesteps: {result['timesteps_total']}",
                    f"episodes: {result['episodes_total']}",
                    f"iter: {result['training_iteration']}"
                ]
                if hasattr(local_train_policy, "init_tag"):
                    tags.append(local_train_policy.init_tag)

                checkpoints_dir = os.path.join(experiment_save_dir,
                                               "policy_submissions")
                checkpoint_name = f"{datetime_str()}_iter_{result['training_iteration']}.dill"
                checkpoint_save_path = os.path.join(checkpoints_dir,
                                                    checkpoint_name)
                local_train_policy.save_model_weights(
                    save_file_path=checkpoint_save_path,
                    remove_scope_prefix=TRAIN_POLICY)
                policy_key = os.path.join(base_experiment_name,
                                          full_experiment_name,
                                          "policy_submissions",
                                          checkpoint_name)
                storage_client = connect_storage_client()
                upload_file(storage_client=storage_client,
                            bucket_name=BUCKET_NAME,
                            object_key=policy_key,
                            local_source_path=checkpoint_save_path)
                trainer.manager_interface.submit_new_policy_for_population(
                    policy_weights_key=policy_key,
                    policy_config_key=TRAIN_POLICY_MODEL_CONFIG_KEY,
                    policy_class_name=TRAIN_POLICY_CLASS.__name__,
                    policy_tags=tags)
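
The scheduling rule above in isolation (hedged restatement): after a submission triggers, the next submission point advances by the configured step budget, but is always pushed past the current timestep count so one long training iteration cannot trigger several submissions in a row.

def next_submit_timesteps(current_next_submit, timesteps_total, submit_every_n_timesteps):
    return max(current_next_submit + submit_every_n_timesteps, timesteps_total + 1)

# e.g. submissions every 1,000,000 timesteps, next one scheduled at 2,000,000,
# but training has already reached 3,500,000 when the train-result callback fires:
print(next_submit_timesteps(2_000_000, 3_500_000, 1_000_000))  # 3500001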