def init_static_policy_distribution_after_trainer_init_callback(trainer):
    trainer.storage_client = connect_storage_client()

    logger.info("Initializing trainer manager interface")
    trainer.manager_interface = LearnerManagerInterface(server_host=MANAGER_SEVER_HOST,
                                                        port=MANAGER_PORT,
                                                        worker_id=full_experiment_name,
                                                        storage_client=trainer.storage_client,
                                                        minio_bucket_name=BUCKET_NAME)

    selection_probs, payoff_table, payoff_table_key = get_fp_metanash_for_latest_payoff_table(
        manager_interface=trainer.manager_interface,
        fp_iters=METANASH_FICTITIOUS_PLAY_ITERS,
        accepted_opponent_policy_class_names=ACCEPTED_OPPONENT_POLICY_CLASS_NAMES,
        accepted_opponent_model_config_keys=ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS,
        add_payoff_matrix_noise_std_dev=0.0,
        mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF)

    if selection_probs is None:
        assert payoff_table is None
        assert payoff_table_key is None
        print("Payoff table is empty so using random weights for static policy.")
    else:
        print(f"Payoff table loaded from {payoff_table_key}")
        print(f"Policy selection probs: {selection_probs}")

    payoff_table_dill_str = dill.dumps(payoff_table)

    def worker_set_static_policy_distribution(worker):
        worker.policy_map[STATIC_POLICY].static_policy_selection_probs = selection_probs
        worker.policy_map[STATIC_POLICY].payoff_table = dill.loads(payoff_table_dill_str)
        worker.policy_map[STATIC_POLICY].current_policy_key = None

    trainer.workers.foreach_worker(worker_set_static_policy_distribution)

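# The helper get_fp_metanash_for_latest_payoff_table used above is not defined in this
# listing. As a rough, self-contained illustration only (an assumption about its behavior,
# not the project's actual implementation), the core of such a metanash computation is
# fictitious play over the empirical payoff matrix:
import numpy as np

def _fictitious_play_metanash_sketch(payoff_matrix: np.ndarray, fp_iters: int = 2000) -> np.ndarray:
    """Approximate Nash selection probs for a symmetric zero-sum metagame.

    payoff_matrix[i, j] is assumed to be the average payoff of policy i vs policy j.
    """
    num_policies = payoff_matrix.shape[0]
    br_counts = np.ones(num_policies)  # start from a uniform empirical strategy
    for _ in range(fp_iters):
        avg_strategy = br_counts / br_counts.sum()
        # best response to the opponent's empirical average strategy
        best_response = int(np.argmax(payoff_matrix @ avg_strategy))
        br_counts[best_response] += 1
    return br_counts / br_counts.sum()

# Example: on rock-paper-scissors the result approaches the uniform equilibrium.
# _fictitious_play_metanash_sketch(np.array([[0., -1., 1.], [1., 0., -1.], [-1., 1., 0.]]))
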
def init_static_policy_distribution_after_trainer_init_callback(trainer):
    trainer.storage_client = connect_storage_client()

    logger.info("Initializing trainer manager interface")
    trainer.manager_interface = LearnerManagerInterface(server_host=MANAGER_SEVER_HOST,
                                                        port=MANAGER_PORT,
                                                        worker_id=full_experiment_name,
                                                        storage_client=trainer.storage_client,
                                                        minio_bucket_name=BUCKET_NAME)

    payoff_table, payoff_table_key = trainer.manager_interface.get_latest_payoff_table()

    # NOTE: selection_probs is assumed to come from the enclosing module scope
    # (computed once at launch time from the latest payoff table, as in the
    # launch-script fragment at the end of this listing).
    if selection_probs is None:
        assert payoff_table is None
        assert payoff_table_key is None
        print("Payoff table is empty so using random weights for static policy.")
    else:
        print(f"Payoff table loaded from {payoff_table_key}")
        print(f"Policy selection probs: {selection_probs}")

    payoff_table_dill_str = dill.dumps(payoff_table)

    def worker_set_static_policy_distribution(worker):
        worker.policy_map[STATIC_POLICY].static_policy_selection_probs = selection_probs
        worker.policy_map[STATIC_POLICY].payoff_table = dill.loads(payoff_table_dill_str)
        worker.policy_map[STATIC_POLICY].current_policy_key = None

    trainer.workers.foreach_worker(worker_set_static_policy_distribution)

def claim_new_active_policy_after_trainer_init_callback(trainer):
    def set_train_policy_warmup_target_entropy_proportion(worker):
        worker.policy_map[TRAIN_POLICY].set_target_entropy_proportion(
            PIPELINE_WARMUP_ENTROPY_TARGET_PROPORTION)

    trainer.workers.foreach_worker(set_train_policy_warmup_target_entropy_proportion)

    trainer.storage_client = connect_storage_client()

    logger.info("Initializing trainer manager interface")
    trainer.manager_interface = LearnerManagerInterface(server_host=MANAGER_SERVER_HOST,
                                                        port=MANAGER_PORT,
                                                        worker_id=full_experiment_name,
                                                        storage_client=trainer.storage_client,
                                                        minio_bucket_name=BUCKET_NAME)

    trainer.live_table_tracker = LivePolicyPayoffTracker.remote(
        minio_endpoint=MINIO_ENDPOINT,
        minio_access_key=MINIO_ACCESS_KEY,
        minio_secret_key=MINIO_SECRET_KEY,
        minio_bucket=BUCKET_NAME,
        manager_host=MANAGER_SERVER_HOST,
        manager_port=MANAGER_PORT,
        lock_server_host=LOCK_SERVER_HOST,
        lock_server_port=LOCK_SERVER_PORT,
        worker_id=full_experiment_name,
        policy_class_name=TRAIN_POLICY_CLASS.__name__,
        policy_config_key=TRAIN_POLICY_MODEL_CONFIG_KEY,
        provide_payoff_barrier_sync=not PIPELINE_LIVE_PAYOFF_TABLE_CALC_IS_ASYNCHRONOUS)
    trainer.claimed_policy_num = ray_get_and_free(
        trainer.live_table_tracker.get_claimed_policy_num.remote())
    trainer.are_all_lower_policies_finished = False
    trainer.payoff_table_needs_update_started = False
    trainer.payoff_table = None

    _do_live_policy_checkpoint(trainer=trainer, training_iteration=0)

    if not PIPELINE_LIVE_PAYOFF_TABLE_CALC_IS_ASYNCHRONOUS:
        # wait for all other learners to also reach this point before continuing
        ray_get_and_free(trainer.live_table_tracker.wait_at_barrier_for_other_learners.remote())

    trainer.new_payoff_table_promise = trainer.live_table_tracker.get_live_payoff_table_dill_pickled.remote(
        first_wait_for_n_seconds=2)
    _process_new_live_payoff_table_result_if_ready(trainer=trainer, block_until_result_is_ready=True)

    if INIT_FROM_POPULATION:
        init_train_policy_weights_from_static_policy_distribution_after_trainer_init_callback(trainer=trainer)
    else:
        print(colored(f"Policy {trainer.claimed_policy_num}: (Initializing train policy to random)", "white"))

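# A hedged sketch (not necessarily how this project wires things up) of attaching an
# "after trainer init" callback like the one above. In the RLlib 0.8.x line, trainer
# classes produced by build_trainer expose with_updates(after_init=...), which calls the
# given function once the trainer and its rollout workers have been constructed.
# SACTrainer is only a placeholder base class here, and the name is arbitrary.
from ray.rllib.agents.sac import SACTrainer

CustomSACTrainer = SACTrainer.with_updates(
    name="PipelinePSROSAC",
    after_init=claim_new_active_policy_after_trainer_init_callback)
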
def init_static_policy_distribution_after_trainer_init_callback(trainer):
    trainer.storage_client = connect_storage_client()

    logger.info("Initializing trainer manager interface")
    trainer.manager_interface = LearnerManagerInterface(server_host=MANAGER_SERVER_HOST,
                                                        port=MANAGER_PORT,
                                                        worker_id=full_experiment_name,
                                                        storage_client=trainer.storage_client,
                                                        minio_bucket_name=BUCKET_NAME)

    trainer.lock_server_interface = LockServerInterface(
        server_host=LOCK_SERVER_HOST,
        port=LOCK_SERVER_PORT,
        worker_id=f"rectified_psro_learner_{gethostname()}_pid_{os.getpid()}")

    payoff_table, payoff_table_key = trainer.manager_interface.get_latest_payoff_table(
        infinite_retry_on_error=True)

    if payoff_table is None:
        assert job_init_policy_key == 'random'
        assert payoff_table_key is None
        selection_probs = None
        print(colored(f"Payoff table is empty so using random weights for static policy.", "white"))
    else:
        assert job_init_policy_key != 'random'

        policies_str = ""
        for policy_key in payoff_table.get_ordered_keys_in_payoff_matrix():
            policies_str += f"{policy_key}"
        print(colored(f"Payoff Table Policies: {colored(policies_str, 'white')}\n", "white"))

        selection_probs = get_rectified_selection_probs_for_policy_key(
            payoff_table=payoff_table,
            policy_key=job_init_policy_key,
            fp_iters=METANASH_FICTITIOUS_PLAY_ITERS)
        print(colored(f"Rectified Policy selection probs: {selection_probs}", "white"))

    if selection_probs is None:
        assert payoff_table is None
        assert payoff_table_key is None
        print("Payoff table is empty so using random weights for static policy.")
    else:
        print(f"Payoff table loaded from {payoff_table_key}")
        print(f"Policy selection probs: {selection_probs}")

    payoff_table_dill_str = dill.dumps(payoff_table)

    def worker_set_static_policy_distribution(worker):
        worker.policy_map[STATIC_POLICY].static_policy_selection_probs = selection_probs
        worker.policy_map[STATIC_POLICY].payoff_table = dill.loads(payoff_table_dill_str)
        worker.policy_map[STATIC_POLICY].current_policy_key = None

    trainer.workers.foreach_worker(worker_set_static_policy_distribution)

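# get_rectified_selection_probs_for_policy_key is used above but not defined in this
# listing. The following is a hedged sketch of the rectified-PSRO rule it presumably
# follows (in the spirit of Balduzzi et al. 2019), not the project's actual
# implementation: a policy trains only against opponents it beats or ties, weighted by
# their metanash probabilities.
import numpy as np

def _rectified_selection_probs_sketch(payoff_matrix, metanash_probs, policy_index):
    payoff_row = np.asarray(payoff_matrix)[policy_index]
    metanash_probs = np.asarray(metanash_probs)
    # keep probability mass only on opponents that policy_index beats or ties
    rectified = np.where(payoff_row >= 0.0, metanash_probs, 0.0)
    if rectified.sum() == 0.0:
        # fallback assumption: if the policy beats or ties no one, keep the original probs
        return metanash_probs
    return rectified / rectified.sum()
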
# Variant that reserves a policy key through the lock server and recomputes the
# fictitious-play metanash with that reserved policy left out.
def init_static_policy_distribution_after_trainer_init_callback(trainer):
    trainer.storage_client = connect_storage_client()

    logger.info("Initializing trainer manager interface")
    trainer.manager_interface = LearnerManagerInterface(server_host=MANAGER_SEVER_HOST,
                                                        port=MANAGER_PORT,
                                                        worker_id=full_experiment_name,
                                                        storage_client=trainer.storage_client,
                                                        minio_bucket_name=BUCKET_NAME)

    logger.info("Initializing trainer lock server interface")
    trainer.lock_server_interface = LockServerInterface(server_host=LOCK_SERVER_HOST,
                                                        port=LOCK_SERVER_PORT,
                                                        worker_id=full_experiment_name)

    orig_selection_probs, payoff_table, payoff_table_key = get_fp_metanash_for_latest_payoff_table(
        manager_interface=trainer.manager_interface,
        fp_iters=METANASH_FICTITIOUS_PLAY_ITERS,
        accepted_opponent_policy_class_names=ACCEPTED_OPPONENT_POLICY_CLASS_NAMES,
        accepted_opponent_model_config_keys=ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS,
        add_payoff_matrix_noise_std_dev=0.0,
        mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF)

    if orig_selection_probs is None:
        assert payoff_table is None
        assert payoff_table_key is None
        selection_probs = None
        print("Payoff table is empty so using random weights for static policy.")
    else:
        print(f"Payoff table loaded from {payoff_table_key}")
        print(f"Original Selection Probs: {orig_selection_probs}")

        policy_key_to_leave_out = get_unreserved_policy_key_with_priorities(
            lock_server_interface=trainer.lock_server_interface,
            policy_keys=payoff_table.get_ordered_keys_in_payoff_matrix(),
            policy_priorities=orig_selection_probs)

        if policy_key_to_leave_out is None:
            selection_probs = orig_selection_probs
            print("No policy keys available to reserve so using unaltered selection probs")
        else:
            chosen_policy_selection_prob = orig_selection_probs[
                payoff_table.get_policy_spec_for_key(policy_key_to_leave_out).get_payoff_matrix_index()]
            print(f"\n\nLeaving out {policy_key_to_leave_out}\n"
                  f"(Had selection prob of {chosen_policy_selection_prob})\n\n")
            selection_probs = get_fp_metanash_for_payoff_table(
                payoff_table=payoff_table,
                fp_iters=METANASH_FICTITIOUS_PLAY_ITERS,
                accepted_opponent_policy_class_names=ACCEPTED_OPPONENT_POLICY_CLASS_NAMES,
                accepted_opponent_model_config_keys=ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS,
                add_payoff_matrix_noise_std_dev=0.0,
                leave_out_indexes=[
                    payoff_table.get_policy_spec_for_key(policy_key_to_leave_out).get_payoff_matrix_index()],
                mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF)
            print(f"Subset Selection Probs: {selection_probs}")

    if selection_probs is None:
        assert payoff_table is None
        assert payoff_table_key is None
        print("Payoff table is empty so using random weights for static policy.")
    else:
        print(f"Payoff table loaded from {payoff_table_key}")
        print(f"Policy selection probs: {selection_probs}")

    payoff_table_dill_str = dill.dumps(payoff_table)

    def worker_set_static_policy_distribution(worker):
        worker.policy_map[STATIC_POLICY].static_policy_selection_probs = selection_probs
        worker.policy_map[STATIC_POLICY].payoff_table = dill.loads(payoff_table_dill_str)
        worker.policy_map[STATIC_POLICY].current_policy_key = None

    trainer.workers.foreach_worker(worker_set_static_policy_distribution)

f"LAUNCHED FOR {POKER_GAME_VERSION}\n" f"__________________________________________\n\n\n\n\n") storage_client = connect_storage_client() ray.init(address=os.getenv('RAY_HEAD_NODE'), ignore_reinit_error=True) logger.info("Ray Web UI at {}".format(ray.get_webui_url())) base_experiment_name = f"{CLOUD_PREFIX}learner_{POKER_GAME_VERSION}_sac_arch1_hparam_search_multexp" full_experiment_name = f"{base_experiment_name}_{gethostname()}_pid_{os.getpid()}_{datetime_str()}" experiment_save_dir = os.path.join(DEFAULT_RESULTS_DIR, full_experiment_name) manager_interface = LearnerManagerInterface(server_host=MANAGER_SEVER_HOST, port=MANAGER_PORT, worker_id=full_experiment_name, storage_client=storage_client, minio_bucket_name=BUCKET_NAME) selection_probs, _, _ = get_fp_metanash_for_latest_payoff_table( manager_interface=manager_interface, fp_iters=METANASH_FICTITIOUS_PLAY_ITERS, accepted_opponent_policy_class_names= ACCEPTED_OPPONENT_POLICY_CLASS_NAMES, accepted_opponent_model_config_keys=ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS, add_payoff_matrix_noise_std_dev=0.0, mix_with_uniform_dist_coeff=PSRO_EXPLORATION_COEFF) def init_static_policy_distribution_after_trainer_init_callback(trainer): trainer.storage_client = connect_storage_client()