CLOUD_PREFIX = os.getenv("CLOUD_PREFIX", "")

if __name__ == "__main__":
    expected_payoff_matrix_size = 0
    while True:
        logging.basicConfig(level=logging.DEBUG)
        logger.info("\n\n\n\n\n__________________________________________\n"
                    f"LAUNCHED FOR {POKER_GAME_VERSION}\n"
                    f"__________________________________________\n\n\n\n\n")

        storage_client = connect_storage_client()
        size_checker = ConsoleManagerInterface(
            server_host=MANAGER_SEVER_HOST,
            port=MANAGER_PORT,
            worker_id=f"size_checker_{gethostname()}_pid_{os.getpid()}",
            storage_client=storage_client,
            minio_bucket_name=BUCKET_NAME)

        # Block until the manager's payoff table grows to the size expected
        # for this generation.
        while True:
            current_payoff_matrix_size = size_checker.get_size_of_current_payoff_table()
            if current_payoff_matrix_size < expected_payoff_matrix_size:
                logger.info(f"waiting for payoff matrix to reach size "
                            f"{expected_payoff_matrix_size} "
                            f"(currently {current_payoff_matrix_size})...")
                time.sleep(5)
            # elif current_payoff_matrix_size > expected_payoff_matrix_size:
            #     raise ValueError(f"payoff matrix is now larger than expected "
            #                      f"(expected {expected_payoff_matrix_size}, "
            #                      f"currently {current_payoff_matrix_size})")
            else:
                break
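# The size gate above could be factored into a reusable helper. A minimal
# sketch, assuming only the get_size_of_current_payoff_table() call and the
# 5-second poll shown above; the helper name is hypothetical, not part of
# the project.
def wait_for_payoff_table_size(size_checker, expected_size, poll_interval_s=5.0):
    # Block until the manager's payoff table reaches expected_size, then
    # return the size actually observed.
    while True:
        current_size = size_checker.get_size_of_current_payoff_table()
        if current_size >= expected_size:
            return current_size
        time.sleep(poll_interval_s)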
CLOUD_PREFIX = os.getenv("CLOUD_PREFIX", "")

if __name__ == "__main__":
    new_learner_wait_for_key_in_payoff_table = None
    while True:
        logging.basicConfig(level=logging.DEBUG)
        logger.info("\n\n\n\n\n__________________________________________\n"
                    f"LAUNCHED FOR {POKER_GAME_VERSION}\n"
                    f"__________________________________________\n\n\n\n\n")

        storage_client = connect_storage_client()
        size_checker = ConsoleManagerInterface(
            server_host=MANAGER_SEVER_HOST,
            port=MANAGER_PORT,
            worker_id=f"size_checker_{gethostname()}_pid_{os.getpid()}",
            storage_client=storage_client,
            minio_bucket_name=BUCKET_NAME)

        # Block until the payoff table includes the policy key published by
        # the previous learner (None on the first generation, so skip the wait).
        while True:
            if new_learner_wait_for_key_in_payoff_table is not None:
                if not size_checker.is_policy_key_in_current_payoff_matrix(
                        policy_key=new_learner_wait_for_key_in_payoff_table):
                    logger.info(f"waiting for payoff matrix to include the policy key "
                                f"{new_learner_wait_for_key_in_payoff_table}")
                    time.sleep(5)
                else:
                    break
            else:
                break
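# The key gate above follows the same polling pattern. A minimal sketch,
# assuming only the is_policy_key_in_current_payoff_matrix() call shown
# above; the helper name is hypothetical.
def wait_for_policy_key_in_payoff_table(size_checker, policy_key, poll_interval_s=5.0):
    # Block until policy_key appears in the manager's payoff matrix.
    # A None policy_key (first generation) returns immediately.
    if policy_key is None:
        return
    while not size_checker.is_policy_key_in_current_payoff_matrix(policy_key=policy_key):
        time.sleep(poll_interval_s)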
class LivePolicyPayoffTracker(object):
    # Ray actor that claims a policy number for a learner and maintains a
    # live view of the population payoff table via the manager and lock
    # server. Relies on project utilities (connect_storage_client,
    # ConsoleManagerInterface, LockServerInterface, PayoffTable,
    # _check_consecutive_numbers, _check_only_latest_policies_are_active)
    # plus ray, time, and termcolor.colored.

    def __init__(self, minio_bucket, manager_host, manager_port,
                 lock_server_host, lock_server_port, worker_id,
                 policy_class_name, policy_config_key,
                 provide_payoff_barrier_sync=False):
        worker_id = f"live_pop_tracker_{worker_id[worker_id.find('pid'):]}"
        self._storage_client = connect_storage_client()
        self._minio_bucket = minio_bucket
        self._manager_interface = ConsoleManagerInterface(
            server_host=manager_host,
            port=manager_port,
            worker_id=worker_id,
            storage_client=self._storage_client,
            minio_bucket_name=self._minio_bucket)
        self._lock_interface = LockServerInterface(
            server_host=lock_server_host,
            port=lock_server_port,
            worker_id=worker_id)
        self._policy_class_name = policy_class_name
        self._policy_config_key = policy_config_key

        self._claimed_policy_num = None
        self._claim_new_active_policy()
        assert self._claimed_policy_num is not None

        self._locally_cached_matchup_results = {}

        self._provide_payoff_barrier_sync = provide_payoff_barrier_sync
        if self._provide_payoff_barrier_sync:
            (self._wait_at_payoff_table_barrier_fn,
             self._leave_barrier_group_fn) = self._lock_interface.join_barrier_group(
                 barrier_name="pt_barrier",
                 member_name=str(self._claimed_policy_num),
                 grace_period_for_others_to_join_s=20.0)
        else:
            self._wait_at_payoff_table_barrier_fn = None
            self._leave_barrier_group_fn = None

    @ray.method(num_return_vals=1)
    def wait_at_barrier_for_other_learners(self):
        assert self._provide_payoff_barrier_sync
        self._wait_at_payoff_table_barrier_fn()
        return True

    @ray.method(num_return_vals=1)
    def set_latest_key_for_claimed_policy(self, new_key,
                                          request_locks_checkpoint_with_name=None):
        # Publish (or replace) the lock-server item that advertises this
        # policy's newest weights key.
        assert self._claimed_policy_num is not None
        prefix = f"policy {self._claimed_policy_num} latest key: "
        new_lock = prefix + new_key
        policy_key_locks = self._lock_interface.get_all_items(filter_by_string=prefix)
        if len(policy_key_locks) > 0:
            assert len(policy_key_locks) == 1
            old_lock = policy_key_locks[0]
            assert self._lock_interface.replace_item(
                old_item=old_lock,
                new_item=new_lock,
                new_item_remains_after_disconnect=True,
                request_locks_checkpoint_with_name=request_locks_checkpoint_with_name)
            print(colored(
                f"Policy {self._claimed_policy_num}: Set new latest key for claimed policy "
                f"(replaced old one): \"{new_lock}\"", "green"))
        else:
            assert self._lock_interface.try_to_reserve_item(
                item_name=new_lock,
                remain_after_worker_disconnect=True,
                request_locks_checkpoint_with_name=request_locks_checkpoint_with_name)
            print(colored(
                f"Policy {self._claimed_policy_num}: Set new latest key for claimed policy: "
                f"\"{new_lock}\"", "green"))
        return True

    @ray.method(num_return_vals=1)
    def set_claimed_policy_as_finished(self):
        old_lock = f"policy_status: {self._claimed_policy_num} active"
        new_lock = f"policy_status: {self._claimed_policy_num} finished"
        assert self._lock_interface.replace_item(
            old_item=old_lock,
            new_item=new_lock,
            new_item_remains_after_disconnect=True)
        print(colored(
            f"Policy {self._claimed_policy_num}: Set claimed policy as finished: "
            f"\"{new_lock}\"", "green"))
        if self._leave_barrier_group_fn is not None:
            self._leave_barrier_group_fn()
        return True

    @ray.method(num_return_vals=2)
    def get_live_payoff_table_dill_pickled(self, first_wait_for_n_seconds=None):
        if first_wait_for_n_seconds is not None:
            time.sleep(first_wait_for_n_seconds)
        base_payoff_table, _ = self._manager_interface.get_latest_payoff_table(
            infinite_retry_on_error=False)
        if base_payoff_table is None:
            base_payoff_table = PayoffTable()
        base_payoff_table: PayoffTable = base_payoff_table

        (active_policy_numbers, finished_policy_numbers,
         total_policy_numbers) = self._get_active_and_finished_policy_numbers()
        assert len(active_policy_numbers) + len(finished_policy_numbers) == total_policy_numbers
        are_all_lower_policies_finished = len(active_policy_numbers) == 0
        print(colored(
            f"Policy {self._claimed_policy_num}: Latest live stats for policies below this "
            f"learner: {len(finished_policy_numbers)} policies finished, "
            f"{len(active_policy_numbers)} active.", "cyan"))

        if total_policy_numbers == 0:
            return None, are_all_lower_policies_finished

        assert (base_payoff_table.size() <= len(finished_policy_numbers)
                or base_payoff_table.size() == 1)

        # Add policies the manager's table doesn't include yet, using the
        # latest weight keys each learner published on the lock server.
        missing_policy_nums = list(range(base_payoff_table.size(), total_policy_numbers))
        for missing_policy_num in missing_policy_nums:
            missing_key = self._get_latest_key_for_policy_number(
                policy_num=missing_policy_num)
            if missing_key is None:
                time.sleep(5)
                missing_key = self._get_latest_key_for_policy_number(
                    policy_num=missing_policy_num)
            if missing_key is not None:
                base_payoff_table.add_policy(
                    new_policy_key=missing_key,
                    new_policy_class_name=self._policy_class_name,
                    new_policy_config_file_key=self._policy_config_key,
                    new_policy_tags=['locally_tracked'])

        # Resolve every eval matchup the table still needs: first from the
        # local cache, then from the manager (requesting a new eval on the
        # first miss, then polling until the result arrives).
        required_evals_observed = set()
        required_evals_finalized = set()
        while True:
            matchup_order = base_payoff_table.get_eval_matchup_order()
            if matchup_order is None:
                break
            if matchup_order not in required_evals_finalized:
                as_policy_key, against_policy_key = matchup_order
                payoff, games_played = self._check_eval_cache(
                    as_policy_key=as_policy_key,
                    against_policy_key=against_policy_key)
                if payoff is None:
                    payoff, games_played = self._manager_interface.request_eval_result(
                        as_policy_key=as_policy_key,
                        as_policy_config_key=self._policy_config_key,
                        as_policy_class_name=self._policy_class_name,
                        against_policy_key=against_policy_key,
                        against_policy_config_key=self._policy_config_key,
                        against_policy_class_name=self._policy_class_name,
                        perform_eval_if_not_cached=matchup_order not in required_evals_observed,
                        infinite_retry_on_error=False)
                    if payoff is not None and matchup_order not in required_evals_observed:
                        print(f"{colored(f'Policy {self._claimed_policy_num}: !!!! GOT A CACHE HIT FROM THE MANAGER !!!!', 'yellow')}\n"
                              f"{colored(f'for {as_policy_key} vs {against_policy_key}', 'yellow')}")
                if payoff is None and matchup_order in required_evals_observed:
                    print(colored(
                        f"Policy {self._claimed_policy_num}: Waiting to get eval result "
                        f"for {as_policy_key} vs {against_policy_key}", "yellow"))
                    time.sleep(2)
                if payoff is not None:
                    self._add_to_eval_cache_if_not_already_entered(
                        as_policy_key=as_policy_key,
                        against_policy_key=against_policy_key,
                        payoff=payoff,
                        games_played=games_played)
                    base_payoff_table.add_eval_result(
                        as_policy_key=as_policy_key,
                        against_policy_key=against_policy_key,
                        payoff=payoff,
                        games_played=games_played)
                    required_evals_finalized.add(matchup_order)
                required_evals_observed.add(matchup_order)
        assert len(required_evals_observed) >= len(required_evals_finalized)
        assert base_payoff_table.get_num_pending_policies() == 0, \
            f"amount is {base_payoff_table.get_num_pending_policies()}"
        assert base_payoff_table.size() == total_policy_numbers
        return base_payoff_table.to_dill(), are_all_lower_policies_finished

    ####################################################################################################################

    @ray.method(num_return_vals=1)
    def my_get_live_payoff_table_dill_pickled(self, new_weight_key=None,
                                              first_wait_for_n_seconds=None):
        # Variant of get_live_payoff_table_dill_pickled that also includes
        # this learner's own in-progress policy (identified by new_weight_key)
        # and returns the raw payoff matrix.
        if first_wait_for_n_seconds is not None:
            time.sleep(first_wait_for_n_seconds)
        base_payoff_table, _ = self._manager_interface.get_latest_payoff_table(
            infinite_retry_on_error=False)
        if base_payoff_table is None:
            base_payoff_table = PayoffTable()
        base_payoff_table: PayoffTable = base_payoff_table

        (active_policy_numbers, finished_policy_numbers,
         total_policy_numbers) = self._get_active_and_finished_policy_numbers()
        assert len(active_policy_numbers) + len(finished_policy_numbers) == total_policy_numbers
        are_all_lower_policies_finished = len(active_policy_numbers) == 0
        # print(colored(f"Policy {self._claimed_policy_num}: Latest live stats for policies below this learner: "
        #               f"{len(finished_policy_numbers)} policies finished, {len(active_policy_numbers)} active.", "magenta"))

        # Payoff tables 2x2 and smaller are not useful for diversity computation.
        if base_payoff_table.size() < 3:
            return None

        assert (base_payoff_table.size() <= len(finished_policy_numbers)
                or base_payoff_table.size() == 1)

        # Include one extra slot for this learner's own policy; its key is
        # new_weight_key rather than a lock-server lookup.
        missing_policy_nums = list(range(base_payoff_table.size(), total_policy_numbers + 1))
        for missing_policy_num in missing_policy_nums:
            if missing_policy_num < missing_policy_nums[-1]:
                missing_key = self._get_latest_key_for_policy_number(
                    policy_num=missing_policy_num)
            else:
                missing_key = new_weight_key
                # print(colored(f" My key: {missing_key} ", "magenta"))
            if missing_key is None:
                time.sleep(5)
                missing_key = self._get_latest_key_for_policy_number(
                    policy_num=missing_policy_num)
            if missing_key is not None:
                base_payoff_table.add_policy(
                    new_policy_key=missing_key,
                    new_policy_class_name=self._policy_class_name,
                    new_policy_config_file_key=self._policy_config_key,
                    new_policy_tags=['locally_tracked'])

        required_evals_observed = set()
        required_evals_finalized = set()
        while True:
            matchup_order = base_payoff_table.get_eval_matchup_order()
            if matchup_order is None:
                break
            if matchup_order not in required_evals_finalized:
                as_policy_key, against_policy_key = matchup_order
                payoff, games_played = self._check_eval_cache(
                    as_policy_key=as_policy_key,
                    against_policy_key=against_policy_key)
                if payoff is None:
                    payoff, games_played = self._manager_interface.request_eval_result(
                        as_policy_key=as_policy_key,
                        as_policy_config_key=self._policy_config_key,
                        as_policy_class_name=self._policy_class_name,
                        against_policy_key=against_policy_key,
                        against_policy_config_key=self._policy_config_key,
                        against_policy_class_name=self._policy_class_name,
                        perform_eval_if_not_cached=matchup_order not in required_evals_observed,
                        infinite_retry_on_error=False)
                    if payoff is not None and matchup_order not in required_evals_observed:
                        print(f"{colored(f'Policy {self._claimed_policy_num}: !!!! GOT A CACHE HIT FROM THE MANAGER !!!!', 'yellow')}\n"
                              f"{colored(f'for {as_policy_key} vs {against_policy_key}', 'yellow')}")
                if payoff is None and matchup_order in required_evals_observed:
                    print(colored(
                        f"Policy {self._claimed_policy_num}: Waiting to get eval result "
                        f"for {as_policy_key} vs {against_policy_key}", "yellow"))
                    time.sleep(2)
                if payoff is not None:
                    self._add_to_eval_cache_if_not_already_entered(
                        as_policy_key=as_policy_key,
                        against_policy_key=against_policy_key,
                        payoff=payoff,
                        games_played=games_played)
                    base_payoff_table.add_eval_result(
                        as_policy_key=as_policy_key,
                        against_policy_key=against_policy_key,
                        payoff=payoff,
                        games_played=games_played)
                    required_evals_finalized.add(matchup_order)
                required_evals_observed.add(matchup_order)
        assert len(required_evals_observed) >= len(required_evals_finalized)
        assert base_payoff_table.get_num_pending_policies() == 0, \
            f"amount is {base_payoff_table.get_num_pending_policies()}"
        assert base_payoff_table.size() == total_policy_numbers + 1, \
            f"base_payoff_table.size() is {base_payoff_table.size()}, " \
            f"total_policy_numbers+1={total_policy_numbers + 1}"
        return base_payoff_table.get_payoff_matrix()

    ####################################################################################################################

    @ray.method(num_return_vals=1)
    def are_all_lower_policies_finished(self):
        (active_policy_numbers, finished_policy_numbers,
         total_policy_numbers) = self._get_active_and_finished_policy_numbers()
        assert len(active_policy_numbers) + len(finished_policy_numbers) == total_policy_numbers
        return len(active_policy_numbers) == 0

    @ray.method(num_return_vals=1)
    def get_claimed_policy_num(self):
        return self._claimed_policy_num

    def _get_active_and_finished_policy_numbers(self):
        # Parse "policy_status: <num> <active|finished>" lock items for every
        # policy below this learner's claimed number, retrying for up to a
        # minute if the set of locks is momentarily inconsistent.
        start_time = time.time()
        while True:
            policy_status_locks = self._lock_interface.get_all_items(
                filter_by_string="policy_status: ")
            if len(policy_status_locks) == 0:
                return [], [], 0
            _, all_policy_numbers, all_policy_statuses = map(
                list, zip(*[item.split(" ") for item in policy_status_locks]))
            assert all(stat == "active" or stat == "finished"
                       for stat in all_policy_statuses)
            num_policies_to_consider = (self._claimed_policy_num
                                        if self._claimed_policy_num is not None
                                        else len(all_policy_numbers))
            policy_numbers = [None] * num_policies_to_consider
            policies_active_states = [None] * num_policies_to_consider
            for policy_num, policy_status in zip(all_policy_numbers,
                                                 all_policy_statuses):
                policy_num = int(policy_num)
                if self._claimed_policy_num is None or policy_num < self._claimed_policy_num:
                    policy_numbers[policy_num] = policy_num
                    policies_active_states[policy_num] = (policy_status == "active")
            if not all(p is not None for p in policy_numbers):
                if time.time() - start_time > 60:
                    raise ValueError(colored(
                        f"policy_numbers (some are None): {policy_numbers}", "red"))
                print(colored(
                    f"policy_numbers (some are None), trying again: {policy_numbers}",
                    "red"))
                time.sleep(0.5)
                continue
            assert all(p is not None for p in policies_active_states)
            assert _check_consecutive_numbers(
                int_list=policy_numbers, should_start_at=0
            ), f"policy_numbers is {policy_numbers}, all policy status locks are {policy_status_locks}"
            assert _check_only_latest_policies_are_active(
                policies_active_states=policies_active_states)
            break
        active_policy_numbers = []
        finished_policy_numbers = []
        for i, policy_number in enumerate(policy_numbers):
            if policies_active_states[i]:
                active_policy_numbers.append(policy_number)
            else:
                finished_policy_numbers.append(policy_number)
        total_policy_numbers = len(policy_numbers)
        return active_policy_numbers, finished_policy_numbers, total_policy_numbers

    def _claim_new_active_policy(self):
        # Atomically reserve the lowest unclaimed "policy_status: <n> active"
        # lock item; its number becomes this learner's policy number.
        if self._claimed_policy_num is not None:
            raise ValueError(
                f"This interface has already claimed policy {self._claimed_policy_num}")
        _, _, total_policy_numbers = self._get_active_and_finished_policy_numbers()
        claimed_policy_key = self._lock_interface.try_to_reserve_item_from_list(
            possible_item_names_in_order_of_highest_priority_first=[
                f"policy_status: {i} active"
                for i in range(total_policy_numbers, total_policy_numbers + 100)
            ])
        claimed_policy_num = int(
            claimed_policy_key.replace('policy_status: ', '').replace(' active', ''))
        assert claimed_policy_num is not None
        print(colored(f"Claimed Policy {claimed_policy_num}", "green"))
        self._claimed_policy_num = claimed_policy_num
        return claimed_policy_num

    def _get_latest_key_for_policy_number(self, policy_num):
        prefix = f"policy {policy_num} latest key: "
        policy_key_locks = self._lock_interface.get_all_items(filter_by_string=prefix)
        if len(policy_key_locks) == 0:
            return None
        assert len(policy_key_locks) == 1
        policy_key = policy_key_locks[0][len(prefix):]
        return policy_key

    def _check_eval_cache(self, as_policy_key, against_policy_key):
        # Look up a locally cached matchup result; a hit on the reversed
        # matchup is returned with the payoff negated (zero-sum assumption).
        payoff, games_played = None, None
        try:
            payoff, games_played = self._locally_cached_matchup_results[
                as_policy_key][against_policy_key]
            print(colored(
                f"Eval Cache Hit for \"{as_policy_key}\" vs \"{against_policy_key}\"",
                "green"))
        except KeyError:
            try:
                payoff, games_played = self._locally_cached_matchup_results[
                    against_policy_key][as_policy_key]
                payoff = -payoff
                print(colored(
                    f"Eval Cache Hit for \"{against_policy_key}\" vs \"{as_policy_key}\"",
                    "green"))
            except KeyError:
                pass
        return payoff, games_played

    def _add_to_eval_cache_if_not_already_entered(self, as_policy_key,
                                                  against_policy_key, payoff,
                                                  games_played):
        old_payoff, _ = self._check_eval_cache(
            as_policy_key=as_policy_key, against_policy_key=against_policy_key)
        if old_payoff is not None:
            return
        if as_policy_key not in self._locally_cached_matchup_results:
            self._locally_cached_matchup_results[as_policy_key] = {}
        if against_policy_key not in self._locally_cached_matchup_results[as_policy_key]:
            self._locally_cached_matchup_results[as_policy_key][
                against_policy_key] = (payoff, games_played)
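# The @ray.method decorators above imply LivePolicyPayoffTracker is meant to
# run as a Ray actor. A minimal usage sketch, assuming it is wrapped with
# ray.remote; every literal below (hosts, ports, bucket, keys, names) is a
# placeholder, not a value from the project.
import ray

ray.init(ignore_reinit_error=True)

RemoteLivePolicyPayoffTracker = ray.remote(LivePolicyPayoffTracker)
tracker = RemoteLivePolicyPayoffTracker.remote(
    minio_bucket="my-bucket",
    manager_host="localhost", manager_port=2727,
    lock_server_host="localhost", lock_server_port=2828,
    # worker_id should contain "pid": the constructor slices from that substring.
    worker_id="learner_examplehost_pid_1234",
    policy_class_name="ExamplePolicyClass",
    policy_config_key="configs/example_policy_config.json",
    provide_payoff_barrier_sync=True)

policy_num = ray.get(tracker.get_claimed_policy_num.remote())

# num_return_vals=2 means this call yields two object refs.
table_ref, lower_done_ref = tracker.get_live_payoff_table_dill_pickled.remote()
payoff_table_dill, all_lower_finished = ray.get([table_ref, lower_done_ref])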
def measure_exploitability_of_metanashes_as_they_become_available():
    logger = get_logger()
    storage_client = connect_storage_client()
    worker_id = f"Exploitability_Tracker_{gethostname()}_pid_{os.getpid()}_{datetime_str()}"

    manager_interface = ConsoleManagerInterface(
        server_host=MANAGER_SEVER_HOST,
        port=MANAGER_PORT,
        worker_id=worker_id,
        storage_client=storage_client,
        minio_bucket_name=BUCKET_NAME,
        minio_local_dir=DEFAULT_LOCAL_SAVE_PATH)

    logger.info(f"Started worker \'{worker_id}\'")

    # If you use ray for more than just this single example fn, you'll need to
    # move ray.init to the top of your main()
    ray.init(address=os.getenv('RAY_HEAD_NODE'),
             ignore_reinit_error=True,
             local_mode=True)

    model_config_file_path, _ = maybe_download_object(
        storage_client=storage_client,
        bucket_name=BUCKET_NAME,
        object_name=MODEL_CONFIG_KEY,
        force_download=False)
    with open(model_config_file_path, 'r') as config_file:
        model_config = json.load(fp=config_file)

    example_env = PokerMultiAgentEnv(env_config=POKER_ENV_CONFIG)

    logger.info("\n\n\n\n\n__________________________________________\n"
                f"LAUNCHED FOR {POKER_GAME_VERSION}\n"
                f"__________________________________________\n\n\n\n\n")

    obs_space = example_env.observation_space
    act_space = example_env.action_space

    preprocessor = StrategoDictFlatteningPreprocessor(obs_space=obs_space)
    graph = tf.Graph()
    sess = tf.Session(config=tf.ConfigProto(device_count={'GPU': 0}), graph=graph)

    def fetch_logits(policy):
        return {"behaviour_logits": policy.model.last_output()}

    _policy_cls = POLICY_CLASS.with_updates(extra_action_fetches_fn=fetch_logits)

    with graph.as_default():
        with sess.as_default():
            policy = _policy_cls(
                obs_space=preprocessor.observation_space,
                action_space=act_space,
                config=with_common_config({
                    'model': with_base_config(base_config=MODEL_DEFAULTS,
                                              extra_config=model_config),
                    'env': POKER_ENV,
                    'env_config': POKER_ENV_CONFIG,
                    'custom_preprocessor': STRATEGO_PREPROCESSOR,
                }))

    def set_policy_weights(weights_key):
        weights_file_path, _ = maybe_download_object(
            storage_client=storage_client,
            bucket_name=BUCKET_NAME,
            object_name=weights_key,
            force_download=False)
        policy.load_model_weights(weights_file_path)

    print("(Started Successfully)")

    # Poll the manager for new payoff tables; whenever one appears, compute a
    # fictitious-play metanash over it and measure its exploitability.
    last_payoff_table_key = None
    while True:
        payoff_table, payoff_table_key = manager_interface.get_latest_payoff_table(
            infinite_retry_on_error=True)
        if payoff_table_key == last_payoff_table_key:
            time.sleep(20)
            continue
        last_payoff_table_key = payoff_table_key

        metanash_probs, _, _ = get_fp_metanash_for_latest_payoff_table(
            manager_interface=manager_interface,
            fp_iters=20000,
            accepted_opponent_policy_class_names=[POLICY_CLASS_NAME],
            accepted_opponent_model_config_keys=[POKER_ENV_CONFIG],
            add_payoff_matrix_noise_std_dev=0.000,
            mix_with_uniform_dist_coeff=None,
            p_or_lower_rounds_to_zero=0.0)

        if metanash_probs is not None:
            policy_weights_keys = payoff_table.get_ordered_keys_in_payoff_matrix()
            policy_dict = {key: prob for key, prob in
                           zip(policy_weights_keys, metanash_probs)}
            exploitability = measure_exploitability_nonlstm(
                rllib_policy=policy,
                poker_game_version=POKER_GAME_VERSION,
                policy_mixture_dict=policy_dict,
                set_policy_weights_fn=set_policy_weights)
            print(f"Exploitability: {exploitability}")
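# This function appears designed to run as a standalone worker script. A
# minimal entry point sketch (the original excerpt does not show one):
if __name__ == '__main__':
    measure_exploitability_of_metanashes_as_they_become_available()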
CANT_SUBMIT_UNTIL_LOWER_POLICIES_FINISH = True
INIT_FROM_POPULATION = POKER_PIPELINE_INIT_FROM_POP
CLOUD_PREFIX = os.getenv("CLOUD_PREFIX", "")

if __name__ == "__main__":
    while True:
        logging.basicConfig(level=logging.DEBUG)
        logger.info("\n\n\n\n\n__________________________________________\n"
                    f"LAUNCHED FOR {POKER_GAME_VERSION}\n"
                    f"__________________________________________\n\n\n\n\n")

        storage_client = connect_storage_client()
        size_checker = ConsoleManagerInterface(
            server_host=MANAGER_SERVER_HOST,
            port=MANAGER_PORT,
            worker_id=f"size_checker_{gethostname()}_pid_{os.getpid()}",
            storage_client=storage_client,
            minio_bucket_name=BUCKET_NAME)

        ray.init(address=os.getenv('RAY_HEAD_NODE'),
                 ignore_reinit_error=True,
                 log_to_driver=True)
        logger.info("Ray Web UI at {}".format(ray.get_webui_url()))

        base_experiment_name = f"{CLOUD_PREFIX}learner_{POKER_GAME_VERSION}_sac_arch1_pipeline_psro"
        full_experiment_name = f"{base_experiment_name}_{gethostname()}_pid_{os.getpid()}_{datetime_str()}"
        experiment_save_dir = os.path.join(DEFAULT_RESULTS_DIR, full_experiment_name)

        def claim_new_active_policy_after_trainer_init_callback(trainer):

            def set_train_policy_warmup_target_entropy_proportion(worker):
                ...  # (body continues beyond this excerpt)
JOB_STATUS_OPEN = 'open'
JOB_STATUS_ACTIVE = 'active'


def _job_str_for_policy_key(policy_key):
    return f"job: {policy_key} {JOB_STATUS_OPEN}"


if __name__ == '__main__':
    storage_client = connect_storage_client()
    manager_interface = ConsoleManagerInterface(
        server_host=MANAGER_SERVER_HOST,
        port=MANAGER_PORT,
        worker_id=f"rectified_psro_job_scheduler_{gethostname()}_pid_{os.getpid()}",
        storage_client=storage_client,
        minio_bucket_name=BUCKET_NAME)
    lock_server_interface = LockServerInterface(
        server_host=LOCK_SERVER_HOST,
        port=LOCK_SERVER_PORT,
        worker_id=f"rectified_psro_job_scheduler_{gethostname()}_pid_{os.getpid()}")

    generation_index = 0
    while True:
        print(f"Starting generation {generation_index}")
        active_job_list = []
        active_job_policy_keys = []
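# For context, a hedged sketch of how these job strings could move through
# the lock server, using only the try_to_reserve_item/replace_item calls seen
# in LivePolicyPayoffTracker above. The policy key and the open->active
# transition are illustrative, not the scheduler's actual logic.
example_policy_key = "example/policy/key"  # illustrative placeholder
open_job = _job_str_for_policy_key(example_policy_key)  # "job: example/policy/key open"
if lock_server_interface.try_to_reserve_item(item_name=open_job,
                                             remain_after_worker_disconnect=True):
    # Once a worker picks the job up, flip its status lock from open to active.
    active_job = f"job: {example_policy_key} {JOB_STATUS_ACTIVE}"
    assert lock_server_interface.replace_item(old_item=open_job,
                                              new_item=active_job,
                                              new_item_remains_after_disconnect=True)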