def sample_new_static_policy_weights_for_each_worker_on_episode_start(params):
    policies = params['policy']
    static_policy = policies[STATIC_POLICY]

    if static_policy.static_policy_selection_probs is None:
        return

    selected_policy_index = np.random.choice(
        a=list(range(len(static_policy.static_policy_selection_probs))),
        p=static_policy.static_policy_selection_probs)
    selected_policy_spec: PolicySpec = static_policy.payoff_table.get_policy_for_index(
        selected_policy_index)
    assert selected_policy_spec.class_name in ACCEPTED_OPPONENT_POLICY_CLASS_NAMES
    assert selected_policy_spec.config_key in ACCEPTED_OPPONENT_MODEL_CONFIG_KEYS

    if static_policy.current_policy_key != selected_policy_spec.key:
        # print(f"sampled policy {selected_policy_spec.key} (loading weights)")
        storage_client = connect_storage_client()
        weights_local_path, _ = maybe_download_object(
            storage_client=storage_client,
            bucket_name=BUCKET_NAME,
            object_name=selected_policy_spec.key,
            force_download=False)
        static_policy.load_model_weights(
            load_file_path=weights_local_path,
            add_scope_prefix=STATIC_POLICY)
        static_policy.current_policy_key = selected_policy_spec.key
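
# A minimal sketch of wiring the callback above into a trainer, assuming the
# legacy RLlib dict-style callbacks API (which passes each callback a `params`
# dict containing a 'policy' map, matching the signature above).
# `make_example_trainer_config` and `base_config` are hypothetical names used
# for illustration only.
def make_example_trainer_config(base_config: dict) -> dict:
    config = dict(base_config)
    config["callbacks"] = {
        "on_episode_start":
            sample_new_static_policy_weights_for_each_worker_on_episode_start,
    }
    return config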
def get_latest_payoff_table(self, infinite_retry_on_error: bool = True):
    while True:
        try:
            request = Empty()
            response: PayoffTableKey = self._stub.GetLatestPayoffTableKey(request)
            break
        except grpc.RpcError as err:
            if infinite_retry_on_error:
                logger.warning(
                    f"grpc.RPCError raised while getting latest payoff table:\n{err}\n"
                    f"(retrying in {_INFINITE_RETRY_INTERVAL_SECONDS} seconds)")
                time.sleep(_INFINITE_RETRY_INTERVAL_SECONDS)
            else:
                raise

    if response.payoff_table_is_empty:
        logger.debug("Latest payoff table is empty (None)")
        return None, None

    payoff_table_local_path, _ = maybe_download_object(
        storage_client=self._storage_client,
        bucket_name=self._minio_bucket_name,
        object_name=response.key,
        local_directory=self._minio_local_dir,
        force_download=False)
    latest_payoff_table = PayoffTable.from_dill_file(
        dill_file_path=payoff_table_local_path)
    return latest_payoff_table, response.key
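
# Hedged usage sketch for the method above: poll the manager and yield only
# when the payoff table key changes, mirroring the loop pattern used by the
# exploitability tracker further down. `manager_interface` stands for any
# object exposing get_latest_payoff_table(); the 20-second interval is an
# arbitrary illustrative choice.
def poll_for_new_payoff_tables(manager_interface, poll_interval_seconds=20):
    import time
    last_key = None
    while True:
        payoff_table, key = manager_interface.get_latest_payoff_table(
            infinite_retry_on_error=True)
        if key is not None and key != last_key:
            last_key = key
            yield payoff_table, key
        time.sleep(poll_interval_seconds)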
def set_policy_weights(weights_key):
    print(f"weights are {weights_key}")
    storage_client = connect_storage_client()
    weights_file_path, _ = maybe_download_object(
        storage_client=storage_client,
        bucket_name=BUCKET_NAME,
        object_name=weights_key,
        force_download=False)
    print("got weights")
    local_exploit_rllib_policy.load_model_weights(
        weights_file_path, add_scope_prefix=STATIC_POLICY)
def __init__(self,
             stop_event,
             payoff_table_save_key_prefix_dir,
             storage_client,
             bucket_name,
             max_ping_interval_seconds_to_track_workers,
             num_games_to_play_for_matchup_evals,
             restore_from_payoff_table_key=None):
    self._stop_event = stop_event
    self.payoff_table_save_key_prefix_dir = payoff_table_save_key_prefix_dir
    self._storage_client = storage_client
    self._bucket_name = bucket_name
    self._max_ping_interval_seconds_to_track_workers = max_ping_interval_seconds_to_track_workers
    self._num_games_to_play_for_matchup_evals = num_games_to_play_for_matchup_evals
    self._payoff_table_modification_lock = Lock()
    self._recent_worker_pings = PriorityQueue()
    self._worker_ping_modification_lock = Lock()
    self._start_time = time.time()
    self._eval_matchup_cache_lock = RLock()
    self._eval_matchup_cache = {}
    self._externally_requested_eval_queue = Queue()
    self._recent_eval_match_requests_lock = RLock()
    self._recent_eval_match_requests = {}

    self._latest_checkpoint_key = os.path.join(
        self.payoff_table_save_key_prefix_dir, "latest.dill")
    logger.info(
        colored(
            f"Latest Manager Payoff Table Checkpoint will always be at {self._latest_checkpoint_key} "
            f"(local file path: {get_default_path_on_disk_for_minio_key(self._latest_checkpoint_key)})",
            "yellow"))

    if restore_from_payoff_table_key is not None:
        payoff_table_local_path, _ = maybe_download_object(
            storage_client=self._storage_client,
            bucket_name=self._bucket_name,
            object_name=restore_from_payoff_table_key,
            force_download=False)
        logger.info(f"restoring payoff table from {payoff_table_local_path}")
        self._payoff_table = PayoffTable.from_dill_file(
            dill_file_path=payoff_table_local_path)
        self._latest_payoff_table_key = restore_from_payoff_table_key
        self._log_policies_in_payoff_matrix()
    else:
        logger.info("creating new empty payoff table with no policies")
        self._payoff_table = PayoffTable()
        self._latest_payoff_table_key = None
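
# Hedged construction sketch for the manager above. Only the parameter names
# come from the __init__ signature; every argument value here is a
# hypothetical placeholder, not a recommended setting.
def make_example_manager(manager_cls, storage_client):
    from threading import Event
    return manager_cls(
        stop_event=Event(),
        payoff_table_save_key_prefix_dir="population_server/example_run/payoff_tables",
        storage_client=storage_client,
        bucket_name="stratego",
        max_ping_interval_seconds_to_track_workers=60,
        num_games_to_play_for_matchup_evals=3000,
        restore_from_payoff_table_key=None)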
def get_weights_by_key(self, policy_key):
    weights = self.get_from_cache(policy_key=policy_key)
    if weights is None:
        load_file_path, _ = maybe_download_object(
            storage_client=self.storage_client,
            bucket_name=self.bucket_name,
            object_name=policy_key)
        with open(load_file_path, "rb") as dill_file:
            weights = load(file=dill_file)
    return weights
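
# Hedged usage sketch: fetch deserialized weights through the cache-backed
# getter above and push them into a policy. `weights_cache` stands for the
# object that owns get_weights_by_key(); `set_weights` is assumed to be the
# policy's setter for in-memory weights (most code in this section loads from
# file paths instead, so treat this as illustrative only).
def load_policy_from_cache(policy, weights_cache, policy_key):
    weights = weights_cache.get_weights_by_key(policy_key=policy_key)
    policy.set_weights(weights)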
def init_train_policy_weights_from_static_policy_distribution_after_trainer_init_callback(trainer):
    storage_client = connect_storage_client()
    weights_local_path, _ = maybe_download_object(
        storage_client=storage_client,
        bucket_name=BUCKET_NAME,
        object_name="learner_leduc_poker_sac_arch1_psro_sequential_explore_coeff_0.0/learner_leduc_poker_sac_arch1_psro_sequential_explore_coeff_0.0_sage_pid_29557_11.47.05PM_May-20-2020/policy_submissions/12.00.49AM_May-21-2020_iter_2263.dill",
        force_download=False)

    def worker_set_train_policy_weights(worker):
        train_policy = worker.policy_map[TRAIN_POLICY]
        train_policy.load_model_weights(
            load_file_path=weights_local_path,
            add_scope_prefix=TRAIN_POLICY)

    trainer.workers.foreach_worker(worker_set_train_policy_weights)
def init_train_policy_weights_from_static_policy_distribution_after_trainer_init_callback(trainer):
    local_static_policy = trainer.workers.local_worker().policy_map[STATIC_POLICY]
    local_train_policy = trainer.workers.local_worker().policy_map[TRAIN_POLICY]

    if not hasattr(local_static_policy, 'static_policy_selection_probs') or \
            local_static_policy.static_policy_selection_probs is None:
        print(
            colored(
                f"Policy {trainer.claimed_policy_num}: Payoff table is empty, so initializing train policy to random",
                "white"))
        local_train_policy.init_tag = "init from random"
        return

    selected_policy_index = np.random.choice(
        a=list(range(len(local_static_policy.static_policy_selection_probs))),
        p=local_static_policy.static_policy_selection_probs)
    selected_policy_spec: PolicySpec = local_static_policy.payoff_table.get_policy_for_index(
        selected_policy_index)
    local_train_policy.init_tag = f"full init from {selected_policy_spec.key}"

    # may not necessarily be true in all scripts
    assert selected_policy_spec.class_name == TRAIN_POLICY_CLASS.__name__
    assert selected_policy_spec.config_key == TRAIN_POLICY_MODEL_CONFIG_KEY

    storage_client = connect_storage_client()
    weights_local_path, _ = maybe_download_object(
        storage_client=storage_client,
        bucket_name=BUCKET_NAME,
        object_name=selected_policy_spec.key,
        force_download=False)
    print(
        colored(
            f"Policy {trainer.claimed_policy_num}: Initializing train policy to {selected_policy_spec.key}",
            "white"))

    def worker_set_train_policy_weights(worker):
        train_policy = worker.policy_map[TRAIN_POLICY]
        train_policy.load_model_weights(
            load_file_path=weights_local_path,
            add_scope_prefix=TRAIN_POLICY)

    trainer.workers.foreach_worker(worker_set_train_policy_weights)
full_experiment_name, "policy_submissions", checkpoint_name) storage_client = connect_storage_client() upload_file(storage_client=storage_client, bucket_name=BUCKET_NAME, object_key=policy_key, local_source_path=checkpoint_save_path) trainer.manager_interface.submit_new_policy_for_population( policy_weights_key=policy_key, policy_config_key=TRAIN_POLICY_MODEL_CONFIG_KEY, policy_class_name=TRAIN_POLICY_CLASS.__name__, policy_tags=tags) train_model_config_local_file_path, _ = maybe_download_object( storage_client=storage_client, bucket_name=BUCKET_NAME, object_name=TRAIN_POLICY_MODEL_CONFIG_KEY) with open(train_model_config_local_file_path, 'r') as config_file: train_model_config = json.load(fp=config_file) static_model_config_local_file_path, _ = maybe_download_object( storage_client=storage_client, bucket_name=BUCKET_NAME, object_name=STATIC_POLICY_MODEL_CONFIG_KEY) with open(static_model_config_local_file_path, 'r') as config_file: static_model_config = json.load(fp=config_file) def train_policy_mapping_fn(agent_id): if agent_id == 1: return TRAIN_POLICY elif agent_id == 0 or agent_id == -1:
def measure_exploitability_of_metanashes_as_they_become_available():
    logger = get_logger()
    storage_client = connect_storage_client()

    worker_id = f"Exploitability_Tracker_{gethostname()}_pid_{os.getpid()}_{datetime_str()}"

    manager_interface = ConsoleManagerInterface(
        server_host=MANAGER_SEVER_HOST,
        port=MANAGER_PORT,
        worker_id=worker_id,
        storage_client=storage_client,
        minio_bucket_name=BUCKET_NAME,
        minio_local_dir=DEFAULT_LOCAL_SAVE_PATH)

    logger.info(f"Started worker '{worker_id}'")

    # If you use ray for more than just this single example fn, you'll need to
    # move ray.init to the top of your main()
    ray.init(address=os.getenv('RAY_HEAD_NODE'),
             ignore_reinit_error=True,
             local_mode=True)

    model_config_file_path, _ = maybe_download_object(
        storage_client=storage_client,
        bucket_name=BUCKET_NAME,
        object_name=MODEL_CONFIG_KEY,
        force_download=False)
    with open(model_config_file_path, 'r') as config_file:
        model_config = json.load(fp=config_file)

    example_env = PokerMultiAgentEnv(env_config=POKER_ENV_CONFIG)

    logger.info("\n\n\n\n\n__________________________________________\n"
                f"LAUNCHED FOR {POKER_GAME_VERSION}\n"
                "__________________________________________\n\n\n\n\n")

    obs_space = example_env.observation_space
    act_space = example_env.action_space

    preprocessor = StrategoDictFlatteningPreprocessor(obs_space=obs_space)
    graph = tf.Graph()
    sess = tf.Session(config=tf.ConfigProto(device_count={'GPU': 0}),
                      graph=graph)

    def fetch_logits(policy):
        return {
            "behaviour_logits": policy.model.last_output(),
        }

    _policy_cls = POLICY_CLASS.with_updates(
        extra_action_fetches_fn=fetch_logits)

    with graph.as_default():
        with sess.as_default():
            policy = _policy_cls(
                obs_space=preprocessor.observation_space,
                action_space=act_space,
                config=with_common_config({
                    'model': with_base_config(base_config=MODEL_DEFAULTS,
                                              extra_config=model_config),
                    'env': POKER_ENV,
                    'env_config': POKER_ENV_CONFIG,
                    'custom_preprocessor': STRATEGO_PREPROCESSOR,
                }))

    def set_policy_weights(weights_key):
        weights_file_path, _ = maybe_download_object(
            storage_client=storage_client,
            bucket_name=BUCKET_NAME,
            object_name=weights_key,
            force_download=False)
        policy.load_model_weights(weights_file_path)

    print("(Started Successfully)")

    last_payoff_table_key = None
    while True:
        payoff_table, payoff_table_key = manager_interface.get_latest_payoff_table(
            infinite_retry_on_error=True)
        if payoff_table_key == last_payoff_table_key:
            time.sleep(20)
            continue
        last_payoff_table_key = payoff_table_key

        metanash_probs, _, _ = get_fp_metanash_for_latest_payoff_table(
            manager_interface=manager_interface,
            fp_iters=20000,
            accepted_opponent_policy_class_names=[POLICY_CLASS_NAME],
            accepted_opponent_model_config_keys=[POKER_ENV_CONFIG],
            add_payoff_matrix_noise_std_dev=0.000,
            mix_with_uniform_dist_coeff=None,
            p_or_lower_rounds_to_zero=0.0)

        if metanash_probs is not None:
            policy_weights_keys = payoff_table.get_ordered_keys_in_payoff_matrix()
            policy_dict = {
                key: prob
                for key, prob in zip(policy_weights_keys, metanash_probs)
            }
            exploitability = measure_exploitability_nonlstm(
                rllib_policy=policy,
                poker_game_version=POKER_GAME_VERSION,
                policy_mixture_dict=policy_dict,
                set_policy_weights_fn=set_policy_weights)
            print(f"Exploitability: {exploitability}")
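
# Hedged entry-point sketch: the tracker above is written to run as a
# standalone worker process. Whether the original script launches it exactly
# this way is an assumption.
if __name__ == '__main__':
    measure_exploitability_of_metanashes_as_they_become_available()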
def get_policy_fn(stratego_env_config):
    from mprl.utility_services.cloud_storage import maybe_download_object
    from mprl.rl.sac.sac_policy import SACDiscreteTFPolicy
    from mprl.rl.ppo.ppo_stratego_model_policy import PPOStrategoModelTFPolicy
    from mprl.rl.common.stratego_preprocessor import STRATEGO_PREPROCESSOR, StrategoDictFlatteningPreprocessor
    from ray.rllib.agents.trainer import with_common_config, with_base_config
    from ray.rllib.models.catalog import MODEL_DEFAULTS
    from mprl.rl.common.sac_spatial_stratego_model import SAC_SPATIAL_STRATEGO_MODEL
    import ray
    from ray.rllib.utils import try_import_tf
    import json
    import os

    tf = try_import_tf()
    from tensorflow.python.client import device_lib

    def get_available_gpus():
        local_device_protos = device_lib.list_local_devices()
        return [x.name for x in local_device_protos if x.device_type == 'GPU']

    # If you use ray for more than just this single example fn, you'll need to
    # move ray.init to the top of your main()
    ray.init(address=os.getenv('RAY_HEAD_NODE'),
             ignore_reinit_error=True,
             local_mode=True)

    if policy_class_name == 'PPOStrategoModelTFPolicy':
        _policy_class = PPOStrategoModelTFPolicy
    elif policy_class_name == 'SACDiscreteTFPolicy':
        _policy_class = SACDiscreteTFPolicy
    else:
        raise NotImplementedError(
            f"Eval for policy class '{policy_class_name}' not implemented.")

    if model_config_object_key:
        with download_lock:
            model_config_file_path, _ = maybe_download_object(
                storage_client=storage_client,
                bucket_name=minio_bucket_name,
                object_name=model_config_object_key,
                force_download=False)
            with open(model_config_file_path, 'r') as config_file:
                model_config = json.load(fp=config_file)
    else:
        model_config = manual_config

    example_env = stratego_env_config['env_class'](env_config=stratego_env_config)
    obs_space = example_env.observation_space
    act_space = example_env.action_space

    preprocessor = StrategoDictFlatteningPreprocessor(obs_space=obs_space)

    graph = tf.Graph()

    if os.getenv("EVALUATOR_USE_GPU") == 'true':
        gpu = 1
    else:
        gpu = 0

    config = tf.ConfigProto(device_count={'GPU': gpu})
    if gpu:
        config.gpu_options.allow_growth = True
    sess = tf.Session(config=config, graph=graph)

    with graph.as_default():
        with sess.as_default():
            policy = _policy_class(
                obs_space=preprocessor.observation_space,
                action_space=act_space,
                config=with_common_config({
                    'model': with_base_config(base_config=MODEL_DEFAULTS,
                                              extra_config=model_config),
                    'env': POKER_ENV,
                    'env_config': stratego_env_config,
                    'custom_preprocessor': STRATEGO_PREPROCESSOR,
                }))

    if model_weights_object_key:
        with download_lock:
            weights_file_path, _ = maybe_download_object(
                storage_client=storage_client,
                bucket_name=minio_bucket_name,
                object_name=model_weights_object_key,
                force_download=False)
            policy.load_model_weights(weights_file_path)
            policy.current_model_weights_key = weights_file_path
    else:
        policy.current_model_weights_key = None

    def policy_fn(observation, policy_state=None):
        if policy_state is None:
            policy_state = policy.get_initial_state()
        current_player_perspective_action_index, policy_state, _ = policy.compute_single_action(
            obs=preprocessor.transform(observation),
            state=policy_state)
        return current_player_perspective_action_index, policy_state

    if population_policy_keys_to_selection_probs is not None:

        def sample_new_policy_weights_from_population():
            new_policy_key = np.random.choice(
                a=list(population_policy_keys_to_selection_probs.keys()),
                p=list(population_policy_keys_to_selection_probs.values()))
            if new_policy_key != policy.current_model_weights_key:
                with download_lock:
                    weights_file_path, _ = maybe_download_object(
                        storage_client=storage_client,
                        bucket_name=minio_bucket_name,
                        object_name=new_policy_key,
                        force_download=False)
                    policy.load_model_weights(weights_file_path)
                    logger.debug(f"Sampling new population weights from {new_policy_key}")
                policy.current_model_weights_key = new_policy_key

        # policy name must be unique
        return policy_name, policy_fn, sample_new_policy_weights_from_population

    return policy_name, policy_fn
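
# Hedged usage sketch for the factory above: policy_fn hides observation
# preprocessing and RNN state behind a simple
# (observation, state) -> (action, state) interface. The sequence of
# observations is assumed to come from the env; no particular env API is
# relied on here.
def act_on_observations(policy_fn, observations):
    policy_state = None
    actions = []
    for observation in observations:
        action, policy_state = policy_fn(observation, policy_state=policy_state)
        actions.append(action)
    return actions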
storage_client = connect_storage_client()

manager_host = "localhost"
manager_port = 2828

new_manager_interface = LearnerManagerInterface(
    server_host=manager_host,
    port=manager_port,
    worker_id="rebuild_payoff_learner",
    storage_client=storage_client,
    minio_bucket_name="stratego")

old_payoff_table_local_path, _ = maybe_download_object(
    storage_client=storage_client,
    bucket_name="stratego",
    object_name="population_server/sage_pid_31932_06_48_20PM_Apr-24-2020/payoff_tables/payoff_table_13_polices_1_pending_sage_pid_31932_07_35_09PM_Apr-25-2020.dill")

old_payoff_table = PayoffTable.from_dill_file(old_payoff_table_local_path)

if input(f"You're about to add a bunch of policies to the manager at {manager_host}:{manager_port}\n"
         f"Are you sure? Type 'y' to go through with this: ") != 'y':
    print("(doing nothing and exiting)")
    exit(0)

for index in range(old_payoff_table.size()):
    policy: PolicySpec = old_payoff_table.get_policy_for_index(index=index)
    new_manager_interface.submit_new_policy_for_population(