def training_workflow(config, reporter):
    from gym.spaces import Box
    import numpy as np

    # Build the multi-agent environment and one replay buffer per agent.
    env_maker = get_env_maker(GazeboEnv)
    env, agents = env_maker(config['env_config'], return_agents=True)
    space = Box(low=-np.ones(2), high=np.ones(2))  # example action space (unused below)

    replay_buffers = {
        agent_id: ReplayBuffer(config.get('buffer_size', 1000))
        for agent_id in agents
    }

    # One random policy per agent, mapped by agent id (identity mapping).
    policy = {
        k: (RandomPolicy, a.observation_space, a.action_space, {})
        for k, a in agents.items()
    }
    worker = RolloutWorker(
        lambda x: env,
        policy=policy,
        batch_steps=32,
        policy_mapping_fn=lambda agent_id: agent_id,
        episode_horizon=20)

    for i in range(config['num_iters']):
        # Collect a batch of multi-agent samples and store each transition
        # in the matching agent's replay buffer.
        T1 = SampleBatch.concat_samples([worker.sample()])
        for agent_id, batch in T1.policy_batches.items():
            for row in batch.rows():
                replay_buffers[agent_id].add(
                    row['obs'], row['actions'], row['rewards'],
                    row['new_obs'], row['dones'], weight=None)
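# A hypothetical helper (not from the snippet above) showing how the filled
# buffers might be consumed inside the training loop. Assumes the classic
# RLlib ReplayBuffer API: sample(batch_size) returns parallel arrays
# (obs, actions, rewards, new_obs, dones), and len() gives the buffer size.
def sample_training_batches(replay_buffers, batch_size=32):
    batches = {}
    for agent_id, buffer in replay_buffers.items():
        if len(buffer) >= batch_size:
            # Parallel arrays of transitions for this agent.
            batches[agent_id] = buffer.sample(batch_size)
    return batches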
def _set_worker_converters(worker: RolloutWorker): worker_delegate_policy = worker.policy_map["delegate_policy"] player_converters = [] for p in range(2): player_converter = RestrictedToBaseGameActionSpaceConverter( delegate_policy=worker_delegate_policy, policy_specs=player_to_base_game_action_specs[p], load_policy_spec_fn=create_get_pure_strat_cached(cache=weights_cache)) player_converters.append(player_converter) worker.foreach_env(lambda env: env.set_action_conversion(p, player_converter)) worker_delegate_policy.player_converters = player_converters
def _set_conversions(worker: RolloutWorker):

    def _set_restricted_env_conversions(restricted_env):
        assert isinstance(restricted_env, RestrictedGame)
        # Attach an action-space converter for every agent that has
        # restricted-game policy specs registered.
        for agent_id, action_policy_specs in agent_id_to_restricted_game_specs.items():
            if len(action_policy_specs) > 0:
                converter = RestrictedToBaseGameActionSpaceConverter(
                    delegate_policy=worker.policy_map[delegate_policy_id],
                    policy_specs=action_policy_specs,
                    load_policy_spec_fn=load_policy_spec_fn)
                restricted_env.set_action_conversion(agent_id=agent_id, converter=converter)

    worker.foreach_env(_set_restricted_env_conversions)
def on_episode_end(self, *, worker: RolloutWorker, base_env: BaseEnv,
                   policies: Dict[str, Policy], episode: MultiAgentEpisode,
                   env_index: int, **kwargs):
    # If using P2SRO, report payoff results of the actively training BR to the payoff table.
    if not p2sro:
        return
    if not hasattr(worker, "p2sro_manager"):
        worker.p2sro_manager = RemoteP2SROManagerClient(
            n_players=2, port=psro_manager_port,
            remote_server_host=psro_manager_host)
    br_policy_spec: StrategySpec = worker.policy_map["best_response"].policy_spec
    if br_policy_spec.pure_strat_index_for_player(player=worker.br_player) == 0:
        # Pure-strategy index 0 marks the first iteration of PSRO: the subgame
        # is empty and the metanash is just a random neural network, so there
        # are no payoff results worth reporting.
        return

    # Report payoff results for individual episodes to the P2SRO manager to keep
    # a real-time approximation of the payoff-matrix entries for (learning)
    # active policies.
    policy_specs_for_each_player: List[StrategySpec] = [None, None]
    payoffs_for_each_player: List[float] = [None, None]
    for (player, policy_name), reward in episode.agent_rewards.items():
        assert policy_name in ["best_response", "metanash"]
        policy: Policy = worker.policy_map[policy_name]
        assert policy.policy_spec is not None
        policy_specs_for_each_player[player] = policy.policy_spec
        payoffs_for_each_player[player] = reward
    assert all(payoff is not None for payoff in payoffs_for_each_player)

    # Send the payoff result to the manager for inclusion in the payoff table.
    worker.p2sro_manager.submit_empirical_payoff_result(
        policy_specs_for_each_player=tuple(policy_specs_for_each_player),
        payoffs_for_each_player=tuple(payoffs_for_each_player),
        games_played=1,
        override_all_previous_results=False)
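# on_episode_end is written as a method, so in the source it presumably lives
# on a DefaultCallbacks subclass that the trainer config points at. A minimal
# sketch of that wiring under the Ray 1.x API; P2SROCallbacks is a
# hypothetical name:
from ray.rllib.agents.callbacks import DefaultCallbacks

class P2SROCallbacks(DefaultCallbacks):
    # Reuse the hook defined above as this class's on_episode_end method.
    on_episode_end = on_episode_end

trainer_config = {
    "callbacks": P2SROCallbacks,
    # ...remaining trainer settings...
}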
def training_workflow(config, reporter):
    # Setup policy and policy evaluation actors
    env = gym.make("CartPole-v0")
    policy = CustomPolicy(env.observation_space, env.action_space, {})
    workers = [
        RolloutWorker.as_remote().remote(
            env_creator=lambda c: gym.make("CartPole-v0"),
            policy=CustomPolicy)
        for _ in range(config["num_workers"])
    ]

    for _ in range(config["num_iters"]):
        # Broadcast weights to the policy evaluation workers
        weights = ray.put({DEFAULT_POLICY_ID: policy.get_weights()})
        for w in workers:
            w.set_weights.remote(weights)

        # Gather a batch of samples
        T1 = SampleBatch.concat_samples(
            ray.get([w.sample.remote() for w in workers]))

        # Update the remote policy replicas
        new_value = policy.w * 2.0
        for w in workers:
            w.for_policy.remote(lambda p: p.update_some_value(new_value))

        # Gather another batch of samples
        T2 = SampleBatch.concat_samples(
            ray.get([w.sample.remote() for w in workers]))

        # Improve the policy using the T1 batch
        policy.learn_on_batch(T1)

        # Do some arbitrary updates based on the T2 batch
        policy.update_some_value(sum(T2["rewards"]))

        reporter(**collect_metrics(remote_workers=workers))
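# This workflow is a plain function trainable, so under the Ray 1.x Tune API
# it can be launched with tune.run. A minimal sketch; the config keys are the
# ones training_workflow reads above, and the values are arbitrary examples:
import ray
from ray import tune

if __name__ == "__main__":
    ray.init()
    tune.run(
        training_workflow,
        config={"num_workers": 2, "num_iters": 20})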
def _set_avg_br_rew_deque(worker: RolloutWorker):
    worker.avg_br_reward_deque = avg_br_reward_deque
def _set_worker_converters(worker: RolloutWorker):
    worker_delegate_policy = worker.policy_map[delegate_policy_id]
    for p, player_converter in player_converters.items():
        # Bind loop variables via default args to guard against late binding.
        worker.foreach_env(
            lambda env, p=p, converter=player_converter:
                env.set_obs_conversion_dict(p, converter))
    worker_delegate_policy.player_converters = player_converters
def _set_p2sro_policy_spec_on_best_response_policy(worker: RolloutWorker):
    br_policy = worker.policy_map["best_response"]
    br_policy.p2sro_policy_spec = active_policy_spec
    worker.br_player = player
def _set_worker_converters(worker: RolloutWorker): worker_delegate_policy = worker.policy_map["delegate_policy"] for p in range(2): worker.foreach_env(lambda env: env.set_obs_conversion_dict(p, player_converters[p])) worker_delegate_policy.player_converters = player_converters
d = {
    # Target value for KL divergence.
    "kl_target": 0.01,
    "env_config": envconf,
    "num_gpus": 0,
    "num_workers": 1,
    "batch_mode": "complete_episodes",
    "horizon": 50,
}
ppo_config = ppo.DEFAULT_CONFIG.copy()  # copy so the shared defaults aren't mutated
ppo_config.update(d)

get_dock_marks = []
worker = RolloutWorker(env_creator, ppo.PPOTFPolicy,
                       env_config=envconf, policy_config=d)

# Restore the rollout worker's state from a pickled trainer checkpoint.
with open(checkpoint, 'rb') as ckpt_file:
    ckpt = pickle.load(ckpt_file)
print(list(ckpt.keys()))
worker.restore(ckpt['worker'])

fp_path = "/Users/austin/PycharmProjects/RLDock/"
with open("log.pml", 'w') as fp:
    with open("test.pml", 'w') as f:
        for j in range(1):
            rs = worker.sample()
            print(rs)
            print(list(rs.keys()))
            ls = rs['actions'].shape[0]
            for i in range(ls):
                ...
def _set_opponent_policy_distribution_for_one_worker(worker: RolloutWorker):
    worker.metanash_policy_specs = metanash_policy_specs
    worker.metanash_weights = metanash_weights
def _set_opponent_policy_distribution_for_worker(worker: RolloutWorker):
    worker.opponent_policy_distribution = opponent_policy_distribution
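# Setters like the ones above are typically broadcast to every rollout worker
# through the trainer's WorkerSet, so the local worker and all remote samplers
# see the same state. A minimal sketch under the Ray 1.x API; `trainer` and
# the example distribution are assumptions:
opponent_policy_distribution = {"metanash_0": 0.5, "metanash_1": 0.5}
trainer.workers.foreach_worker(_set_opponent_policy_distribution_for_worker)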