Example #1
def training_workflow(config, reporter):
    from gym.spaces import Box
    import numpy as np
    env_maker = get_env_maker(GazeboEnv)
    env, agents = env_maker(config['env_config'], return_agents=True)
    space = Box(low=-np.ones(2), high=np.ones(2))
    replay_buffers = {
        agent_id: ReplayBuffer(config.get('buffer_size', 1000))
        for agent_id in agents
    }
    policy = {
        k: (RandomPolicy, a.observation_space, a.action_space, {})
        for k, a in agents.items()
    }
    worker = RolloutWorker(lambda x: env,
                           policy=policy,
                           batch_steps=32,
                           policy_mapping_fn=lambda x: x,
                           episode_horizon=20)
    # Collect rollouts and append each agent's transitions to its replay buffer.
    for i in range(config['num_iters']):
        T1 = SampleBatch.concat_samples([worker.sample()])
        for agent_id, batch in T1.policy_batches.items():
            for row in batch.rows():
                replay_buffers[agent_id].add(row['obs'],
                                             row['actions'],
                                             row['rewards'],
                                             row['new_obs'],
                                             row['dones'],
                                             weight=None)
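A minimal launch sketch for the workflow above, assuming the project's own imports (GazeboEnv, RolloutWorker, ReplayBuffer, etc.) are already in scope; the config keys mirror the ones the function reads ('env_config', 'buffer_size', 'num_iters') and the values are placeholders:

if __name__ == "__main__":
    training_workflow(
        config={'env_config': {}, 'buffer_size': 1000, 'num_iters': 10},
        reporter=lambda **metrics: print(metrics),  # stand-in for Tune's reporter callback
    )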
Example #2
 def _set_worker_converters(worker: RolloutWorker):
     worker_delegate_policy = worker.policy_map["delegate_policy"]
     # Build one action-space converter per player of the 2-player game and
     # attach it to every sub-environment held by this rollout worker.
     player_converters = []
     for p in range(2):
         player_converter = RestrictedToBaseGameActionSpaceConverter(
             delegate_policy=worker_delegate_policy, policy_specs=player_to_base_game_action_specs[p],
             load_policy_spec_fn=create_get_pure_strat_cached(cache=weights_cache))
         player_converters.append(player_converter)
         worker.foreach_env(lambda env: env.set_action_conversion(p, player_converter))
     worker_delegate_policy.player_converters = player_converters
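Closures like _set_worker_converters above are typically broadcast to the local and all remote rollout workers of a trainer. A minimal sketch, assuming an RLlib Trainer instance named trainer built elsewhere:

# Apply the closure on the trainer's local worker and on each remote RolloutWorker.
trainer.workers.foreach_worker(_set_worker_converters)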
Example #3
    def _set_conversions(worker: RolloutWorker):

        def _set_restricted_env_conversions(restricted_env):
            assert isinstance(restricted_env, RestrictedGame)
            for agent_id, action_policy_specs in agent_id_to_restricted_game_specs.items():
                if len(action_policy_specs) > 0:
                    converter = RestrictedToBaseGameActionSpaceConverter(
                        delegate_policy=worker.policy_map[delegate_policy_id],
                        policy_specs=action_policy_specs,
                        load_policy_spec_fn=load_policy_spec_fn)
                    restricted_env.set_action_conversion(agent_id=agent_id, converter=converter)

        worker.foreach_env(_set_restricted_env_conversions)
Example #4
        def on_episode_end(self, *, worker: RolloutWorker, base_env: BaseEnv,
                           policies: Dict[str, Policy], episode: MultiAgentEpisode,
                           env_index: int, **kwargs):

            # If using P2SRO, report payoff results of the actively training BR to the payoff table.
            if not p2sro:
                return

            if not hasattr(worker, "p2sro_manager"):
                worker.p2sro_manager = RemoteP2SROManagerClient(n_players=2,
                                                                port=psro_manager_port,
                                                                remote_server_host=psro_manager_host)

            br_policy_spec: StrategySpec = worker.policy_map["best_response"].policy_spec
            if br_policy_spec.pure_strat_index_for_player(player=worker.br_player) == 0:
                # We're training policy 0 if True (first iteration of PSRO).
                # The PSRO subgame should be empty, and instead the metanash is a random neural network.
                # No need to report payoff results for this.
                return

            # Report payoff results for individual episodes to the p2sro manager to keep a real-time approximation
            # of the payoff matrix entries for (learning) active policies.
            policy_specs_for_each_player: List[StrategySpec] = [None, None]
            payoffs_for_each_player: List[float] = [None, None]
            for (player, policy_name), reward in episode.agent_rewards.items():
                assert policy_name in ["best_response", "metanash"]
                policy: Policy = worker.policy_map[policy_name]
                assert policy.policy_spec is not None
                policy_specs_for_each_player[player] = policy.policy_spec
                payoffs_for_each_player[player] = reward
            assert all(payoff is not None for payoff in payoffs_for_each_player)

            # Send payoff result to the manager for inclusion in the payoff table.
            worker.p2sro_manager.submit_empirical_payoff_result(
                policy_specs_for_each_player=tuple(policy_specs_for_each_player),
                payoffs_for_each_player=tuple(payoffs_for_each_player),
                games_played=1,
                override_all_previous_results=False)
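This hook follows the DefaultCallbacks interface of Ray 1.x-era RLlib, which these examples appear to target. A minimal sketch of wiring such a callback into a trainer; the class name, env name, and choice of PPO are illustrative and not from the original project:

from ray.rllib.agents.callbacks import DefaultCallbacks
from ray.rllib.agents.ppo import PPOTrainer

class P2SROCallbacks(DefaultCallbacks):
    # The on_episode_end method shown above would be defined here.
    pass

trainer = PPOTrainer(config={
    "env": "my_restricted_game",  # placeholder; register the real env with tune.register_env
    "callbacks": P2SROCallbacks,
})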
Example #5
def training_workflow(config, reporter):
    # Setup policy and policy evaluation actors
    env = gym.make("CartPole-v0")
    policy = CustomPolicy(env.observation_space, env.action_space, {})
    workers = [
        RolloutWorker.as_remote().remote(
            env_creator=lambda c: gym.make("CartPole-v0"), policy=CustomPolicy)
        for _ in range(config["num_workers"])
    ]

    for _ in range(config["num_iters"]):
        # Broadcast weights to the policy evaluation workers
        weights = ray.put({DEFAULT_POLICY_ID: policy.get_weights()})
        for w in workers:
            w.set_weights.remote(weights)

        # Gather a batch of samples
        T1 = SampleBatch.concat_samples(
            ray.get([w.sample.remote() for w in workers]))

        # Update the remote policy replicas and gather another batch of samples
        new_value = policy.w * 2.0
        for w in workers:
            w.for_policy.remote(lambda p: p.update_some_value(new_value))

        # Gather another batch of samples
        T2 = SampleBatch.concat_samples(
            ray.get([w.sample.remote() for w in workers]))

        # Improve the policy using the T1 batch
        policy.learn_on_batch(T1)

        # Do some arbitrary updates based on the T2 batch
        policy.update_some_value(sum(T2["rewards"]))

        reporter(**collect_metrics(remote_workers=workers))
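This is RLlib's custom training-workflow pattern, in which the function is handed to Tune as a trainable. A minimal launch sketch; the resource and iteration counts are placeholders:

import ray
from ray import tune

if __name__ == "__main__":
    ray.init()
    tune.run(
        training_workflow,
        resources_per_trial={'cpu': 1, 'extra_cpu': 2},  # driver CPU plus CPUs for the remote workers
        config={'num_workers': 2, 'num_iters': 20},
    )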
Example #6
 def _set_avg_br_rew_deque(worker: RolloutWorker):
     worker.avg_br_reward_deque = avg_br_reward_deque
Example #7
 def _set_worker_converters(worker: RolloutWorker):
     worker_delegate_policy = worker.policy_map[delegate_policy_id]
     for p, player_converter in player_converters.items():
         worker.foreach_env(lambda env: env.set_obs_conversion_dict(p, player_converter))
     worker_delegate_policy.player_converters = player_converters
Example #8
 def _set_p2sro_policy_spec_on_best_response_policy(worker: RolloutWorker):
     br_policy = worker.policy_map["best_response"]
     br_policy.p2sro_policy_spec = active_policy_spec
     worker.br_player = player
 def _set_worker_converters(worker: RolloutWorker):
     worker_delegate_policy = worker.policy_map["delegate_policy"]
     for p in range(2):
         worker.foreach_env(lambda env: env.set_obs_conversion_dict(p, player_converters[p]))
     worker_delegate_policy.player_converters = player_converters
Example #10
File: runner.py  Project: aclyde11/RLDock
        # Target value for KL divergence.
        "kl_target": 0.01,
        'env_config': envconf,
        "num_gpus": 0,
        "num_workers": 1,
        'batch_mode': 'complete_episodes',
        'horizon': 50
    }

    # Copy the defaults so the shared ppo.DEFAULT_CONFIG dict is not mutated in place.
    ppo_config = ppo.DEFAULT_CONFIG.copy()
    ppo_config.update(d)

    get_dock_marks = []

    workers = RolloutWorker(env_creator,
                            ppo.PPOTFPolicy,
                            env_config=envconf,
                            policy_config=d)
    with open(checkpoint, 'rb') as c:
        c = c.read()
        c = pickle.loads(c)
        print(list(c.keys()))
        workers.restore(c['worker'])
    fp_path = "/Users/austin/PycharmProjects/RLDock/"
    with open("log.pml", 'w') as fp:
        with open("test.pml", 'w') as f:
            for j in range(1):
                rs = workers.sample()
                print(rs)
                print(list(rs.keys()))
                ls = rs['actions'].shape[0]
                for i in range(ls):
Example #11
 def _set_opponent_policy_distribution_for_one_worker(worker: RolloutWorker):
     worker.metanash_policy_specs = metanash_policy_specs
     worker.metanash_weights = metanash_weights
Example #12
 def _set_opponent_policy_distribution_for_worker(worker: RolloutWorker):
     worker.opponent_policy_distribution = opponent_policy_distribution