Example #1
def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=4)
    eval_env = CartPoleEnv(mode=4)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    qf_n, policy_n, target_qf_n, target_policy_n, eval_policy_n, expl_policy_n = \
        [], [], [], [], [], []
    for i in range(num_agent):
        qf = FlattenMlp(input_size=(obs_dim * num_agent +
                                    action_dim * num_agent),
                        output_size=1,
                        **variant['qf_kwargs'])
        policy = GumbelSoftmaxMlpPolicy(input_size=obs_dim,
                                        output_size=action_dim,
                                        **variant['policy_kwargs'])
        target_qf = copy.deepcopy(qf)
        target_policy = copy.deepcopy(policy)
        eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space),
            eval_policy,
        )
        qf_n.append(qf)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_policy_n.append(target_policy)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)
    trainer = MADDPGTrainer(qf_n=qf_n,
                            target_qf_n=target_qf_n,
                            policy_n=policy_n,
                            target_policy_n=target_policy_n,
                            **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
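
Note: Examples #1, #2, and #6 rely on module-level imports in the source maddpg.py script (FlattenMlp, the policy classes, copy, ptu, the path collector, replay buffer, trainer, and algorithm classes); the later examples import them inside the function. The critic input size obs_dim * num_agent + action_dim * num_agent is MADDPG's centralized critic, which conditions on every agent's observation and action. Below is a minimal variant sketch supplying the keys this entry point reads; the values and the kwarg names inside qf_kwargs, policy_kwargs, and algorithm_kwargs are illustrative assumptions, not taken from the project:

variant = dict(
    num_agent=2,
    qf_kwargs=dict(hidden_sizes=[64, 64]),      # assumed Mlp-style kwargs
    policy_kwargs=dict(hidden_sizes=[64, 64]),
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(),                      # MADDPGTrainer hyperparameters (defaults)
    algorithm_kwargs=dict(                      # assumed standard rlkit batch-RL arguments
        num_epochs=100,
        batch_size=256,
        max_path_length=200,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)
experiment(variant)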
Example #2
def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    from rlkit.envs.ma_wrappers import MAProbDiscreteEnv
    expl_env = MAProbDiscreteEnv(CartPoleEnv(mode=4))
    eval_env = MAProbDiscreteEnv(CartPoleEnv(mode=4))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf_n, policy_n, target_qf_n, target_policy_n, exploration_policy_n = \
        [], [], [], [], []
    for i in range(num_agent):
        qf = FlattenMlp(input_size=(obs_dim * num_agent +
                                    action_dim * num_agent),
                        output_size=1,
                        **variant['qf_kwargs'])
        policy = SoftmaxMlpPolicy(input_size=obs_dim,
                                  output_size=action_dim,
                                  **variant['policy_kwargs'])
        target_qf = copy.deepcopy(qf)
        target_policy = copy.deepcopy(policy)
        exploration_policy = policy
        qf_n.append(qf)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_policy_n.append(target_policy)
        exploration_policy_n.append(exploration_policy)

    eval_path_collector = MAMdpPathCollector(eval_env, policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, exploration_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)
    trainer = MADDPGTrainer(qf_n=qf_n,
                            target_qf_n=target_qf_n,
                            policy_n=policy_n,
                            target_policy_n=target_policy_n,
                            **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
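
Unlike Example #1, the wrapped environment exposes a Box action space (action_dim is read from action_space.low.size), so the SoftmaxMlpPolicy's probability vector is itself the action and doubles as the exploration policy. A conceptual, single-agent sketch of what a probability-action wrapper in the spirit of MAProbDiscreteEnv might do; this is a hypothetical illustration, not the project's implementation:

import numpy as np
import gym
from gym import spaces

class ProbDiscreteWrapper(gym.Wrapper):
    """Hypothetical: expose a Discrete(n) env through a Box of action probabilities."""

    def __init__(self, env):
        super().__init__(env)
        n = env.action_space.n
        self.action_space = spaces.Box(low=0.0, high=1.0, shape=(n,), dtype=np.float32)

    def step(self, action_probs):
        # Pick the discrete action with the highest probability (could also sample).
        return self.env.step(int(np.argmax(action_probs)))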
Example #3
File: maddpg.py  Project: maxiaoba/rlkit
def experiment(variant):
    from multi_differential_game import MultiDifferentialGame
    expl_env = MultiDifferentialGame(**variant['env_kwargs'])
    eval_env = MultiDifferentialGame(**variant['env_kwargs'])
    num_agent = expl_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf_n, policy_n, target_qf_n, target_policy_n, eval_policy_n, expl_policy_n = \
        [], [], [], [], [], []
    qf2_n, target_qf2_n = [], []
    qf_optimizer_n, qf2_optimizer_n, policy_optimizer_n = None, None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        qf = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
            variant['qf_kwargs']['num_layer'],
        )
        target_qf = copy.deepcopy(qf)
        from rlkit.torch.policies.deterministic_policies import TanhMlpPolicy
        policy = TanhMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
            variant['policy_kwargs']['num_layer'],
        )
        target_policy = copy.deepcopy(policy)

        qf_n.append(qf)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_policy_n.append(target_policy)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    if variant['random_exploration']:
        from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
        # prob_random_action=1.0 makes actions uniformly random; each agent's own
        # policy is wrapped to mirror the OUStrategy branch below.
        expl_policy_n = [
            PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            ) for policy in policy_n
        ]
    else:
        expl_policy_n = [
            PolicyWrappedWithExplorationStrategy(
                exploration_strategy=OUStrategy(
                    action_space=expl_env.action_space),
                policy=policy,
            ) for policy in policy_n
        ]
    eval_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.maddpg.maddpg import MADDPGTrainer
    trainer = MADDPGTrainer(qf_n=qf_n,
                            target_qf_n=target_qf_n,
                            policy_n=policy_n,
                            target_policy_n=target_policy_n,
                            qf2_n=qf2_n,
                            target_qf2_n=target_qf2_n,
                            qf_optimizer_n=qf_optimizer_n,
                            qf2_optimizer_n=qf2_optimizer_n,
                            policy_optimizer_n=policy_optimizer_n,
                            **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
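
The keys this example expects, with illustrative values; the hidden_dim/num_layer pair is expanded to hidden_sizes=[hidden_dim]*num_layer as in the code above, and env_kwargs (the MultiDifferentialGame constructor arguments) is left empty because its contents are not shown in this snippet:

variant = dict(
    env_kwargs=dict(),                             # MultiDifferentialGame args (not shown here)
    qf_kwargs=dict(hidden_dim=64, num_layer=2),    # -> hidden_sizes=[64, 64]
    policy_kwargs=dict(hidden_dim=64, num_layer=2),
    random_exploration=False,   # True: uniform EpsilonGreedy; False: per-agent OUStrategy
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(),
    algorithm_kwargs=dict(),                       # as in Example #1
)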
Example #4
def experiment(variant):
    import gym
    import robosumo.envs
    from robosumo_env_wrapper import RoboSumoEnv
    # `args` refers to command-line arguments parsed at module level in the source script.
    expl_env = RoboSumoEnv(gym.make('RoboSumo-{}-vs-{}-v0'.format(args.exp_name, args.exp_name)),
                           **variant['world_args'])
    eval_env = RoboSumoEnv(gym.make('RoboSumo-{}-vs-{}-v0'.format(args.exp_name, args.exp_name)),
                           **variant['world_args'])
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf_n, policy_n, target_qf_n, target_policy_n, eval_policy_n, expl_policy_n = \
        [], [], [], [], [], []
    qf2_n, target_qf2_n = [], []
    qf_optimizer_n, qf2_optimizer_n, policy_optimizer_n = None, None, None
    for i in range(num_agent):
        from rlkit.torch.networks.networks import FlattenMlp
        qf = FlattenMlp(
            input_size=(obs_dim*num_agent+action_dim*num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*variant['qf_kwargs']['num_layer'],
        )
        target_qf = copy.deepcopy(qf)
        from rlkit.torch.policies.deterministic_policies import TanhMlpPolicy
        policy = TanhMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            hidden_sizes=[variant['policy_kwargs']['hidden_dim']]*variant['policy_kwargs']['num_layer'],
        )
        target_policy = copy.deepcopy(policy)
        
        qf_n.append(qf)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_policy_n.append(target_policy)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    expl_policy_n = [PolicyWrappedWithExplorationStrategy(
                            exploration_strategy=OUStrategy(action_space=expl_env.action_space),
                            policy=policy,
                        ) for policy in policy_n]
    eval_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.maddpg.maddpg import MADDPGTrainer
    trainer = MADDPGTrainer(
        qf_n=qf_n,
        target_qf_n=target_qf_n,
        policy_n=policy_n,
        target_policy_n=target_policy_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        qf_optimizer_n=qf_optimizer_n,
        qf2_optimizer_n=qf2_optimizer_n,
        policy_optimizer_n=policy_optimizer_n,
        **variant['trainer_kwargs']
    )

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
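
experiment() here depends on a module-level args object; a plausible argparse preamble (hypothetical, covering only the exp_name attribute used above):

import argparse

# Hypothetical module-level argument parsing for this script; only exp_name is
# required by experiment(). With exp_name='Ant' the environment id becomes
# 'RoboSumo-Ant-vs-Ant-v0'.
parser = argparse.ArgumentParser()
parser.add_argument('--exp_name', type=str, default='Ant')
args = parser.parse_args()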
Example #5
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    # `args` refers to command-line arguments parsed at module level in the source script.
    expl_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=False,
                 world_args=variant['world_args']))
    eval_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=False,
                 world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    if variant['load_kwargs']['load']:
        load_dir = variant['load_kwargs']['load_dir']
        load_epoch = variant['load_kwargs']['load_epoch']
        load_data = torch.load('{}/itr_{}.pkl'.format(load_dir, load_epoch),
                               map_location='cpu')
        qf_n = load_data['trainer/qf_n']
        target_qf_n = load_data['trainer/target_qf_n']
        qf2_n, target_qf2_n = [], []
        policy_n = load_data['trainer/policy_n']
        target_policy_n = load_data['trainer/target_policy_n']

        qf_optimizer_n = load_data['trainer/qf_optimizer_n']
        qf2_optimizer_n = None
        policy_optimizer_n = load_data['trainer/policy_optimizer_n']

        replay_buffer = load_data['replay_buffer']
    else:
        qf_n, policy_n, target_qf_n, target_policy_n, eval_policy_n, expl_policy_n = \
            [], [], [], [], [], []
        qf2_n, target_qf2_n = [], []
        qf_optimizer_n, qf2_optimizer_n, policy_optimizer_n = None, None, None
        for i in range(num_agent):
            from rlkit.torch.networks.networks import FlattenMlp
            qf = FlattenMlp(
                input_size=(obs_dim * num_agent + action_dim * num_agent),
                output_size=1,
                hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
                variant['qf_kwargs']['num_layer'],
            )
            target_qf = copy.deepcopy(qf)
            from rlkit.torch.policies.deterministic_policies import TanhMlpPolicy
            policy = TanhMlpPolicy(
                input_size=obs_dim,
                output_size=action_dim,
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                variant['policy_kwargs']['num_layer'],
            )
            target_policy = copy.deepcopy(policy)

            qf_n.append(qf)
            policy_n.append(policy)
            target_qf_n.append(target_qf)
            target_policy_n.append(target_policy)

        from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
        replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                          expl_env,
                                          num_agent=num_agent)

    from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    expl_policy_n = [
        PolicyWrappedWithExplorationStrategy(
            exploration_strategy=OUStrategy(
                action_space=expl_env.action_space),
            policy=policy,
        ) for policy in policy_n
    ]
    eval_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.maddpg.maddpg import MADDPGTrainer
    trainer = MADDPGTrainer(qf_n=qf_n,
                            target_qf_n=target_qf_n,
                            policy_n=policy_n,
                            target_policy_n=target_policy_n,
                            qf2_n=qf2_n,
                            target_qf2_n=target_qf2_n,
                            qf_optimizer_n=qf_optimizer_n,
                            qf2_optimizer_n=qf2_optimizer_n,
                            policy_optimizer_n=policy_optimizer_n,
                            **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
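
When load is set, the example resumes from a snapshot saved as itr_<epoch>.pkl that must contain the trainer networks, optimizers, and replay buffer under the keys read above; an illustrative load_kwargs fragment:

variant['load_kwargs'] = dict(
    load=True,               # False: build fresh networks and an empty replay buffer
    load_dir='path/to/run',  # illustrative directory holding itr_<epoch>.pkl snapshots
    load_epoch=50,           # resumes from '{load_dir}/itr_50.pkl'
)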
Example #6
File: maddpg.py  Project: maxiaoba/rlkit
def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=3)
    eval_env = CartPoleEnv(mode=3)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf_n, policy_n, target_qf_n, target_policy_n, exploration_policy_n = \
        [], [], [], [], []
    qf2_n, target_qf2_n = [], []
    for i in range(num_agent):
        qf = FlattenMlp(
            input_size=(obs_dim*num_agent+action_dim*num_agent),
            output_size=1,
            **variant['qf_kwargs']
        )
        policy = TanhMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            **variant['policy_kwargs']
        )
        target_qf = copy.deepcopy(qf)
        target_policy = copy.deepcopy(policy)
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=OUStrategy(action_space=expl_env.action_space),
            policy=policy,
        )
        qf_n.append(qf)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_policy_n.append(target_policy)
        exploration_policy_n.append(exploration_policy)
        if variant['trainer_kwargs']['double_q']:
            qf2 = FlattenMlp(
                input_size=(obs_dim*num_agent+action_dim*num_agent),
                output_size=1,
                **variant['qf_kwargs']
            )
            target_qf2 = copy.deepcopy(qf2)
            qf2_n.append(qf2)
            target_qf2_n.append(target_qf2)

    eval_path_collector = MAMdpPathCollector(eval_env, policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, exploration_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)
    trainer = MADDPGTrainer(
        qf_n=qf_n,
        target_qf_n=target_qf_n,
        policy_n=policy_n,
        target_policy_n=target_policy_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
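
The only key this example reads beyond the earlier ones is the double_q flag inside trainer_kwargs, which decides whether a second critic (and its target) is built for every agent; an illustrative fragment:

variant['trainer_kwargs'] = dict(
    double_q=True,  # if True, qf2 and target_qf2 are created per agent and passed to MADDPGTrainer
    # remaining MADDPGTrainer hyperparameters omitted
)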