Example #1
# Module-level imports assumed by this snippet; the original script imports the
# project-specific rlkit modules lazily inside the function, as shown below.
# `get_generic_ma_path_information` is assumed to be a multi-agent path-logging
# helper imported elsewhere in the original script (module path not shown here).
import copy

import torch.nn as nn

import rlkit.torch.pytorch_util as ptu


def experiment(variant):
    num_agent = variant['num_agent']
    from sequential_differential_game import SequentialDifferentialGame
    expl_env = SequentialDifferentialGame(**variant['env_kwargs'])
    eval_env = SequentialDifferentialGame(**variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf1_n, qf2_n, cactor_n, policy_n = [], [], [], []
    target_qf1_n, target_qf2_n, target_policy_n = [], [], []
    expl_policy_n, eval_policy_n = [], []
    log_alpha_n, log_calpha_n = [], []
    # Build one set of networks per agent.
    for i in range(num_agent):
        from rlkit.torch.networks import FlattenMlp
        # Twin centralized critics: each Q-network sees the joint observation and joint action.
        qf1 = FlattenMlp(
            input_size=(obs_dim*num_agent+action_dim*num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*2,
        )
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=(obs_dim*num_agent+action_dim*num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*2,
        )
        target_qf2 = copy.deepcopy(qf2)
        from rlkit.torch.layers import SplitLayer
        # Conditional actor ("cactor"): takes the joint observation plus the other
        # agents' actions and outputs a (mean, log-std) split head.
        cactor = nn.Sequential(
            nn.Linear((obs_dim*num_agent+action_dim*(num_agent-1)),variant['cactor_kwargs']['hidden_dim']),
            nn.ReLU(),
            nn.Linear(variant['cactor_kwargs']['hidden_dim'],variant['cactor_kwargs']['hidden_dim']),
            nn.ReLU(),
            SplitLayer(layers=[nn.Linear(variant['cactor_kwargs']['hidden_dim'],action_dim),
                                nn.Linear(variant['cactor_kwargs']['hidden_dim'],action_dim)])
            )
        from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
        cactor = TanhGaussianPolicy(module=cactor)

        # Decentralized policy: maps the agent's own observation to a tanh-Gaussian action.
        policy = nn.Sequential(
            nn.Linear(obs_dim,variant['policy_kwargs']['hidden_dim']),
            nn.ReLU(),
            nn.Linear(variant['policy_kwargs']['hidden_dim'],variant['policy_kwargs']['hidden_dim']),
            nn.ReLU(),
            SplitLayer(layers=[nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim),
                                nn.Linear(variant['policy_kwargs']['hidden_dim'],action_dim)])
            )
        policy = TanhGaussianPolicy(module=policy)
        target_policy = copy.deepcopy(policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        if variant['random_exploration']:
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy
        
        qf1_n.append(qf1)
        qf2_n.append(qf2)
        cactor_n.append(cactor)
        policy_n.append(policy)
        target_qf1_n.append(target_qf1)
        target_qf2_n.append(target_qf2)
        target_policy_n.append(target_policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

        if variant['trainer_kwargs']['state_dependent_alpha']:
            # State-dependent entropy temperatures (for the policy and the cactor),
            # parameterized as MLPs over the joint observation.
            log_alpha = FlattenMlp(
                            input_size=obs_dim*num_agent,
                            output_size=1,
                            hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*2,
                        )
            log_calpha = FlattenMlp(
                            input_size=obs_dim*num_agent,
                            output_size=1,
                            hidden_sizes=[variant['qf_kwargs']['hidden_dim']]*2,
                        )
            log_alpha_n.append(log_alpha)
            log_calpha_n.append(log_calpha)
        
    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.prg.prg import PRGTrainer
    trainer = PRGTrainer(
        env=expl_env,
        qf1_n=qf1_n,
        target_qf1_n=target_qf1_n,
        qf2_n=qf2_n,
        target_qf2_n=target_qf2_n,
        policy_n=policy_n,
        target_policy_n=target_policy_n,
        cactor_n=cactor_n,
        log_alpha_n=log_alpha_n,
        log_calpha_n=log_calpha_n,
        **variant['trainer_kwargs']
    )

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
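
A minimal driver sketch (not part of the original script) showing how `experiment` might be invoked. The `variant` keys are the ones the function actually reads; all values, the `env_kwargs` contents, and the extra entries that `trainer_kwargs`/`algorithm_kwargs` would need for this fork's `PRGTrainer` and `TorchBatchRLAlgorithm` are illustrative assumptions.

# Hypothetical driver for Example #1; values are placeholders, not tuned settings.
variant = dict(
    num_agent=2,
    env_kwargs=dict(),                  # constructor kwargs for SequentialDifferentialGame (not shown here)
    qf_kwargs=dict(hidden_dim=64),
    cactor_kwargs=dict(hidden_dim=64),
    policy_kwargs=dict(hidden_dim=64),
    random_exploration=False,
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(
        state_dependent_alpha=False,    # plus any other PRGTrainer hyperparameters
    ),
    algorithm_kwargs=dict(              # typical rlkit batch-RL settings; exact keys depend on this fork
        batch_size=256,
        max_path_length=100,
        num_epochs=100,
        num_eval_steps_per_epoch=100,
        num_expl_steps_per_train_loop=100,
        num_trains_per_train_loop=100,
    ),
)
experiment(variant)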
Example #2
# Module-level imports assumed by this snippet; `args` is assumed to be an
# argparse namespace (providing `exp_name`) and `get_generic_ma_path_information`
# a multi-agent path-logging helper, both defined elsewhere in the original script.
import copy

import torch.nn as nn

import rlkit.torch.pytorch_util as ptu


def experiment(variant):
    num_agent = variant['num_agent']
    from differential_game import DifferentialGame
    expl_env = DifferentialGame(game_name=args.exp_name)
    eval_env = DifferentialGame(game_name=args.exp_name)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy_n, eval_policy_n, expl_policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n = \
        [], [], [], [], [], [], []
    for i in range(num_agent):
        from rlkit.torch.layers import SplitLayer, ReshapeLayer
        # Mixture policy heads: component weights, means, and log-stds for a
        # mixture of m tanh-Gaussian components.
        weight_head = nn.Linear(variant['policy_kwargs']['hidden_dim'],
                                variant['policy_kwargs']['m'])
        mean_head = nn.Sequential(
            nn.Linear(variant['policy_kwargs']['hidden_dim'],
                      action_dim * variant['policy_kwargs']['m']),
            ReshapeLayer(shape=[variant['policy_kwargs']['m'], action_dim]))
        logstd_head = nn.Sequential(
            nn.Linear(variant['policy_kwargs']['hidden_dim'],
                      action_dim * variant['policy_kwargs']['m']),
            ReshapeLayer(shape=[variant['policy_kwargs']['m'], action_dim]))
        policy = nn.Sequential(
            nn.Linear(obs_dim, variant['policy_kwargs']['hidden_dim']),
            nn.ReLU(),
            nn.Linear(variant['policy_kwargs']['hidden_dim'],
                      variant['policy_kwargs']['hidden_dim']), nn.ReLU(),
            SplitLayer(layers=[weight_head, mean_head, logstd_head]))
        from rlkit.torch.policies.mix_tanh_gaussian_policy import MixTanhGaussianPolicy
        policy = MixTanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
        if variant['random_exploration']:
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy
        from rlkit.torch.networks import FlattenMlp
        # Twin centralized critics over the joint observation-action vector.
        qf1 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * 2,
        )
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * 2,
        )
        target_qf2 = copy.deepcopy(qf2)
        policy_n.append(policy)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    from rlkit.torch.masac.masac import MASACTrainer
    trainer = MASACTrainer(env=expl_env,
                           qf1_n=qf1_n,
                           target_qf1_n=target_qf1_n,
                           qf2_n=qf2_n,
                           target_qf2_n=target_qf2_n,
                           policy_n=policy_n,
                           **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
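
A minimal driver sketch (not part of the original script) for Example #2. `args` is shown here as a hand-built argparse namespace; the game name and all numeric values are illustrative assumptions, and `trainer_kwargs`/`algorithm_kwargs` must match this fork's `MASACTrainer` and `TorchBatchRLAlgorithm` signatures.

# Hypothetical driver for Example #2; values are placeholders.
import argparse

args = argparse.Namespace(exp_name='zero_sum')  # assumed game name; use one supported by DifferentialGame
variant = dict(
    num_agent=2,
    policy_kwargs=dict(hidden_dim=64, m=4),     # m = number of mixture components
    qf_kwargs=dict(hidden_dim=64),
    random_exploration=False,
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(),                      # MASACTrainer hyperparameters
    algorithm_kwargs=dict(                      # typical rlkit batch-RL settings; exact keys depend on this fork
        batch_size=256,
        max_path_length=100,
        num_epochs=100,
        num_eval_steps_per_epoch=100,
        num_expl_steps_per_train_loop=100,
        num_trains_per_train_loop=100,
    ),
)
experiment(variant)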
Example #3
# Module-level imports assumed by this snippet; `args` is assumed to be an
# argparse namespace (providing `exp_name` and `ce`) and
# `get_generic_ma_path_information` a multi-agent path-logging helper, both
# defined elsewhere in the original script.
import copy

import torch
import torch.nn as nn

import rlkit.torch.pytorch_util as ptu


def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=False,
                 world_args=variant['world_args']))
    eval_env = ParticleEnv(
        make_env(args.exp_name,
                 discrete_action_space=False,
                 world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    if variant['load_kwargs']['load']:
        # Resume from a saved snapshot: restore networks, optimizers, and the replay buffer.
        load_dir = variant['load_kwargs']['load_dir']
        load_epoch = variant['load_kwargs']['load_epoch']
        load_data = torch.load('{}/itr_{}.pkl'.format(load_dir, load_epoch),
                               map_location='cpu')
        qf1_n = load_data['trainer/qf1_n']
        target_qf1_n = load_data['trainer/target_qf1_n']
        qf2_n = load_data['trainer/qf2_n']
        target_qf2_n = load_data['trainer/target_qf2_n']
        cactor_n = load_data['trainer/cactor_n']
        policy_n = load_data['trainer/policy_n']
        log_alpha_n = load_data['trainer/log_alpha_n']

        qf1_optimizer_n = load_data['trainer/qf1_optimizer_n']
        qf2_optimizer_n = load_data['trainer/qf2_optimizer_n']
        policy_optimizer_n = load_data['trainer/policy_optimizer_n']
        cactor_optimizer_n = load_data['trainer/cactor_optimizer_n']
        alpha_optimizer_n = load_data['trainer/alpha_optimizer_n']
        if args.ce:
            log_calpha_n = load_data['trainer/log_calpha_n']
            calpha_optimizer_n = load_data['trainer/calpha_optimizer_n']

        replay_buffer = load_data['replay_buffer']
    else:
        qf1_n, qf2_n, cactor_n, policy_n = [], [], [], []
        target_qf1_n, target_qf2_n = [], []
        log_alpha_n, log_calpha_n = None, None
        qf1_optimizer_n, qf2_optimizer_n, policy_optimizer_n = None, None, None
        cactor_optimizer_n, alpha_optimizer_n, calpha_optimizer_n = None, None, None
        for i in range(num_agent):
            from rlkit.torch.networks import FlattenMlp
            qf1 = FlattenMlp(
                input_size=(obs_dim * num_agent + action_dim * num_agent),
                output_size=1,
                hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
                variant['qf_kwargs']['num_layer'],
            )
            target_qf1 = copy.deepcopy(qf1)
            qf2 = FlattenMlp(
                input_size=(obs_dim * num_agent + action_dim * num_agent),
                output_size=1,
                hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
                variant['qf_kwargs']['num_layer'],
            )
            target_qf2 = copy.deepcopy(qf2)
            from rlkit.torch.layers import SplitLayer
            # A decentralized cactor conditions only on the agent's own observation;
            # the centralized variant conditions on all agents' observations.
            if variant['trainer_kwargs']['dec_cactor']:
                input_size = obs_dim + action_dim * (num_agent - 1)
            else:
                input_size = obs_dim * num_agent + action_dim * (num_agent - 1)
            cactor = nn.Sequential(
                FlattenMlp(
                    input_size=input_size,
                    output_size=variant['cactor_kwargs']['hidden_dim'],
                    hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] *
                    (variant['cactor_kwargs']['num_layer'] - 1),
                ),
                SplitLayer(layers=[
                    nn.Linear(variant['cactor_kwargs']['hidden_dim'],
                              action_dim),
                    nn.Linear(variant['cactor_kwargs']['hidden_dim'],
                              action_dim)
                ]))
            from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
            cactor = TanhGaussianPolicy(module=cactor)

            policy = nn.Sequential(
                FlattenMlp(
                    input_size=obs_dim,
                    output_size=variant['policy_kwargs']['hidden_dim'],
                    hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                    (variant['policy_kwargs']['num_layer'] - 1),
                ),
                SplitLayer(layers=[
                    nn.Linear(variant['policy_kwargs']['hidden_dim'],
                              action_dim),
                    nn.Linear(variant['policy_kwargs']['hidden_dim'],
                              action_dim)
                ]))
            policy = TanhGaussianPolicy(module=policy)

            qf1_n.append(qf1)
            qf2_n.append(qf2)
            cactor_n.append(cactor)
            policy_n.append(policy)
            target_qf1_n.append(target_qf1)
            target_qf2_n.append(target_qf2)

        # Build the shared replay buffer once, after the per-agent networks.
        from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
        replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                          expl_env,
                                          num_agent=num_agent)

    from rlkit.torch.policies.make_deterministic import MakeDeterministic
    eval_policy_n = [MakeDeterministic(policy) for policy in policy_n]
    expl_policy_n = policy_n

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)

    from rlkit.torch.prg.prg import PRGTrainer
    trainer = PRGTrainer(env=expl_env,
                         qf1_n=qf1_n,
                         target_qf1_n=target_qf1_n,
                         qf2_n=qf2_n,
                         target_qf2_n=target_qf2_n,
                         policy_n=policy_n,
                         cactor_n=cactor_n,
                         log_alpha_n=log_alpha_n,
                         log_calpha_n=log_calpha_n,
                         qf1_optimizer_n=qf1_optimizer_n,
                         qf2_optimizer_n=qf2_optimizer_n,
                         policy_optimizer_n=policy_optimizer_n,
                         cactor_optimizer_n=cactor_optimizer_n,
                         alpha_optimizer_n=alpha_optimizer_n,
                         calpha_optimizer_n=calpha_optimizer_n,
                         **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
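
A minimal driver sketch (not part of the original script) for Example #3 with loading disabled. The particle-env scenario name is an assumption, `world_args` is forwarded unchanged to `make_env`, and any `trainer_kwargs`/`algorithm_kwargs` entries beyond `dec_cactor` depend on this fork's `PRGTrainer` and `TorchBatchRLAlgorithm` signatures.

# Hypothetical driver for Example #3; values are placeholders.
import argparse

args = argparse.Namespace(exp_name='simple_spread', ce=False)  # assumed scenario name and flag
variant = dict(
    world_args=dict(),                          # forwarded to make_env (scenario-specific)
    load_kwargs=dict(load=False, load_dir=None, load_epoch=None),
    qf_kwargs=dict(hidden_dim=64, num_layer=2),
    cactor_kwargs=dict(hidden_dim=64, num_layer=2),
    policy_kwargs=dict(hidden_dim=64, num_layer=2),
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(
        dec_cactor=False,                       # plus any other PRGTrainer hyperparameters
    ),
    algorithm_kwargs=dict(                      # typical rlkit batch-RL settings; exact keys depend on this fork
        batch_size=256,
        max_path_length=25,
        num_epochs=100,
        num_eval_steps_per_epoch=100,
        num_expl_steps_per_train_loop=100,
        num_trains_per_train_loop=100,
    ),
)
experiment(variant)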