# Assumed module-level imports shared by the three launcher scripts below;
# the import path of get_generic_ma_path_information is fork-specific.
import copy
import os.path as osp

import torch
import torch.nn as nn

import rlkit.torch.pytorch_util as ptu
from rlkit.core.eval_util import get_generic_ma_path_information


# Launcher: R2G-GNN on the sequential differential game.
def experiment(variant):
    num_agent = variant['num_agent']
    from sequential_differential_game import SequentialDifferentialGame
    expl_env = SequentialDifferentialGame(**variant['env_kwargs'])
    eval_env = SequentialDifferentialGame(**variant['env_kwargs'])
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    # First critic: a full graph over agents feeds a graph context network,
    # whose per-node embedding is scored by an MLP Q-head.
    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_1 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    cg1 = GraphContextNet(
        graph_builder_1,
        obs_dim,
        action_dim,
        use_attention=variant['graph_kwargs']['use_attention'],
        num_layer=variant['graph_kwargs']['num_layer'],
        node_dim=variant['graph_kwargs']['hidden_dim'],
        output_activation='relu',
    )
    target_cg1 = copy.deepcopy(cg1)
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = FlattenMlp(
        input_size=variant['graph_kwargs']['hidden_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']]
        * (variant['qf_kwargs']['num_layer'] - 1),
    )
    target_qf1 = copy.deepcopy(qf1)

    # Second (twin) critic, built identically.
    graph_builder_2 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cg2 = GraphContextNet(
        graph_builder_2,
        obs_dim,
        action_dim,
        use_attention=variant['graph_kwargs']['use_attention'],
        num_layer=variant['graph_kwargs']['num_layer'],
        node_dim=variant['graph_kwargs']['hidden_dim'],
        output_activation='relu',
    )
    target_cg2 = copy.deepcopy(cg2)
    qf2 = FlattenMlp(
        input_size=variant['graph_kwargs']['hidden_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']]
        * (variant['qf_kwargs']['num_layer'] - 1),
    )
    target_qf2 = copy.deepcopy(qf2)

    # Central actor (cactor): graph context net followed by a tanh-Gaussian head.
    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cgca = GraphContextNet(
        graph_builder_ca,
        obs_dim,
        action_dim,
        use_attention=variant['graph_kwargs']['use_attention'],
        num_layer=variant['graph_kwargs']['num_layer'],
        node_dim=variant['graph_kwargs']['hidden_dim'],
        output_activation='relu',
    )
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    cactor = nn.Sequential(
        cgca,
        FlattenMlp(
            input_size=variant['graph_kwargs']['hidden_dim'],
            output_size=variant['cactor_kwargs']['hidden_dim'],
            hidden_sizes=[variant['cactor_kwargs']['hidden_dim']]
            * (variant['cactor_kwargs']['num_layer'] - 1),
        ),
        nn.ReLU(),
        # Mean / log-std heads; their input width must match the MLP output above.
        SplitLayer(layers=[
            nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim),
            nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim)]),
    )
    cactor = TanhGaussianPolicy(module=cactor)

    # Decentralized per-agent policies: stochastic for exploration,
    # deterministic for evaluation.
    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for i in range(num_agent):
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']]
                * (variant['policy_kwargs']['num_layer'] - 1),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]),
        )
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        expl_policy = policy
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(
        variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn3 import R2GGNNTrainer
    trainer = R2GGNNTrainer(
        env=expl_env,
        cg1=cg1,
        target_cg1=target_cg1,
        qf1=qf1,
        target_qf1=target_qf1,
        cg2=cg2,
        target_cg2=target_cg2,
        qf2=qf2,
        target_qf2=target_qf2,
        cactor=cactor,
        policy_n=policy_n,
        **variant['trainer_kwargs']
    )

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
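# For reference, a minimal sketch of the `variant` dictionary this launcher
# reads. Every key below is consumed by experiment() above; the concrete
# values (and the env_kwargs contents) are illustrative assumptions, not
# settings from this codebase.
example_variant = dict(
    num_agent=2,
    env_kwargs=dict(),  # SequentialDifferentialGame kwargs (hypothetical)
    graph_kwargs=dict(use_attention=True, num_layer=2, hidden_dim=64),
    qf_kwargs=dict(hidden_dim=64, num_layer=2),
    cactor_kwargs=dict(hidden_dim=64, num_layer=2),
    policy_kwargs=dict(hidden_dim=64, num_layer=2),
    trainer_kwargs=dict(),
    algorithm_kwargs=dict(batch_size=256),  # plus the usual rlkit schedule keys
    replay_buffer_size=int(1e6),
)
# experiment(example_variant)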
# (Separate launcher script.) R2G-GNN on the multi-agent particle
# environments; the scenario name comes from the module-level argparse `args`.
def experiment(variant):
    import sys
    sys.path.append("./multiagent-particle-envs")
    from make_env import make_env
    from particle_env_wrapper import ParticleEnv
    expl_env = ParticleEnv(make_env(
        args.exp_name, discrete_action_space=False,
        world_args=variant['world_args']))
    eval_env = ParticleEnv(make_env(
        args.exp_name, discrete_action_space=False,
        world_args=variant['world_args']))
    num_agent = expl_env.num_agent
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    # Twin critics: full agent graph -> graph context net -> MLP Q-head.
    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_1 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    cg1 = GraphContextNet(
        graph_builder_1, obs_dim, action_dim,
        output_activation='lrelu0.2',
        **variant['graph_kwargs'])
    target_cg1 = copy.deepcopy(cg1)
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']]
        * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf1 = copy.deepcopy(qf1)

    graph_builder_2 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cg2 = GraphContextNet(
        graph_builder_2, obs_dim, action_dim,
        output_activation='lrelu0.2',
        **variant['graph_kwargs'])
    target_cg2 = copy.deepcopy(cg2)
    qf2 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']]
        * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf2 = copy.deepcopy(qf2)

    # GraphSAGE context network for the central actor; here it is passed to
    # the trainer separately rather than nested inside the cactor module.
    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    cgca = GNNNet(
        pre_graph_builder=graph_builder_ca,
        node_dim=variant['graph_kwargs']['node_dim'],
        conv_type='GSage',
        num_conv_layers=variant['graph_kwargs']['num_layer'],
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
    )

    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    cactor = nn.Sequential(
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=variant['cactor_kwargs']['hidden_dim'],
            hidden_sizes=[variant['cactor_kwargs']['hidden_dim']]
            * (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ),
        nn.LeakyReLU(negative_slope=0.2),
        # Mean / log-std heads; their input width must match the MLP output above.
        SplitLayer(layers=[
            nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim),
            nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim)]),
    )
    cactor = TanhGaussianPolicy(module=cactor)

    # Decentralized per-agent policies.
    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for i in range(num_agent):
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']]
                * (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]),
        )
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        expl_policy = policy
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(
        variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn3 import R2GGNNTrainer
    trainer = R2GGNNTrainer(
        env=expl_env,
        cg1=cg1,
        target_cg1=target_cg1,
        qf1=qf1,
        target_qf1=target_qf1,
        cg2=cg2,
        target_cg2=target_cg2,
        qf2=qf2,
        target_qf2=target_qf2,
        cgca=cgca,
        cactor=cactor,
        policy_n=policy_n,
        **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)

    # Save the initial (untrained) parameters as itr_-1.pkl.
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
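# The itr_-1.pkl dump above makes the untrained networks recoverable later.
# A minimal sketch of reading it back, assuming (as in rlkit) that the
# snapshot returned by _get_snapshot() is a picklable dict:
def load_initial_snapshot(snapshot_dir):
    import os.path as osp
    import torch
    snapshot = torch.load(osp.join(snapshot_dir, 'itr_-1.pkl'),
                          map_location='cpu')
    for key, value in snapshot.items():  # e.g. trainer networks, env, stats
        print(key, type(value).__name__)
    return snapshot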
# (Separate launcher script.) R2G-GNN on the multi-agent differential game,
# with optional uniform-random warm-up exploration.
def experiment(variant):
    from multi_differential_game import MultiDifferentialGame
    expl_env = MultiDifferentialGame(**variant['env_kwargs'])
    eval_env = MultiDifferentialGame(**variant['env_kwargs'])
    num_agent = expl_env.agent_num
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    # Twin critics: full agent graph -> graph context net -> MLP Q-head.
    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_1 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    cg1 = GraphContextNet(
        graph_builder_1, obs_dim, action_dim,
        output_activation='lrelu0.2',
        **variant['graph_kwargs'])
    target_cg1 = copy.deepcopy(cg1)
    from rlkit.torch.networks.networks import FlattenMlp
    qf1 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']]
        * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf1 = copy.deepcopy(qf1)

    graph_builder_2 = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cg2 = GraphContextNet(
        graph_builder_2, obs_dim, action_dim,
        output_activation='lrelu0.2',
        **variant['graph_kwargs'])
    target_cg2 = copy.deepcopy(cg2)
    qf2 = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']]
        * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    target_qf2 = copy.deepcopy(qf2)

    # Context network for the central actor.
    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cgca = GraphContextNet(
        graph_builder_ca, obs_dim, action_dim,
        output_activation='lrelu0.2',
        **variant['graph_kwargs'])

    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    cactor = nn.Sequential(
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=variant['cactor_kwargs']['hidden_dim'],
            hidden_sizes=[variant['cactor_kwargs']['hidden_dim']]
            * (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ),
        nn.LeakyReLU(negative_slope=0.2),
        # Mean / log-std heads; their input width must match the MLP output above.
        SplitLayer(layers=[
            nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim),
            nn.Linear(variant['cactor_kwargs']['hidden_dim'], action_dim)]),
    )
    cactor = TanhGaussianPolicy(module=cactor)

    # Decentralized per-agent policies, optionally wrapped for pure random
    # exploration during data collection.
    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for i in range(num_agent):
        policy = nn.Sequential(
            FlattenMlp(
                input_size=obs_dim,
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']]
                * (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)]),
        )
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        if variant['random_exploration']:
            # Act uniformly at random while exploring (epsilon = 1).
            from rlkit.exploration_strategies.base import (
                PolicyWrappedWithExplorationStrategy)
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(
                    expl_env.action_space, prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(
        variant['replay_buffer_size'], expl_env, num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn3 import R2GGNNTrainer
    trainer = R2GGNNTrainer(
        env=expl_env,
        cg1=cg1,
        target_cg1=target_cg1,
        qf1=qf1,
        target_qf1=target_qf1,
        cg2=cg2,
        target_cg2=target_cg2,
        qf2=qf2,
        target_qf2=target_qf2,
        cgca=cgca,
        cactor=cactor,
        policy_n=policy_n,
        **variant['trainer_kwargs']
    )

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)

    # Save the initial (untrained) parameters as itr_-1.pkl.
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
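# All three launchers repeat the same builder -> context net -> Q-head
# construction for each twin critic. A hypothetical helper (not part of the
# codebase) that would collapse the duplication, assuming the same
# FullGraphBuilder / GraphContextNet / FlattenMlp APIs used above:
def make_critic(variant, obs_dim, action_dim, num_agent):
    import copy
    import torch.nn as nn
    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    from rlkit.torch.networks.graph_context_network import GraphContextNet
    from rlkit.torch.networks.networks import FlattenMlp
    builder = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cg = GraphContextNet(
        builder, obs_dim, action_dim,
        output_activation='lrelu0.2',
        **variant['graph_kwargs'])
    qf = FlattenMlp(
        input_size=variant['graph_kwargs']['node_dim'] + action_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']]
        * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    # Targets are deep copies, matching the pattern in the launchers above.
    return cg, copy.deepcopy(cg), qf, copy.deepcopy(qf)

# Usage (inside experiment()):
#   cg1, target_cg1, qf1, target_qf1 = make_critic(variant, obs_dim, action_dim, num_agent)
#   cg2, target_cg2, qf2, target_qf2 = make_critic(variant, obs_dim, action_dim, num_agent)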