def get_leader_agent(env, agent_id, hidden_layer_sizes, max_replay_buffer_size):
    """Build the leader agent: a stochastic policy over the leader's actions and a
    Q-function over the state together with both agents' actions."""
    action_space = env.env_specs.action_space[agent_id]
    return LeaderAgent(
        env_specs=env.env_specs,
        policy=StochasticMLPPolicy(
            input_shapes=(env.num_state, ),
            output_shape=(env.action_num, ),
            hidden_layer_sizes=hidden_layer_sizes,
            # output_activation=gumbel_softmax,
            # preprocessor='LSTM',
            name='policy_agent_{}'.format(agent_id)
        ),
        qf=MLPValueFunction(
            # state plus the leader's and the follower's actions
            input_shapes=(env.num_state + env.action_num * 2, ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name='qf_agent_{}'.format(agent_id)
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=env.num_state,
            action_dim=env.action_num,
            opponent_action_dim=env.action_num,
            max_replay_buffer_size=max_replay_buffer_size
        ),
        exploration_strategy=OUExploration(action_space),
        gradient_clipping=10.,
        agent_id=agent_id,
    )

def get_maddpg_agent(env, agent_id, hidden_layer_sizes, max_replay_buffer_size):
    """Build a standard MADDPG agent: a deterministic policy over the agent's own
    observation and a centralized Q-function over its observation and the joint action."""
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    return MADDPGAgent(
        env_specs=env.env_specs,
        policy=DeterministicMLPPolicy(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name='policy_agent_{}'.format(agent_id)
        ),
        qf=MLPValueFunction(
            # width of the action input to the centralized Q-function; cast to int
            # because shape dimensions must be integers
            input_shapes=(observation_space.shape,
                          (int(env.env_specs.action_space.flat_dim * 2.5), )),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name='qf_agent_{}'.format(agent_id)
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            opponent_action_dim=env.env_specs.action_space.opponent_flat_dim(agent_id),
            max_replay_buffer_size=max_replay_buffer_size
        ),
        exploration_strategy=OUExploration(action_space),
        gradient_clipping=10.,
        agent_id=agent_id,
    )

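
# Usage sketch (hypothetical, not part of the factories themselves): build one
# MADDPG agent per player for an environment matching the interface assumed
# above (an `env.env_specs` with per-agent observation and action spaces).
# The `agent_ids` argument and the default sizes are illustrative assumptions.
def build_maddpg_population(env, agent_ids, hidden_layer_sizes=(100, 100),
                            max_replay_buffer_size=int(1e5)):
    """Return a list with one MADDPGAgent per id in `agent_ids`."""
    return [
        get_maddpg_agent(env, agent_id, hidden_layer_sizes, max_replay_buffer_size)
        for agent_id in agent_ids
    ]
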
def get_follower_deterministic_agent(env, agent_id, hidden_layer_sizes, max_replay_buffer_size):
    """Build a deterministic follower agent that conditions on the leader's action and the state."""
    action_space = env.env_specs.action_space[agent_id]
    return FollowerAgent(
        env_specs=env.env_specs,
        policy=DeterministicMLPPolicy(
            # leader's action (env.action_num dims) plus the state (1 dim)
            input_shapes=(env.action_num + 1, ),
            output_shape=(env.action_num, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name='policy_agent_{}'.format(agent_id)
        ),
        qf=MLPValueFunction(
            input_shapes=(2, ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name='qf_agent_{}'.format(agent_id)
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=env.action_num + 1,
            action_dim=1,
            opponent_action_dim=env.env_specs.action_space.opponent_flat_dim(agent_id),
            max_replay_buffer_size=max_replay_buffer_size
        ),
        exploration_strategy=OUExploration(action_space),
        gradient_clipping=10.,
        agent_id=agent_id,
    )

def get_follower_q_agent(env, agent_id, hidden_layer_sizes, max_replay_buffer_size):
    """Build a follower agent driven purely by a Q-function (no separate policy network);
    the Q-function scores the state together with both agents' actions."""
    action_space = env.env_specs.action_space[agent_id]
    return FollowerQAgent(
        env_specs=env.env_specs,
        policy=None,
        qf=MLPValueFunction(
            input_shapes=(env.num_state + env.action_num * 2, ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name='qf_agent_{}'.format(agent_id)
        ),
        replay_buffer=IndexedReplayBuffer(
            # follower observes the state plus the leader's action
            observation_dim=env.num_state + env.action_num,
            action_dim=env.action_num,
            opponent_action_dim=env.action_num,
            max_replay_buffer_size=max_replay_buffer_size
        ),
        exploration_strategy=OUExploration(action_space),
        gradient_clipping=10.,
        agent_id=agent_id,
    )
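
# Usage sketch (hypothetical): assemble the leader/follower pair that the
# factories above are written for. The agent indices (leader = 0, follower = 1),
# the layer sizes, and the buffer size are illustrative assumptions; `env` must
# expose `env_specs`, `num_state`, and `action_num` as assumed above.
def build_leader_follower_pair(env, hidden_layer_sizes=(64, 64),
                               max_replay_buffer_size=int(1e5)):
    """Return (leader_agent, follower_q_agent) built from the factories above."""
    leader = get_leader_agent(env, 0, hidden_layer_sizes, max_replay_buffer_size)
    follower = get_follower_q_agent(env, 1, hidden_layer_sizes, max_replay_buffer_size)
    return leader, follower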