def get_pr2_agent(env, agent_id, hidden_layer_sizes, max_replay_buffer_size,
                  policy_type="dete"):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    opponent_action_shape = (
        env.env_specs.action_space.opponent_flat_dim(agent_id),
    )
    print(opponent_action_shape, "opponent_action_shape")
    if policy_type == "dete":
        policy_fn = DeterministicMLPPolicy
        exploration_strategy = OUExploration(action_space)
    elif policy_type == "gumble":
        policy_fn = RelaxedSoftmaxMLPPolicy
        exploration_strategy = None
    else:
        raise ValueError("Unsupported policy_type: {}".format(policy_type))
    return PR2Agent(
        env_specs=env.env_specs,
        policy=policy_fn(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
        ),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                action_space.shape,
                opponent_action_shape,
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        ind_qf=MLPValueFunction(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="ind_qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            opponent_action_dim=opponent_action_shape[0],
        ),
        opponent_policy=policy_fn(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_policy_agent_{}".format(agent_id),
        ),
        exploration_strategy=exploration_strategy,
        gradient_clipping=10.0,
        agent_id=agent_id,
    )
def get_ddpgtom_agent(env, agent_id, hidden_layer_sizes, max_replay_buffer_size):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    return DDPGToMAgent(
        env_specs=env.env_specs,
        policy=DeterministicMLPPolicy(
            input_shapes=(
                observation_space.shape,
                (env.env_specs.action_space.opponent_flat_dim(agent_id), ),
            ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name='policy_agent_{}'.format(agent_id)),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                (env.env_specs.action_space.flat_dim, ),
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name='qf_agent_{}'.format(agent_id)),
        opponent_policy=DeterministicMLPPolicy(
            input_shapes=(observation_space.shape, ),
            output_shape=(
                env.env_specs.action_space.opponent_flat_dim(agent_id),
            ),
            hidden_layer_sizes=hidden_layer_sizes,
            name='opponent_policy_agent_{}'.format(agent_id)),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            opponent_action_dim=env.env_specs.action_space.opponent_flat_dim(
                agent_id),
            max_replay_buffer_size=max_replay_buffer_size),
        exploration_strategy=OUExploration(action_space),
        gradient_clipping=10.,
        agent_id=agent_id,
    )
def get_ddpg_agent(env, agent_id, hidden_layer_sizes, max_replay_buffer_size,
                   policy_type='dete'):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    if policy_type == 'dete':
        policy_fn = DeterministicMLPPolicy
        exploration_strategy = OUExploration(action_space)
    elif policy_type == 'gumble':
        policy_fn = RelaxedSoftmaxMLPPolicy
        exploration_strategy = None
    else:
        raise ValueError('Unsupported policy_type: {}'.format(policy_type))
    return DDPGAgent(
        env_specs=env.env_specs,
        policy=policy_fn(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name='policy_agent_{}'.format(agent_id)),
        qf=MLPValueFunction(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name='qf_agent_{}'.format(agent_id)),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size),
        exploration_strategy=exploration_strategy,
        gradient_clipping=10.,
        agent_id=agent_id,
    )
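# Usage sketch (not part of the original factories): the per-agent factories in
# this module all follow the same calling pattern -- pass the multi-agent env
# wrapper that exposes `env.env_specs`, an agent id, the MLP hidden sizes, and
# a replay-buffer capacity. The helper below is illustrative only; `n_agents`
# is passed explicitly because how the wrapper reports its agent count is an
# assumption this sketch avoids making.
def build_ddpg_agents(env, n_agents, hidden_layer_sizes=(64, 64),
                      max_replay_buffer_size=int(1e5)):
    """Construct one independent DDPG agent per agent id (illustrative)."""
    return [
        get_ddpg_agent(env, agent_id, hidden_layer_sizes,
                       max_replay_buffer_size)
        for agent_id in range(n_agents)
    ]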
def get_rommeo_agent(
    env,
    agent_id,
    hidden_layer_sizes,
    max_replay_buffer_size,
    policy_type="gaussian",
    uniform=False,
    custom_b=False,
    bi=1.0,
    bj=1.0,
):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    opponent_action_shape = (
        env.env_specs.action_space.opponent_flat_dim(agent_id),
    )
    if policy_type == "gaussian":
        policy_fn = GaussianMLPPolicy
    elif policy_type == "gumble":
        policy_fn = RelaxedSoftmaxMLPPolicy
    else:
        raise ValueError("Unsupported policy_type: {}".format(policy_type))
    return ROMMEOAgent(
        env_specs=env.env_specs,
        policy=policy_fn(
            input_shapes=(observation_space.shape, opponent_action_shape),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
            repara=True,
            # smoothing_coefficient=0.5
        ),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                action_space.shape,
                opponent_action_shape,
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            opponent_action_dim=opponent_action_shape[0],
        ),
        opponent_policy=policy_fn(
            input_shapes=(observation_space.shape, ),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_policy_agent_{}".format(agent_id),
            repara=True,
        ),
        gradient_clipping=10,
        agent_id=agent_id,
        name="ROMMEO_{}".format(agent_id),
        uniform=uniform,
        custom_b=custom_b,
        bi=bi,
        bj=bj,
    )
def get_pr2k_soft_agent(env, agent_id, hidden_layer_sizes,
                        max_replay_buffer_size, k=2, mu=0):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    opponent_action_shape = (
        env.env_specs.action_space.opponent_flat_dim(agent_id),
    )
    print(opponent_action_shape, "opponent_action_shape")
    return PR2KSoftAgent(
        env_specs=env.env_specs,
        main_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, opponent_action_shape),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
        ),
        opponent_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_policy_agent_{}".format(agent_id),
        ),
        prior_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="prior_policy_agent_{}".format(agent_id),
        ),
        opponent_prior_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, ),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_prior_policy_agent_{}".format(agent_id),
        ),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                action_space.shape,
                opponent_action_shape,
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            opponent_action_dim=opponent_action_shape[0],
        ),
        k=k,
        mu=mu,
        gradient_clipping=10.0,
        agent_id=agent_id,
    )
def get_sac_agent(env, hidden_layer_sizes, max_replay_buffer_size,
                  policy_type="gaussian"):
    """SAC agent for single-player learning."""
    observation_space = env.env_specs.observation_space[0]
    action_space = env.env_specs.action_space[0]
    env_specs = env.env_specs
    if policy_type == "gaussian":
        policy_fn = GaussianMLPPolicy
    elif policy_type == "gumble":
        policy_fn = RelaxedSoftmaxMLPPolicy
    else:
        raise ValueError("Unsupported policy_type: {}".format(policy_type))
    # print('observation_space.shape', observation_space.shape)
    return SACAgent(
        env_specs=env_specs,
        policy=policy_fn(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="{}_policy".format(policy_type),
        ),
        qfs=[
            MLPValueFunction(
                input_shapes=(observation_space.shape, action_space.shape),
                output_shape=(1, ),
                hidden_layer_sizes=hidden_layer_sizes,
                name="qf_{}".format(qf_id),
            ) for qf_id in range(2)
        ],
        vf=MLPValueFunction(
            input_shapes=(observation_space.shape, ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="vf",
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
        ),
    )
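# Usage sketch (illustrative, not part of the original code): unlike the
# multi-agent factories, `get_sac_agent` takes no `agent_id` and always reads
# the specs for player 0. The default "gaussian" policy suits continuous
# action spaces, while "gumble" selects the relaxed-softmax policy; the call
# below assumes `env` is the same kind of env_specs-bearing wrapper used
# elsewhere in this module.
#
# sac_agent = get_sac_agent(
#     env,
#     hidden_layer_sizes=(128, 128),
#     max_replay_buffer_size=int(1e6),
#     policy_type="gaussian",
# )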
def get_pr2_soft_agent(env, agent_id, hidden_layer_sizes,
                       max_replay_buffer_size, policy_type="gaussian"):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    opponent_action_shape = (
        env.env_specs.action_space.opponent_flat_dim(agent_id),
    )
    if policy_type == "gaussian":
        policy_fn = GaussianMLPPolicy
    elif policy_type == "gumble":
        policy_fn = RelaxedSoftmaxMLPPolicy
    else:
        raise ValueError("Unsupported policy_type: {}".format(policy_type))
    return PR2SoftAgent(
        env_specs=env.env_specs,
        policy=policy_fn(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
        ),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                action_space.shape,
                opponent_action_shape,
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            opponent_action_dim=opponent_action_shape[0],
        ),
        opponent_policy=policy_fn(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_policy_agent_{}".format(agent_id),
        ),
        gradient_clipping=10.0,
        agent_id=agent_id,
    )
class ValueFunctionTest(tf.test.TestCase):
    def setUp(self):
        self.env = gym.envs.make('MountainCarContinuous-v0')
        self.hidden_layer_sizes = (128, 128)

        self.Q = MLPValueFunction(
            input_shapes=(self.env.observation_space.shape,
                          self.env.action_space.shape),
            output_shape=(1,),
            hidden_layer_sizes=self.hidden_layer_sizes,
            name='Q')

        self.V = MLPValueFunction(
            input_shapes=(self.env.observation_space.shape,),
            output_shape=(1,),
            hidden_layer_sizes=self.hidden_layer_sizes,
            name='V')

    def test_multi_output(self):
        Q5 = MLPValueFunction(
            input_shapes=(self.env.observation_space.shape,
                          self.env.action_space.shape),
            output_shape=(5,),
            hidden_layer_sizes=self.hidden_layer_sizes,
            name='Q5')

        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        action1_np = self.env.action_space.sample()
        action2_np = self.env.action_space.sample()

        observations_np = np.stack(
            (observation1_np, observation2_np)).astype(np.float32)
        actions_np = np.stack((action1_np, action2_np))
        conditions = [observations_np, actions_np]

        q_values_np = Q5.get_values_np(conditions)
        q_values = Q5.get_values(conditions)

        self.assertEqual(q_values_np.shape, (2, 5))
        self.assertEqual(q_values.shape, (2, 5))

    def test_values(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        action1_np = self.env.action_space.sample()
        action2_np = self.env.action_space.sample()

        observations_np = np.stack(
            (observation1_np, observation2_np)).astype(np.float32)
        actions_np = np.stack((action1_np, action2_np))
        conditions = [observations_np, actions_np]

        q_values_np = self.Q.get_values_np(conditions)
        q_values = self.Q.get_values(conditions)
        v_values_np = self.V.get_values_np([observations_np])
        v_values = self.V.get_values([observations_np])

        self.assertEqual(q_values_np.shape, (2, 1))
        self.assertEqual(q_values.shape, (2, 1))
        self.assertEqual(v_values_np.shape, (2, 1))
        self.assertEqual(v_values.shape, (2, 1))

    def test_trainable_variables(self):
        self.assertEqual(
            len(self.Q.trainable_variables),
            2 * (len(self.hidden_layer_sizes) + 1))
        self.assertEqual(
            len(self.V.trainable_variables),
            2 * (len(self.hidden_layer_sizes) + 1))

    def test_clone_target(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        observations_np = np.stack(
            (observation1_np, observation2_np)).astype(np.float32)

        weights = self.V.get_weights()
        values_np = self.V.get_values_np([observations_np])

        target_name = '{}_{}'.format('target', self.V._name)
        target_V = Serializable.clone(self.V, name=target_name)
        weights_2 = target_V.get_weights()

        self.assertEqual(target_V._name, target_name)
        self.assertIsNot(weights, weights_2)
        for weight, weight_2 in zip(weights, weights_2):
            np.testing.assert_array_equal(weight.shape, weight_2.shape)
        np.testing.assert_equal(
            values_np.shape,
            target_V.get_values_np([observations_np]).shape)

    def test_serialize_deserialize(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        observations_np = np.stack(
            (observation1_np, observation2_np)).astype(np.float32)

        weights = self.V.get_weights()
        values_np = self.V.get_values_np([observations_np])

        serialized = pickle.dumps(self.V)
        deserialized = pickle.loads(serialized)
        weights_2 = deserialized.get_weights()

        for weight, weight_2 in zip(weights, weights_2):
            np.testing.assert_array_equal(weight, weight_2)
        np.testing.assert_equal(
            values_np.shape,
            deserialized.get_values_np([observations_np]).shape)
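# The test class above follows the standard tf.test.TestCase pattern, so
# running this module directly executes all of its tests; tf.test.main() is
# TensorFlow's thin wrapper around unittest.main().
if __name__ == '__main__':
    tf.test.main()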