Example #1
    def test_multi_output(self):
        Q5 = MLPValueFunction(
            input_shapes=(
                self.env.observation_space.shape,
                self.env.action_space.shape,
            ),
            output_shape=(5, ),
            hidden_layer_sizes=self.hidden_layer_sizes,
            name="Q5",
        )
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        action1_np = self.env.action_space.sample()
        action2_np = self.env.action_space.sample()
        observations_np = np.stack(
            (observation1_np, observation2_np)).astype(np.float32)

        actions_np = np.stack((action1_np, action2_np))

        conditions = [observations_np, actions_np]

        q_values_np = Q5.get_values_np(conditions)
        q_values = Q5.get_values(conditions)

        self.assertEqual(q_values_np.shape, (2, 5))
        self.assertEqual(q_values.shape, (2, 5))
Example #2
def get_pr2_agent(env,
                  agent_id,
                  hidden_layer_sizes,
                  max_replay_buffer_size,
                  policy_type="deter"):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    opponent_action_shape = (
        env.env_specs.action_space.opponent_flat_dim(agent_id), )
    print(opponent_action_shape, "opponent_action_shape")
    if policy_type == "dete":
        policy_fn = DeterministicMLPPolicy
        exploration_strategy = OUExploration(action_space)
    elif policy_type == "gumble":
        policy_fn = RelaxedSoftmaxMLPPolicy
        exploration_strategy = None
    return PR2Agent(
        env_specs=env.env_specs,
        policy=policy_fn(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
        ),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                action_space.shape,
                opponent_action_shape,
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        ind_qf=MLPValueFunction(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="ind_qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            opponent_action_dim=opponent_action_shape[0],
        ),
        opponent_policy=policy_fn(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_policy_agent_{}".format(agent_id),
        ),
        exploration_strategy=exploration_strategy,
        gradient_clipping=10.0,
        agent_id=agent_id,
    )
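
The factory above assumes a multi-agent environment whose `env.env_specs` exposes per-agent `observation_space`/`action_space` plus an `opponent_flat_dim(agent_id)` helper. As a rough usage sketch only (the environment constructor and the hyperparameter values below are assumptions, not taken from these examples), one PR2 agent would be built per player:

# Hypothetical usage sketch: the environment helper is assumed; only
# get_pr2_agent's signature comes from the example above.
env = make_two_player_env()  # must expose env.env_specs as used above

agents = [
    get_pr2_agent(
        env,
        agent_id=i,
        hidden_layer_sizes=(100, 100),
        max_replay_buffer_size=int(1e5),
        policy_type="dete",
    )
    for i in range(2)
]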
Example #3
    def setUp(self):
        self.env = gym.envs.make('MountainCarContinuous-v0')
        self.hidden_layer_sizes = (128, 128)
        self.Q = MLPValueFunction(
            input_shapes=(self.env.observation_space.shape, self.env.action_space.shape),
            output_shape=(1,),
            hidden_layer_sizes=self.hidden_layer_sizes,
            name='Q'
        )
        self.V = MLPValueFunction(
            input_shapes=(self.env.observation_space.shape,),
            output_shape=(1,),
            hidden_layer_sizes=self.hidden_layer_sizes,
            name='V'
        )
Example #4
def get_ddpgtom_agent(env, agent_id, hidden_layer_sizes,
                      max_replay_buffer_size):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    return DDPGToMAgent(
        env_specs=env.env_specs,
        policy=DeterministicMLPPolicy(
            input_shapes=(
                observation_space.shape,
                (env.env_specs.action_space.opponent_flat_dim(agent_id), ),
            ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name='policy_agent_{}'.format(agent_id),
        ),
        qf=MLPValueFunction(
            input_shapes=(observation_space.shape,
                          (env.env_specs.action_space.flat_dim, )),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name='qf_agent_{}'.format(agent_id)),
        opponent_policy=DeterministicMLPPolicy(
            input_shapes=(observation_space.shape, ),
            output_shape=(
                env.env_specs.action_space.opponent_flat_dim(agent_id), ),
            hidden_layer_sizes=hidden_layer_sizes,
            name='opponent_policy_agent_{}'.format(agent_id)),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            opponent_action_dim=env.env_specs.action_space.opponent_flat_dim(
                agent_id),
            max_replay_buffer_size=max_replay_buffer_size),
        exploration_strategy=OUExploration(action_space),
        gradient_clipping=10.,
        agent_id=agent_id,
    )
Example #5
def get_ddpg_agent(env,
                   agent_id,
                   hidden_layer_sizes,
                   max_replay_buffer_size,
                   policy_type='dete'):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    if policy_type == 'dete':
        policy_fn = DeterministicMLPPolicy
        exploration_strategy = OUExploration(action_space)
    elif policy_type == 'gumble':
        policy_fn = RelaxedSoftmaxMLPPolicy
        exploration_strategy = None
    return DDPGAgent(
        env_specs=env.env_specs,
        policy=policy_fn(input_shapes=(observation_space.shape, ),
                         output_shape=action_space.shape,
                         hidden_layer_sizes=hidden_layer_sizes,
                         name='policy_agent_{}'.format(agent_id)),
        qf=MLPValueFunction(input_shapes=(observation_space.shape,
                                          action_space.shape),
                            output_shape=(1, ),
                            hidden_layer_sizes=hidden_layer_sizes,
                            name='qf_agent_{}'.format(agent_id)),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size),
        exploration_strategy=exploration_strategy,
        gradient_clipping=10.,
        agent_id=agent_id,
    )
Example #6
def get_rommeo_agent(
    env,
    agent_id,
    hidden_layer_sizes,
    max_replay_buffer_size,
    policy_type="gaussian",
    uniform=False,
    custom_b=False,
    bi=1.0,
    bj=1.0,
):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    opponent_action_shape = (
        env.env_specs.action_space.opponent_flat_dim(agent_id), )
    if policy_type == "gaussian":
        policy_fn = GaussianMLPPolicy
    elif policy_type == "gumble":
        policy_fn = RelaxedSoftmaxMLPPolicy
    return ROMMEOAgent(
        env_specs=env.env_specs,
        policy=policy_fn(
            input_shapes=(observation_space.shape, opponent_action_shape),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
            repara=True,
            # smoothing_coefficient=0.5
        ),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                action_space.shape,
                opponent_action_shape,
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            opponent_action_dim=opponent_action_shape[0],
        ),
        opponent_policy=policy_fn(
            input_shapes=(observation_space.shape, ),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_policy_agent_{}".format(agent_id),
            repara=True,
        ),
        gradient_clipping=10,
        agent_id=agent_id,
        name="ROMMEO_{}".format(agent_id),
        uniform=uniform,
        custom_b=custom_b,
        bi=bi,
        bj=bj,
    )
Example #7
def get_pr2k_soft_agent(env,
                        agent_id,
                        hidden_layer_sizes,
                        max_replay_buffer_size,
                        k=2,
                        mu=0):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    opponent_action_shape = (
        env.env_specs.action_space.opponent_flat_dim(agent_id), )
    print(opponent_action_shape, "opponent_action_shape")
    return PR2KSoftAgent(
        env_specs=env.env_specs,
        main_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, opponent_action_shape),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
        ),
        opponent_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_policy_agent_{}".format(agent_id),
        ),
        prior_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="prior_policy_agent_{}".format(agent_id),
        ),
        opponent_prior_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, ),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_prior_policy_agent_{}".format(agent_id),
        ),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                action_space.shape,
                opponent_action_shape,
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            opponent_action_dim=opponent_action_shape[0],
        ),
        k=k,
        mu=mu,
        gradient_clipping=10.0,
        agent_id=agent_id,
    )
Example #8
def get_sac_agent(env,
                  hidden_layer_sizes,
                  max_replay_buffer_size,
                  policy_type="gaussian"):
    """
    SAC agent for single player learning.
    """
    observation_space = env.env_specs.observation_space[0]
    action_space = env.env_specs.action_space[0]
    env_specs = env.env_specs
    if policy_type == "gaussian":
        policy_fn = GaussianMLPPolicy
    elif policy_type == "gumble":
        policy_fn = RelaxedSoftmaxMLPPolicy
    # print('observation_space.shape', observation_space.shape)
    return SACAgent(
        env_specs=env_specs,
        policy=policy_fn(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="{}_policy".format(policy_type),
        ),
        qfs=[
            MLPValueFunction(
                input_shapes=(observation_space.shape, action_space.shape),
                output_shape=(1, ),
                hidden_layer_sizes=hidden_layer_sizes,
                name="qf_{}".format(qf_id),
            ) for qf_id in range(2)
        ],
        vf=MLPValueFunction(
            input_shapes=(observation_space.shape, ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="vf",
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
        ),
    )
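
This single-player factory is called once for the whole environment. A minimal usage sketch follows (the environment constructor and the numbers are assumptions; only get_sac_agent's signature comes from the example above):

# Hypothetical usage sketch for the single-player SAC factory.
env = make_single_agent_env()  # must expose env.env_specs[0] spaces

sac_agent = get_sac_agent(
    env,
    hidden_layer_sizes=(256, 256),
    max_replay_buffer_size=int(1e6),
    policy_type="gaussian",
)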
Example #9
def get_pr2_soft_agent(env,
                       agent_id,
                       hidden_layer_sizes,
                       max_replay_buffer_size,
                       policy_type="gaussian"):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    opponent_action_shape = (
        env.env_specs.action_space.opponent_flat_dim(agent_id), )
    if policy_type == "gaussian":
        policy_fn = GaussianMLPPolicy
    elif policy_type == "gumble":
        policy_fn = RelaxedSoftmaxMLPPolicy
    return PR2SoftAgent(
        env_specs=env.env_specs,
        policy=policy_fn(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
        ),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                action_space.shape,
                opponent_action_shape,
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            opponent_action_dim=opponent_action_shape[0],
        ),
        opponent_policy=policy_fn(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_policy_agent_{}".format(agent_id),
        ),
        gradient_clipping=10.0,
        agent_id=agent_id,
    )
Example #10
class ValueFunctionTest(tf.test.TestCase):
    def setUp(self):
        self.env = gym.envs.make('MountainCarContinuous-v0')
        self.hidden_layer_sizes = (128, 128)
        self.Q = MLPValueFunction(
            input_shapes=(self.env.observation_space.shape, self.env.action_space.shape),
            output_shape=(1,),
            hidden_layer_sizes=self.hidden_layer_sizes,
            name='Q'
        )
        self.V = MLPValueFunction(
            input_shapes=(self.env.observation_space.shape,),
            output_shape=(1,),
            hidden_layer_sizes=self.hidden_layer_sizes,
            name='V'
        )

    def test_multi_output(self):
        Q5 = MLPValueFunction(
            input_shapes=(self.env.observation_space.shape, self.env.action_space.shape),
            output_shape=(5,),
            hidden_layer_sizes=self.hidden_layer_sizes,
            name='Q5'
        )
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        action1_np = self.env.action_space.sample()
        action2_np = self.env.action_space.sample()
        observations_np = np.stack((observation1_np, observation2_np)).astype(np.float32)

        actions_np = np.stack((action1_np, action2_np))

        conditions = [observations_np, actions_np]

        q_values_np = Q5.get_values_np(conditions)
        q_values = Q5.get_values(conditions)

        self.assertEqual(q_values_np.shape, (2, 5))
        self.assertEqual(q_values.shape, (2, 5))

    def test_values(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        action1_np = self.env.action_space.sample()
        action2_np = self.env.action_space.sample()
        observations_np = np.stack((observation1_np, observation2_np)).astype(np.float32)

        actions_np = np.stack((action1_np, action2_np))

        conditions = [observations_np, actions_np]

        q_values_np = self.Q.get_values_np(conditions)
        q_values = self.Q.get_values(conditions)

        v_values_np = self.V.get_values_np([observations_np])
        v_values = self.V.get_values([observations_np])

        self.assertEqual(q_values_np.shape, (2, 1))
        self.assertEqual(q_values.shape, (2, 1))
        self.assertEqual(v_values_np.shape, (2, 1))
        self.assertEqual(v_values.shape, (2, 1))

    def test_trainable_variables(self):
        self.assertEqual(
            len(self.Q.trainable_variables),
            2 * (len(self.hidden_layer_sizes) + 1))
        self.assertEqual(
            len(self.V.trainable_variables),
            2 * (len(self.hidden_layer_sizes) + 1))

    def test_clone_target(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        observations_np = np.stack(
            (observation1_np, observation2_np)
        ).astype(np.float32)

        weights = self.V.get_weights()
        values_np = self.V.get_values_np([observations_np])

        target_name = '{}_{}'.format('target', self.V._name)
        target_V = Serializable.clone(self.V, name=target_name)

        weights_2 = target_V.get_weights()

        self.assertEqual(target_V._name, target_name)
        self.assertIsNot(weights, weights_2)
        for weight, weight_2 in zip(weights, weights_2):
            np.testing.assert_array_equal(weight.shape, weight_2.shape)
        np.testing.assert_equal(
            values_np.shape, target_V.get_values_np([observations_np]).shape)

    def test_serialize_deserialize(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        observations_np = np.stack(
            (observation1_np, observation2_np)
        ).astype(np.float32)

        weights = self.V.get_weights()
        values_np = self.V.get_values_np([observations_np])

        serialized = pickle.dumps(self.V)
        deserialized = pickle.loads(serialized)

        weights_2 = deserialized.get_weights()

        for weight, weight_2 in zip(weights, weights_2):
            np.testing.assert_array_equal(weight, weight_2)
        np.testing.assert_equal(
            values_np.shape, deserialized.get_values_np([observations_np]).shape)
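
The test class depends only on numpy, gym, pickle, TensorFlow, and the project's value-function and serialization utilities. A minimal sketch of the imports and entry point needed to run it standalone is shown below; the module paths for MLPValueFunction and Serializable are illustrative placeholders, not the project's actual import paths.

# Minimal sketch of the surrounding test module (assumed, not from the source).
import pickle

import gym
import numpy as np
import tensorflow as tf

# from <project>.value_functions import MLPValueFunction  # project-specific path
# from <project>.core import Serializable                 # project-specific path

if __name__ == '__main__':
    tf.test.main()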