Example #1
    def test_clone_target(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        observations_np = np.stack((observation1_np, observation2_np)).astype(
            np.float32
        )

        weights = self.policy.get_weights()
        actions_np = self.policy.get_actions_np([observations_np])
        log_pis_np = self.policy.log_pis_np([observations_np], actions_np)

        target_name = "{}_{}".format("target", self.policy._name)
        target_policy = Serializable.clone(self.policy, name=target_name)

        weights_2 = target_policy.get_weights()
        log_pis_np_2 = target_policy.log_pis_np([observations_np], actions_np)

        self.assertEqual(target_policy._name, target_name)
        self.assertIsNot(weights, weights_2)
        for weight, weight_2 in zip(weights, weights_2):
            np.testing.assert_array_equal(weight.shape, weight_2.shape)
        np.testing.assert_array_equal(log_pis_np.shape, log_pis_np_2.shape)
        np.testing.assert_equal(
            actions_np.shape, self.policy.get_actions_np([observations_np]).shape
        )
Example #2
    def test_clone_target(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        observations_np = np.stack(
            (observation1_np, observation2_np)
        ).astype(np.float32)

        weights = self.V.get_weights()
        values_np = self.V.get_values_np([observations_np])

        target_name = '{}_{}'.format('target', self.V._name)
        target_V = Serializable.clone(self.V, name=target_name)

        weights_2 = target_V.get_weights()

        self.assertEqual(target_V._name, target_name)
        self.assertIsNot(weights, weights_2)
        for weight, weight_2 in zip(weights, weights_2):
            np.testing.assert_array_equal(weight.shape, weight_2.shape)
        np.testing.assert_equal(
            values_np.shape, target_V.get_values_np([observations_np]).shape)
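
Both tests above rely only on `Serializable.clone` rebuilding an object from its captured constructor arguments with the `name` overridden, which is why they assert that the clone's weights have the same shapes yet are distinct objects. A minimal sketch of that assumed behaviour (the real implementation may differ):

class Serializable:
    """Minimal sketch: capture constructor arguments so the object can be rebuilt."""

    def __initialize(self, init_locals):
        # Subclasses call this as self._Serializable__initialize(locals())
        # from their __init__; store every argument except `self`.
        self._Serializable__args = {
            k: v for k, v in init_locals.items() if k != "self"
        }

    @staticmethod
    def clone(instance, **overrides):
        # Re-instantiate the same class from the captured arguments, applying
        # overrides such as a new `name`; parameters are re-created, not shared.
        args = dict(instance._Serializable__args)
        args.update(overrides)
        return type(instance)(**args)
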
Example #3
    def __init__(self,
                 env_specs,  # (obs_space, act_space)
                 policy,
                 qfs,
                 vf,
                 replay_buffer,
                 policy_optimizer=tf.optimizers.Adam(),
                 qfs_optimizers=(tf.optimizers.Adam(), tf.optimizers.Adam()),
                 vf_optimizer=tf.optimizers.Adam(),
                 exploration_strategy=None,
                 exploration_interval=10,
                 target_update_tau=0.01,
                 target_update_period=10,
                 td_errors_loss_fn=None,  # TODO: revisit the default TD-error loss
                 alpha=0.05,
                 gamma=0.95,
                 reward_scale=1.0,
                 gradient_clipping=None,
                 train_sequence_length=None,
                 name='SAC',
                 agent_id=-1):

        self._Serializable__initialize(locals())
        self._env_specs = env_specs

        observation_space = self._env_specs.observation_space
        action_space = self._env_specs.action_space
        # observation_space = self._env_specs[0]
        # action_space = self._env_specs[1]

        self._exploration_strategy = exploration_strategy

        self._target_vf = Serializable.clone(vf, name='target_vf')

        self._policy_optimizer = policy_optimizer
        self._qfs_optimizers = qfs_optimizers
        self._vf_optimizer = vf_optimizer

        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period

        self._td_errors_loss_fn = (td_errors_loss_fn or tf.losses.Huber)

        self._gamma = gamma
        self._reward_scale = reward_scale
        self._gradient_clipping = gradient_clipping
        self._train_step = 0
        self._exploration_interval = exploration_interval
        self._exploration_status = False

        self.required_experiences = ['observation', 'actions', 'rewards', 'next_observations',
                                     'terminals', 'annealing']

        self._qfs = qfs  # twin Q-functions, handled here in SAC rather than in OffPolicyAgent
        self._vf = vf

        super(SACAgent, self).__init__(
            observation_space,
            action_space,
            policy,
            qfs,
            replay_buffer,
            train_sequence_length=train_sequence_length,
            name=name,
        )
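
One detail worth noting in the constructor above: `tf.losses.Huber` is a loss class, so when `td_errors_loss_fn` is left as `None` the attribute holds the class itself rather than an instance. A minimal sketch of how that default is presumably instantiated and applied at training time (the values are made up; `Reduction.NONE` keeps per-transition TD errors):

import tensorflow as tf

# Hypothetical use of the stored default: instantiate the Huber class first,
# then call the instance on TD targets and Q estimates.
loss_cls = tf.keras.losses.Huber
huber = loss_cls(reduction=tf.keras.losses.Reduction.NONE)

td_targets = tf.constant([[1.0], [0.5]])   # made-up example values
q_values = tf.constant([[0.8], [0.1]])
per_sample_loss = huber(td_targets, q_values)  # shape [2]: one value per transition
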
Example #4
    def __init__(self,
                 env_specs,
                 policy,
                 qf,
                 replay_buffer,
                 opponent_policy,
                 policy_optimizer=tf.optimizers.Adam(1e-3),
                 qf_optimizer=tf.optimizers.Adam(1e-3),
                 opponent_policy_optimizer=tf.optimizers.Adam(1e-3),
                 opponent_prior_optimizer=tf.optimizers.Adam(1e-3),
                 exploration_strategy=None,
                 target_update_tau=0.01,
                 target_update_period=1,
                 td_errors_loss_fn=None,
                 gamma=0.95,
                 reward_scale=1.0,
                 gradient_clipping=None,
                 train_sequence_length=None,
                 name='PR2',
                 agent_id=-1):
        self._Serializable__initialize(locals())
        self._agent_id = agent_id
        self._env_specs = env_specs

        if self._agent_id >= 0:
            observation_space = self._env_specs.observation_space[
                self._agent_id]
            action_space = self._env_specs.action_space[self._agent_id]
            opponent_observation_flat_dim = self._env_specs.observation_space.opponent_flat_dim(
                self._agent_id)
            opponent_action_flat_dim = self._env_specs.action_space.opponent_flat_dim(
                self._agent_id)
        else:
            observation_space = self._env_specs.observation_space
            action_space = self._env_specs.action_space

        self._exploration_strategy = exploration_strategy
        self._target_policy = None

        self._opponent_policy = opponent_policy
        self._prior = Serializable.clone(self._opponent_policy,
                                         name='prior_{}'.format(
                                             self._agent_id))

        self._target_qf = Serializable.clone(qf,
                                             name='target_qf_agent_{}'.format(
                                                 self._agent_id))

        self._actor_optimizer = policy_optimizer
        self._critic_optimizer = qf_optimizer
        self._opponent_policy_optimizer = opponent_policy_optimizer
        self._opponent_prior_optimizer = opponent_prior_optimizer

        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._td_errors_loss_fn = (td_errors_loss_fn or tf.losses.Huber)
        self._gamma = gamma
        self._reward_scale = reward_scale
        self._gradient_clipping = gradient_clipping
        self._train_step = 0
        self._exploration_interval = 10
        self._exploration_status = True

        self.required_experiences = [
            'observation', 'actions', 'rewards', 'next_observations',
            'opponent_actions', 'terminals', 'annealing',
            'recent_observations', 'recent_opponent_actions'
        ]

        super(PR2SoftAgent,
              self).__init__(observation_space,
                             action_space,
                             policy,
                             qf,
                             replay_buffer,
                             train_sequence_length=train_sequence_length,
                             name=name)
Example #5
    def __init__(
        self,
        env_specs,
        policy,
        qf,
        replay_buffer,
        policy_optimizer=tf.optimizers.Adam(),
        qf_optimizer=tf.optimizers.Adam(),
        exploration_strategy=None,
        exploration_interval=10,
        target_update_tau=0.01,
        target_update_period=1,
        td_errors_loss_fn=None,
        gamma=0.95,
        reward_scale=1.0,
        gradient_clipping=None,
        train_sequence_length=None,
        name="MADDPG",
        agent_id=-1,
    ):
        self._Serializable__initialize(locals())
        self._agent_id = agent_id
        self._env_specs = env_specs
        if self._agent_id >= 0:
            observation_space = self._env_specs.observation_space[
                self._agent_id]
            action_space = self._env_specs.action_space[self._agent_id]
        else:
            observation_space = self._env_specs.observation_space
            action_space = self._env_specs.action_space

        self._exploration_strategy = exploration_strategy

        self._target_policy = Serializable.clone(
            policy, name="target_policy_agent_{}".format(self._agent_id))
        self._target_qf = Serializable.clone(qf,
                                             name="target_qf_agent_{}".format(
                                                 self._agent_id))

        self._policy_optimizer = policy_optimizer
        self._qf_optimizer = qf_optimizer

        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._td_errors_loss_fn = td_errors_loss_fn or tf.losses.Huber
        self._gamma = gamma
        self._reward_scale = reward_scale
        self._gradient_clipping = gradient_clipping
        self._train_step = 0
        self._exploration_interval = exploration_interval
        self._exploration_status = False

        self.required_experiences = [
            "observation",
            "actions",
            "rewards",
            "next_observations",
            "opponent_actions",
            "target_actions",
        ]

        super(MADDPGAgent, self).__init__(
            observation_space,
            action_space,
            policy,
            qf,
            replay_buffer,
            train_sequence_length=train_sequence_length,
            name=name,
        )
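
All of the agents in these examples clone their target networks with `Serializable.clone` and keep `target_update_tau` / `target_update_period`, which conventionally drive a periodic Polyak (soft) update of the target weights. A minimal sketch of such an update, assuming Keras-style `get_weights`/`set_weights` on the networks and that the base class exposes the online critic as `agent._qf` (the helper name is hypothetical):

def soft_update_target_qf(agent):
    """Hypothetical helper: Polyak-average online critic weights into the target."""
    if agent._train_step % agent._target_update_period != 0:
        return
    tau = agent._target_update_tau
    blended = [
        tau * online + (1.0 - tau) * target
        for online, target in zip(agent._qf.get_weights(),
                                  agent._target_qf.get_weights())
    ]
    agent._target_qf.set_weights(blended)
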
Example #6
    def __init__(
        self,
        env_specs,
        policy,
        qf,
        ind_qf,
        replay_buffer,
        opponent_policy,
        policy_optimizer=tf.optimizers.Adam(),
        qf_optimizer=tf.optimizers.Adam(),
        opponent_policy_optimizer=tf.optimizers.Adam(),
        value_n_particles=16,
        kernel_update_ratio=0.5,
        exploration_strategy=None,
        target_update_tau=0.01,
        target_update_period=1,
        td_errors_loss_fn=None,
        gamma=0.95,
        reward_scale=1.0,
        gradient_clipping=None,
        train_sequence_length=None,
        loss_type="svgd",
        name="PR2",
        agent_id=-1,
    ):
        self._Serializable__initialize(locals())
        self._agent_id = agent_id
        self._env_specs = env_specs
        self._value_n_particles = value_n_particles
        self._kernel_update_ratio = kernel_update_ratio
        self._loss_type = loss_type

        observation_space = self._env_specs.observation_space[self._agent_id]
        action_space = self._env_specs.action_space[self._agent_id]
        self._observation_flat_dim = self._env_specs.observation_space.agent_flat_dim(
            self._agent_id
        )
        self._action_flat_dim = self._env_specs.action_space.agent_flat_dim(
            self._agent_id
        )
        self._opponent_observation_flat_dim = self._env_specs.observation_space.opponent_flat_dim(
            self._agent_id
        )
        self._opponent_action_flat_dim = self._env_specs.action_space.opponent_flat_dim(
            self._agent_id
        )

        self._exploration_strategy = exploration_strategy
        self._target_policy = None
        self._ind_qf = ind_qf
        self._opponent_policy = opponent_policy
        self._prior = Serializable.clone(
            self._opponent_policy, name="prior_{}".format(self._agent_id)
        )

        self._target_policy = Serializable.clone(
            policy, name="target_policy_agent_{}".format(self._agent_id)
        )
        self._target_qf = Serializable.clone(
            qf, name="target_qf_agent_{}".format(self._agent_id)
        )

        self._actor_optimizer = policy_optimizer
        self._critic_optimizer = qf_optimizer
        self._opponent_policy_optimizer = opponent_policy_optimizer

        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._td_errors_loss_fn = td_errors_loss_fn or tf.losses.Huber
        self._gamma = gamma
        self._reward_scale = reward_scale
        self._gradient_clipping = gradient_clipping
        self._train_step = 0
        self._exploration_interval = 10
        self._exploration_status = True

        self.required_experiences = [
            "observation",
            "actions",
            "rewards",
            "next_observations",
            "opponent_actions",
            "terminals",
            "annealing",
            "recent_observations",
            "recent_opponent_actions",
        ]

        super(PR2Agent, self).__init__(
            observation_space,
            action_space,
            policy,
            qf,
            replay_buffer,
            train_sequence_length=train_sequence_length,
            name=name,
        )
Example #7
    def __init__(
        self,
        env_specs,
        main_policy,
        opponent_policy,
        prior_policy,
        opponent_prior_policy,
        qf,
        replay_buffer,
        k=3,
        mu=0,
        policy_optimizer=tf.optimizers.Adam(),
        qf_optimizer=tf.optimizers.Adam(),
        opponent_policy_optimizer=tf.optimizers.Adam(10e-3),
        prior_optimizer=tf.optimizers.Adam(10e-3),
        exploration_strategy=None,
        target_update_tau=0.01,
        target_update_period=1,
        td_errors_loss_fn=None,
        gamma=0.95,
        reward_scale=1.0,
        gradient_clipping=None,
        train_sequence_length=None,
        name="PR2K",
        agent_id=-1,
    ):
        self._Serializable__initialize(locals())
        self._agent_id = agent_id
        self._env_specs = env_specs

        if self._agent_id >= 0:
            observation_space = self._env_specs.observation_space[
                self._agent_id]
            action_space = self._env_specs.action_space[self._agent_id]
            opponent_observation_flat_dim = self._env_specs.observation_space.opponent_flat_dim(
                self._agent_id)
            opponent_action_flat_dim = self._env_specs.action_space.opponent_flat_dim(
                self._agent_id)
        else:
            observation_space = self._env_specs.observation_space
            action_space = self._env_specs.action_space

        self._exploration_strategy = exploration_strategy
        self._target_policy = None
        self._mu = mu
        self._k = k
        self._opponent_policy = opponent_policy
        self._prior_policy = prior_policy
        self._opponent_prior_policy = opponent_prior_policy

        policy = LevelKPolicy(
            main_policy=main_policy,
            secondary_policy=opponent_policy,
            prior_policy=prior_policy,
            secondary_prior_policy=opponent_prior_policy,
        )
        # self._prior = Serializable.clone(self._opponent_policy, name='prior_{}'.format(self._agent_id))

        self._target_qf = Serializable.clone(qf,
                                             name="target_qf_agent_{}".format(
                                                 self._agent_id))

        self._actor_optimizer = policy_optimizer
        self._critic_optimizer = qf_optimizer
        self._opponent_policy_optimizer = opponent_policy_optimizer
        self._prior_optimizer = prior_optimizer

        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._td_errors_loss_fn = td_errors_loss_fn or tf.losses.Huber
        self._gamma = gamma
        self._reward_scale = reward_scale
        self._gradient_clipping = gradient_clipping
        self._train_step = 0
        self._exploration_interval = 10
        self._exploration_status = True

        self.required_experiences = [
            "observation",
            "actions",
            "rewards",
            "next_observations",
            "opponent_actions",
            "terminals",
            "annealing",
            "recent_observations",
            "recent_opponent_actions",
        ]

        super(PR2KSoftAgent, self).__init__(
            observation_space,
            action_space,
            policy,
            qf,
            replay_buffer,
            train_sequence_length=train_sequence_length,
            name=name,
        )
Example #8
    def __init__(
        self,
        env_specs,
        policy,
        qf,
        replay_buffer,
        opponent_policy,
        policy_optimizer=tf.optimizers.Adam(0.01),
        qf_optimizer=tf.optimizers.Adam(0.01),
        opponent_policy_optimizer=tf.optimizers.Adam(0.01),
        opponent_prior_optimizer=tf.optimizers.Adam(0.01),
        exploration_strategy=None,
        target_update_tau=0.01,
        target_update_period=1,
        td_errors_loss_fn=None,
        gamma=0.95,
        reward_scale=1.0,
        gradient_clipping=None,
        train_sequence_length=None,
        name="ROMMEO",
        agent_id=-1,
        uniform=False,
        custom_b=False,
        bi=1.0,
        bj=1.0,
    ):
        self._Serializable__initialize(locals())
        self._agent_id = agent_id
        self._env_specs = env_specs
        self._uniform = uniform
        self._custom_b = custom_b
        self._bj = tf.constant(bj, dtype=tf.float32)
        self._bi = tf.constant(bi, dtype=tf.float32)

        if self._agent_id >= 0:
            observation_space = self._env_specs.observation_space[self._agent_id]
            action_space = self._env_specs.action_space[self._agent_id]
            opponent_observation_flat_dim = self._env_specs.observation_space.opponent_flat_dim(
                self._agent_id
            )
            opponent_action_flat_dim = self._env_specs.action_space.opponent_flat_dim(
                self._agent_id
            )
        else:
            observation_space = self._env_specs.observation_space
            action_space = self._env_specs.action_space

        self._exploration_strategy = exploration_strategy
        self._target_policy = None

        self._opponent_policy = opponent_policy
        self._prior = Serializable.clone(
            self._opponent_policy, name="prior_{}".format(self._agent_id), repara=False
        )

        self._target_qf = Serializable.clone(
            qf, name="target_qf_agent_{}".format(self._agent_id)
        )

        self._opponent_policy_optimizer = opponent_policy_optimizer
        self._opponent_prior_optimizer = opponent_prior_optimizer
        self._actor_optimizer = policy_optimizer
        self._critic_optimizer = qf_optimizer

        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._td_errors_loss_fn = td_errors_loss_fn or tf.losses.Huber
        self._gamma = gamma
        self._reward_scale = reward_scale
        self._gradient_clipping = gradient_clipping
        self._train_step = 0
        self._exploration_interval = 10
        self._exploration_status = True

        self.required_experiences = [
            "observation",
            "actions",
            "rewards",
            "next_observations",
            "opponent_actions",
            "terminals",
            "annealing",
            "recent_observations",
            "recent_opponent_actions",
        ]

        super(ROMMEOAgent, self).__init__(
            observation_space,
            action_space,
            policy,
            qf,
            replay_buffer,
            train_sequence_length=train_sequence_length,
            name=name,
        )