Example #1
 def _init_distribution(conditions, **kwargs):
     loc, scale = conditions["loc"], conditions["scale"]
     return tfd.Gumbel(loc=loc, scale=scale, **kwargs)
Example #2
 def __init__(self, temp):
     super().__init__()
     self.temp = temp
     self.gumbel = distributions.Gumbel(0, 1)
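The two snippets above only show where a Gumbel distribution gets constructed. Below is a minimal, self-contained sketch of the same construction, assuming TensorFlow Probability with the conventional alias tfd = tfp.distributions; the loc/scale values are illustration only:

import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

# Standard Gumbel(0, 1), as kept around in the second snippet.
standard_gumbel = tfd.Gumbel(loc=0., scale=1.)
noise = standard_gumbel.sample(5)  # five independent Gumbel draws

# Location-scale Gumbel built from conditioning values, mirroring _init_distribution.
dist = tfd.Gumbel(loc=tf.constant([0.0, 1.0]), scale=tf.constant([1.0, 2.0]))
print(dist.log_prob([0.5, 0.5]))   # element-wise log-density
print(dist.mean())                 # loc + scale * Euler-Mascheroni constant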
Example #3
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            ployak=0.995,
            high_scale=1.0,
            reward_scale=1.0,
            sample_g_nums=100,
            sub_goal_steps=10,
            fn_goal_dim=0,
            intrinsic_reward_mode='os',
            high_batch_size=256,
            high_buffer_size=100000,
            low_batch_size=8,
            low_buffer_size=10000,
            high_actor_lr=1.0e-4,
            high_critic_lr=1.0e-3,
            low_actor_lr=1.0e-4,
            low_critic_lr=1.0e-3,
            hidden_units={
                'high_actor': [64, 64],
                'high_critic': [64, 64],
                'low_actor': [64, 64],
                'low_critic': [64, 64]
            },
            **kwargs):
        assert visual_sources == 0, 'HIRO doesn\'t support visual inputs.'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.data_high = ExperienceReplay(high_batch_size, high_buffer_size)
        self.data_low = ExperienceReplay(low_batch_size, low_buffer_size)

        self.ployak = ployak
        self.high_scale = np.array(
            high_scale if isinstance(high_scale, list) else [high_scale] *
            self.s_dim,
            dtype=np.float32)
        self.reward_scale = reward_scale
        self.fn_goal_dim = fn_goal_dim
        self.sample_g_nums = sample_g_nums
        self.sub_goal_steps = sub_goal_steps
        self.sub_goal_dim = self.s_dim - self.fn_goal_dim

        self.high_noise = rls.ClippedNormalActionNoise(
            mu=np.zeros(self.sub_goal_dim),
            sigma=self.high_scale * np.ones(self.sub_goal_dim),
            bound=self.high_scale / 2)
        self.low_noise = rls.ClippedNormalActionNoise(mu=np.zeros(self.a_dim),
                                                      sigma=1.0 *
                                                      np.ones(self.a_dim),
                                                      bound=0.5)

        _high_actor_net = lambda: rls.actor_dpg(self.s_dim, self.sub_goal_dim,
                                                hidden_units['high_actor'])
        if self.is_continuous:
            _low_actor_net = lambda: rls.actor_dpg(
                self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[
                    'low_actor'])
        else:
            _low_actor_net = lambda: rls.actor_discrete(
                self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[
                    'low_actor'])
            self.gumbel_dist = tfd.Gumbel(0, 1)

        self.high_actor = _high_actor_net()
        self.high_actor_target = _high_actor_net()
        self.low_actor = _low_actor_net()
        self.low_actor_target = _low_actor_net()

        _high_critic_net = lambda: rls.critic_q_one(
            self.s_dim, self.sub_goal_dim, hidden_units['high_critic'])
        _low_critic_net = lambda: rls.critic_q_one(
            self.s_dim + self.sub_goal_dim, self.a_dim, hidden_units[
                'low_critic'])

        self.high_critic = DoubleQ(_high_critic_net)
        self.high_critic_target = DoubleQ(_high_critic_net)
        self.low_critic = DoubleQ(_low_critic_net)
        self.low_critic_target = DoubleQ(_low_critic_net)

        self.update_target_net_weights(
            self.low_actor_target.weights + self.low_critic_target.weights +
            self.high_actor_target.weights + self.high_critic_target.weights,
            self.low_actor.weights + self.low_critic.weights +
            self.high_actor.weights + self.high_critic.weights)

        self.low_actor_lr, self.low_critic_lr = map(
            self.init_lr, [low_actor_lr, low_critic_lr])
        self.high_actor_lr, self.high_critic_lr = map(
            self.init_lr, [high_actor_lr, high_critic_lr])
        self.low_actor_optimizer, self.low_critic_optimizer = map(
            self.init_optimizer, [self.low_actor_lr, self.low_critic_lr])
        self.high_actor_optimizer, self.high_critic_optimizer = map(
            self.init_optimizer, [self.high_actor_lr, self.high_critic_lr])

        self.model_recorder(
            dict(high_actor=self.high_actor,
                 high_critic=self.high_critic,
                 low_actor=self.low_actor,
                 low_critic=self.low_critic,
                 low_actor_optimizer=self.low_actor_optimizer,
                 low_critic_optimizer=self.low_critic_optimizer,
                 high_actor_optimizer=self.high_actor_optimizer,
                 high_critic_optimizer=self.high_critic_optimizer))

        self.counts = 0
        self._high_s = [[] for _ in range(self.n_agents)]
        self._noop_subgoal = np.random.uniform(-self.high_scale,
                                               self.high_scale,
                                               size=(self.n_agents,
                                                     self.sub_goal_dim))
        self.get_ir = self.generate_ir_func(mode=intrinsic_reward_mode)
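In the discrete branch above the agent keeps a standard Gumbel(0, 1) (self.gumbel_dist), which is presumably used for the Gumbel-max / Gumbel-softmax trick when drawing discrete actions from actor logits. A standalone sketch of that trick, independent of the HIRO class (the logits below are made-up illustration values, not part of the original code):

import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions
gumbel_dist = tfd.Gumbel(0., 1.)  # same standard Gumbel as in the snippet

# Hypothetical logits for a 4-action discrete policy (illustration only).
logits = tf.constant([[2.0, 0.5, -1.0, 0.1]])

# Gumbel-max: argmax(logits + g) with g ~ Gumbel(0, 1) is a sample from
# the categorical distribution softmax(logits).
g = gumbel_dist.sample(tf.shape(logits))
action = tf.argmax(logits + g, axis=-1)

# Differentiable relaxation (Gumbel-softmax) with temperature tau.
tau = 0.5
soft_action = tf.nn.softmax((logits + g) / tau, axis=-1)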
Example #4
 def _base_dist(self, mu: TensorLike, beta: TensorLike, *args, **kwargs):
     return tfd.Gumbel(*args, loc=mu, scale=beta, **kwargs)
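Here mu and beta are passed straight through as the Gumbel location and scale. A small sketch (assuming eager TensorFlow Probability and NumPy; the values are illustrative) checking that parameterization against the closed-form mean loc + scale * γ, where γ is the Euler-Mascheroni constant:

import numpy as np
import tensorflow_probability as tfp

tfd = tfp.distributions

mu, beta = 1.0, 2.0
dist = tfd.Gumbel(loc=mu, scale=beta)

# Gumbel mean is loc + scale * Euler-Mascheroni constant.
print(float(dist.mean()), mu + beta * np.euler_gamma)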
Example #5
    def __init__(
            self,
            envspec,
            ployak=0.995,
            high_scale=1.0,
            reward_scale=1.0,
            sample_g_nums=100,
            sub_goal_steps=10,
            fn_goal_dim=0,
            intrinsic_reward_mode='os',
            high_batch_size=256,
            high_buffer_size=100000,
            low_batch_size=8,
            low_buffer_size=10000,
            high_actor_lr=1.0e-4,
            high_critic_lr=1.0e-3,
            low_actor_lr=1.0e-4,
            low_critic_lr=1.0e-3,
            network_settings={
                'high_actor': [64, 64],
                'high_critic': [64, 64],
                'low_actor': [64, 64],
                'low_critic': [64, 64]
            },
            **kwargs):
        assert not envspec.obs_spec.has_visual_observation, 'HIRO doesn\'t support visual inputs.'
        super().__init__(envspec=envspec, **kwargs)

        self.concat_vector_dim = self.obs_spec.total_vector_dim
        self.data_high = ExperienceReplay(high_batch_size, high_buffer_size)
        self.data_low = ExperienceReplay(low_batch_size, low_buffer_size)

        self.ployak = ployak
        self.reward_scale = reward_scale
        self.fn_goal_dim = fn_goal_dim
        self.sample_g_nums = sample_g_nums
        self.sub_goal_steps = sub_goal_steps
        self.sub_goal_dim = self.concat_vector_dim - self.fn_goal_dim
        self.high_scale = np.array(
            high_scale if isinstance(high_scale, list) else [high_scale] *
            self.sub_goal_dim,
            dtype=np.float32)

        self.high_noised_action = ClippedNormalNoisedAction(
            mu=np.zeros(self.sub_goal_dim),
            sigma=self.high_scale * np.ones(self.sub_goal_dim),
            action_bound=self.high_scale,
            noise_bound=self.high_scale / 2)
        self.low_noised_action = ClippedNormalNoisedAction(
            mu=np.zeros(self.a_dim),
            sigma=1.0 * np.ones(self.a_dim),
            noise_bound=0.5)

        def _create_high_ac_net(name):
            return ADoubleCNetwork(
                name=name,
                policy_net_type=OutputNetworkType.ACTOR_DPG,
                policy_net_kwargs=dict(
                    vector_dim=self.concat_vector_dim,
                    output_shape=self.sub_goal_dim,
                    network_settings=network_settings['high_actor']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(
                    vector_dim=self.concat_vector_dim,
                    action_dim=self.sub_goal_dim,
                    network_settings=network_settings['high_critic']))

        self.high_ac_net = _create_high_ac_net('high_ac_net')
        self.high_ac_target_net = _create_high_ac_net('high_ac_target_net')

        if self.is_continuous:

            def _create_low_ac_net(name):
                return ADoubleCNetwork(
                    name=name,
                    policy_net_type=OutputNetworkType.ACTOR_DPG,
                    policy_net_kwargs=dict(
                        vector_dim=self.concat_vector_dim + self.sub_goal_dim,
                        output_shape=self.a_dim,
                        network_settings=network_settings['low_actor']),
                    value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                    value_net_kwargs=dict(
                        vector_dim=self.concat_vector_dim + self.sub_goal_dim,
                        action_dim=self.a_dim,
                        network_settings=network_settings['low_critic']))
        else:

            def _create_low_ac_net(name):
                return ADoubleCNetwork(
                    name=name,
                    policy_net_type=OutputNetworkType.ACTOR_DCT,
                    policy_net_kwargs=dict(
                        vector_dim=self.concat_vector_dim + self.sub_goal_dim,
                        output_shape=self.a_dim,
                        network_settings=network_settings['low_actor']),
                    value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                    value_net_kwargs=dict(
                        vector_dim=self.concat_vector_dim + self.sub_goal_dim,
                        action_dim=self.a_dim,
                        network_settings=network_settings['low_critic']))

            self.gumbel_dist = tfd.Gumbel(0, 1)

        self.low_ac_net = _create_low_ac_net('low_ac_net')
        self.low_ac_target_net = _create_low_ac_net('low_ac_target_net')

        update_target_net_weights(
            self.low_ac_target_net.weights + self.high_ac_target_net.weights,
            self.low_ac_net.weights + self.high_ac_net.weights)

        self.low_actor_lr, self.low_critic_lr = map(
            self.init_lr, [low_actor_lr, low_critic_lr])
        self.high_actor_lr, self.high_critic_lr = map(
            self.init_lr, [high_actor_lr, high_critic_lr])
        self.low_actor_optimizer, self.low_critic_optimizer = map(
            self.init_optimizer, [self.low_actor_lr, self.low_critic_lr])
        self.high_actor_optimizer, self.high_critic_optimizer = map(
            self.init_optimizer, [self.high_actor_lr, self.high_critic_lr])

        self.counts = 0
        self._high_s = [[] for _ in range(self.n_agents)]
        self._noop_subgoal = np.random.uniform(-self.high_scale,
                                               self.high_scale,
                                               size=(self.n_agents,
                                                     self.sub_goal_dim))
        self.get_ir = self.generate_ir_func(mode=intrinsic_reward_mode)

        self._worker_params_dict.update(self.high_ac_net._policy_models)
        self._worker_params_dict.update(self.low_ac_net._policy_models)

        self._all_params_dict.update(self.high_ac_net._all_models)
        self._all_params_dict.update(self.low_ac_net._all_models)
        self._all_params_dict.update(
            low_actor_optimizer=self.low_actor_optimizer,
            low_critic_optimizer=self.low_critic_optimizer,
            high_actor_optimizer=self.high_actor_optimizer,
            high_critic_optimizer=self.high_critic_optimizer)

        self._model_post_process()
Example #6
	def call(self, prob):
		gumbel_dist   = tfp.distributions.Gumbel(0, 1)
		gumbel_sample = gumbel_dist.sample(K.shape(prob))
		categ_sample  = K.exp((K.log(prob)+gumbel_sample)/self.temp)
		categ_sample  = categ_sample/K.sum(categ_sample, axis=-1, keepdims=True)
		return categ_sample
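The call above implements the Gumbel-softmax relaxation by hand: exponentiating (log prob + gumbel_sample) / temp and renormalizing is the same as taking a softmax of those tempered, noise-shifted logits. A standalone check of that equivalence (prob and temp are made-up illustration values, and tfd.Gumbel stands in for the layer's sampler):

import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

prob = tf.constant([[0.2, 0.5, 0.3]])  # illustration-only categorical probabilities
temp = 0.5

g = tfd.Gumbel(0., 1.).sample(tf.shape(prob))

# Same computation as the layer body: exp((log p + g) / temp), then normalize...
y1 = tf.exp((tf.math.log(prob) + g) / temp)
y1 = y1 / tf.reduce_sum(y1, axis=-1, keepdims=True)

# ...which equals a softmax of the tempered, Gumbel-perturbed log-probabilities.
y2 = tf.nn.softmax((tf.math.log(prob) + g) / temp, axis=-1)

print(tf.reduce_max(tf.abs(y1 - y2)))  # ~0 up to floating-point error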