Пример #1
0
    def __init__(self,
                 observation_spec,
                 action_spec,
                 actor_lr=3e-4,
                 critic_lr=3e-4,
                 discount=0.99,
                 tau=0.005,
                 cross_norm=False):
        """Builds the deterministic actor, its optimizer and the critic learner.

        Only flat (rank-1) observations are accepted; the single dimension of
        `observation_spec.shape` is used as the state dimension.

        Args:
          observation_spec: Environment observation spec; its shape must have
            exactly one dimension.
          action_spec: Action spec. Passed whole to the policy; its first shape
            dimension is passed to the critic learner as the action dimension.
          actor_lr: Learning rate of the actor's Adam optimizer.
          critic_lr: Critic learning rate, forwarded to the critic learner.
          discount: MDP discount, forwarded to the critic learner.
          tau: Soft target update parameter, forwarded to the critic learner.
          cross_norm: If True, a `CrossNormCriticLearner` is created and the
            actor's Adam optimizer uses `beta_1=0.0`; otherwise a plain
            `CriticLearner` is created and `beta_1=0.9`.
        """
        obs_shape = observation_spec.shape
        assert len(obs_shape) == 1
        state_dim = obs_shape[0]

        # Adam's first-moment decay is disabled when fitting with cross norm.
        beta_1 = 0.0 if cross_norm else 0.9

        # NOTE(review): the meaning of the positional 0.3 is defined by
        # policies.DeterministicPolicy and is not visible from here.
        self.actor = policies.DeterministicPolicy(state_dim, action_spec, 0.3)
        self.actor_optimizer = tf.keras.optimizers.Adam(
            learning_rate=actor_lr, beta_1=beta_1)

        # Both learner flavours take the same positional arguments here.
        learner_cls = (critic.CrossNormCriticLearner
                       if cross_norm else critic.CriticLearner)
        self.critic_learner = learner_cls(
            state_dim, action_spec.shape[0], critic_lr, discount, tau)
Пример #2
0
    def __init__(self,
                 observation_spec,
                 action_spec,
                 actor_lr=3e-4,
                 critic_lr=3e-4,
                 alpha_lr=3e-4,
                 discount=0.99,
                 tau=0.005,
                 target_update_period=1,
                 target_entropy=0.0,
                 cross_norm=False,
                 pcl_actor_update=False):
        """Builds the Gaussian actor, temperature variable and critic learner.

        Rank-3 observations are treated as images and routed through image
        encoders; rank-1 observations are used directly as state vectors.

        Args:
          observation_spec: Environment observation spec. Its shape must have
            either three dimensions (images) or one dimension (flat state).
          action_spec: Action spec. Passed whole to the policy; its first shape
            dimension is passed to the critic learner as the action dimension.
          actor_lr: Learning rate of the actor's Adam optimizer.
          critic_lr: Critic learning rate, forwarded to the critic learner.
          alpha_lr: Learning rate of the temperature (`log_alpha`) optimizer.
          discount: MDP discount; forwarded to the critic learner and stored
            on `self.discount`.
          tau: Soft target update parameter, forwarded to the critic learner.
          target_update_period: Target network update period. Forwarded to
            `CriticLearner` only; not used when `cross_norm` is True.
          target_entropy: Target entropy, stored on `self.target_entropy`.
          cross_norm: If True, a `CrossNormCriticLearner` is created (without
            a target encoder) and both Adam optimizers use `beta_1=0.0`;
            otherwise a `CriticLearner` is created and `beta_1=0.9`.
          pcl_actor_update: Whether to use PCL actor update; stored on
            `self.pcl_actor_update`.
        """
        obs_shape = observation_spec.shape

        if len(obs_shape) == 3:  # Image observations.
            # DRQ encoder params.
            # https://github.com/denisyarats/drq/blob/master/config.yaml#L73
            state_dim = 50
            hidden_dims = (1024, 1024)

            # The actor and critic encoders are built on the same conv stack
            # instance, so only the conv weights are shared between them.
            shared_convs = ConvStack(obs_shape)

            # NOTE(review): bprop_conv_stack presumably controls whether
            # gradients flow into the conv stack (off for the actor, on for
            # the critic) -- confirm against ImageEncoder.
            actor_kwargs = {
                'encoder': ImageEncoder(shared_convs,
                                        state_dim,
                                        bprop_conv_stack=False),
                'hidden_dims': hidden_dims,
            }
            critic_kwargs = {
                'encoder': ImageEncoder(shared_convs,
                                        state_dim,
                                        bprop_conv_stack=True),
                'hidden_dims': hidden_dims,
            }

            if not cross_norm:
                # The target critic gets its own conv stack, so it shares no
                # weights with the online encoders.
                critic_kwargs['encoder_target'] = ImageEncoder(
                    ConvStack(obs_shape), state_dim, bprop_conv_stack=True)
        else:  # 1D state observations.
            assert len(obs_shape) == 1
            state_dim = obs_shape[0]
            actor_kwargs = {}
            critic_kwargs = {}

        # Adam's first-moment decay is disabled when fitting with cross norm.
        beta_1 = 0.0 if cross_norm else 0.9

        self.actor = policies.DiagGuassianPolicy(
            state_dim, action_spec, **actor_kwargs)
        self.actor_optimizer = tf.keras.optimizers.Adam(
            learning_rate=actor_lr, beta_1=beta_1)

        # The temperature is kept in log space and starts at log(0.1).
        self.log_alpha = tf.Variable(tf.math.log(0.1), trainable=True)
        self.alpha_optimizer = tf.keras.optimizers.Adam(
            learning_rate=alpha_lr, beta_1=beta_1)

        # Positional arguments common to both critic learner flavours.
        learner_args = (state_dim, action_spec.shape[0], critic_lr, discount,
                        tau)
        if cross_norm:
            # No target encoder is ever created on the cross norm path.
            assert 'encoder_target' not in critic_kwargs
            self.critic_learner = critic.CrossNormCriticLearner(
                *learner_args, **critic_kwargs)
        else:
            self.critic_learner = critic.CriticLearner(
                *learner_args, target_update_period, **critic_kwargs)

        self.target_entropy = target_entropy
        self.discount = discount

        self.pcl_actor_update = pcl_actor_update