Example #1
  def __init__(self,
               observation_spec,
               action_spec,
               actor_lr = 3e-4,
               critic_lr = 3e-4,
               discount = 0.99,
               tau = 0.005,
               f = 'bin_max',
               temperature = 0.05):
    """Creates networks.

    Args:
      observation_spec: environment observation spec.
      action_spec: Action spec.
      actor_lr: Actor learning rate.
      critic_lr: Critic learning rate.
      discount: MDP discount.
      tau: Soft target update parameter.
      f: Advantage transformation.
      temperature: Temperature parameter.
    """
    assert len(observation_spec.shape) == 1
    state_dim = observation_spec.shape[0]

    self.actor = policies.DiagGuassianPolicy(state_dim, action_spec)
    self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)

    self.critic_learner = critic.CriticLearner(state_dim, action_spec.shape[0],
                                               critic_lr, discount, tau)

    self.f = f
    self.temperature = temperature
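
The f argument selects how advantages are turned into weights for the actor update, with temperature scaling the transformation. The actual transformations live elsewhere in this codebase and are not shown here; the sketch below is illustrative only, and the function name and the 'exp' branch are assumptions.

import tensorflow as tf

def transform_advantage(adv, f='bin_max', temperature=0.05):
  # Hypothetical advantage-to-weight transformations; names are illustrative.
  if f == 'bin_max':
    # Binary indicator: weight 1 where the advantage is positive, 0 otherwise.
    return tf.cast(adv > 0.0, tf.float32)
  if f == 'exp':
    # Exponential weighting scaled by the temperature.
    return tf.exp(adv / temperature)
  raise ValueError(f'Unknown advantage transformation: {f}')

weights = transform_advantage(tf.constant([-0.5, 0.2, 1.3]))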
Example #2
    def __init__(self,
                 observation_spec,
                 action_spec,
                 actor_lr=3e-4,
                 critic_lr=3e-4,
                 alpha_lr=3e-4,
                 discount=0.99,
                 tau=0.005,
                 target_update_period=1,
                 target_entropy=0.0,
                 use_soft_critic=False):
        """Creates networks.

    Args:
      observation_spec: environment observation spec.
      action_spec: Action spec.
      actor_lr: Actor learning rate.
      critic_lr: Critic learning rate.
      alpha_lr: Temperature learning rate.
      discount: MDP discount.
      tau: Soft target update parameter.
      target_update_period: Target network update period.
      target_entropy: Target entropy.
      use_soft_critic: Whether to use soft critic representation.
    """
        assert len(observation_spec.shape) == 1
        state_dim = observation_spec.shape[0]

        self.actor = policies.DiagGuassianPolicy(state_dim, action_spec)
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)

        self.log_alpha = tf.Variable(tf.math.log(0.1), trainable=True)
        self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=alpha_lr)

        self.target_entropy = target_entropy
        self.discount = discount

        self.tau = tau
        self.target_update_period = target_update_period

        self.value = critic.CriticNet(state_dim)
        self.value_target = critic.CriticNet(state_dim)
        critic.soft_update(self.value, self.value_target, tau=1.0)
        self.value_optimizer = tf.keras.optimizers.Adam(
            learning_rate=critic_lr)

        if use_soft_critic:
            self.critic = critic.SoftCritic(state_dim, action_spec)
        else:
            action_dim = action_spec.shape[0]
            self.critic = critic.Critic(state_dim, action_dim)
        self.critic_optimizer = tf.keras.optimizers.Adam(
            learning_rate=critic_lr)
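
critic.soft_update is not shown in this snippet; calling it with tau=1.0 makes the target an exact copy of the online network, while the small tau passed at construction yields a slow-moving target during training. A self-contained sketch of the Polyak averaging it presumably performs (the function below is illustrative, not the repo's implementation):

import tensorflow as tf

def polyak_update(source, target, tau=0.005):
  # Exponential moving average of parameters; tau=1.0 degenerates to a hard copy,
  # matching how the target networks above are initialized.
  for v, tv in zip(source.variables, target.variables):
    tv.assign(tau * v + (1.0 - tau) * tv)

net = tf.keras.Sequential([tf.keras.layers.Dense(4)])
target = tf.keras.Sequential([tf.keras.layers.Dense(4)])
net.build((None, 8))
target.build((None, 8))
polyak_update(net, target, tau=1.0)    # hard copy at initialization
polyak_update(net, target, tau=0.005)  # slow tracking during training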
Example #3
    def __init__(self, state_dim, action_spec, hidden_dims=(256, 256)):
        """Creates networks.

    Args:
      state_dim: State size.
      action_spec: Action specification.
      hidden_dims: List of hidden dimensions.
    """
        super().__init__()
        self.value = CriticNet(state_dim,
                               action_dim=None,
                               hidden_dims=hidden_dims)

        self.advantage = policies.DiagGuassianPolicy(state_dim,
                                                     action_spec,
                                                     hidden_dims=hidden_dims)

        self.log_alpha = tf.Variable(0.0, dtype=tf.float32, trainable=True)
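
log_alpha parameterizes a temperature coefficient in log space, so the coefficient actually used is exp(log_alpha) and stays positive no matter how the unconstrained variable is updated. A minimal sketch of that pattern (the dummy loss is only a stand-in):

import tensorflow as tf

log_alpha = tf.Variable(0.0, dtype=tf.float32, trainable=True)

with tf.GradientTape() as tape:
  alpha = tf.exp(log_alpha)      # always positive
  dummy_loss = alpha * 2.0       # stand-in for whatever loss consumes the temperature
print(tape.gradient(dummy_loss, log_alpha))  # gradients flow through exp()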
Example #4
    def __init__(self,
                 observation_spec,
                 action_spec,
                 actor_lr=1e-4,
                 critic_lr=3e-4,
                 alpha_lr=1e-4,
                 discount=0.99,
                 tau=0.005,
                 target_entropy=0.0):
        """Creates networks.

    Args:
      observation_spec: environment observation spec.
      action_spec: Action spec.
      actor_lr: Actor learning rate.
      critic_lr: Critic learning rate.
      alpha_lr: Temperature learning rate.
      discount: MDP discount.
      tau: Soft target update parameter.
      target_entropy: Target entropy.
    """
        assert len(observation_spec.shape) == 1
        state_dim = observation_spec.shape[0]

        beta_1 = 0.0
        self.actor = policies.DiagGuassianPolicy(state_dim, action_spec)
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr,
                                                        beta_1=beta_1)

        self.log_alpha = tf.Variable(tf.math.log(0.1), trainable=True)
        self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=alpha_lr,
                                                        beta_1=beta_1)

        self.target_entropy = target_entropy
        self.discount = discount
        self.tau = tau

        action_dim = action_spec.shape[0]
        self.critic = critic.Critic(state_dim, action_dim)
        self.critic_target = critic.Critic(state_dim, action_dim)
        critic.soft_update(self.critic, self.critic_target, tau=1.0)
        self.critic_optimizer = tf.keras.optimizers.Adam(
            learning_rate=critic_lr, beta_1=beta_1)
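
target_entropy and log_alpha feed the standard SAC temperature update, in which alpha is trained so that the policy entropy tracks the target. The self-contained sketch below shows that update under the usual objective; the log_probs tensor and the exact loss composition are assumptions rather than this class's code.

import tensorflow as tf

log_alpha = tf.Variable(tf.math.log(0.1), trainable=True)
alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4, beta_1=0.0)
target_entropy = 0.0

def alpha_loss(log_probs):
  # Standard SAC temperature objective: increase alpha when the policy entropy
  # (-log_probs) falls below target_entropy, decrease it otherwise.
  alpha = tf.exp(log_alpha)
  return tf.reduce_mean(alpha * (-log_probs - target_entropy))

with tf.GradientTape() as tape:
  loss = alpha_loss(log_probs=tf.constant([-1.2, -0.8]))
grads = tape.gradient(loss, [log_alpha])
alpha_optimizer.apply_gradients(zip(grads, [log_alpha]))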
Example #5
    def __init__(self,
                 observation_spec,
                 action_spec,
                 mixture=False,
                 env_name=''):
        """BC class init.

    Args:
      observation_spec: observation space
      action_spec: action space
      mixture: use a mixture model?
      env_name: name of env
    Returns:
      None
    """
        del env_name
        assert len(observation_spec.shape) == 1
        state_dim = observation_spec.shape[0]

        self.action_spec = action_spec
        if mixture:
            self.policy = policies.MixtureGuassianPolicy(
                state_dim, action_spec)
        else:
            self.policy = policies.DiagGuassianPolicy(state_dim, action_spec)

        boundaries = [800_000, 900_000]
        values = [1e-3, 1e-4, 1e-5]
        learning_rate_fn = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
            boundaries, values)

        self.optimizer = tf.keras.optimizers.Adam(
            learning_rate=learning_rate_fn)

        self.log_alpha = tf.Variable(tf.math.log(1.0), trainable=True)
        self.alpha_optimizer = tf.keras.optimizers.Adam(
            learning_rate=learning_rate_fn)

        self.target_entropy = -action_spec.shape[0]
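
The PiecewiseConstantDecay schedule above keeps the learning rate at 1e-3 for the first 800k steps, drops it to 1e-4 until step 900k, and to 1e-5 afterwards. A quick check of that behavior:

import tensorflow as tf

lr_fn = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[800_000, 900_000], values=[1e-3, 1e-4, 1e-5])

print(float(lr_fn(0)))        # 0.001 for the first 800k steps
print(float(lr_fn(850_000)))  # 0.0001 between steps 800k and 900k
print(float(lr_fn(950_000)))  # 1e-05 afterwards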
Example #6
    def __init__(self,
                 observation_spec,
                 action_spec,
                 actor_lr=3e-4,
                 critic_lr=3e-4,
                 alpha_lr=3e-4,
                 discount=0.99,
                 tau=0.005,
                 target_entropy=0.0,
                 f_reg=1.0,
                 reward_bonus=5.0,
                 num_augmentations=1,
                 env_name='',
                 batch_size=256):
        """Creates networks.

    Args:
      observation_spec: environment observation spec.
      action_spec: Action spec.
      actor_lr: Actor learning rate.
      critic_lr: Critic learning rate.
      alpha_lr: Temperature learning rate.
      discount: MDP discount.
      tau: Soft target update parameter.
      target_entropy: Target entropy.
      f_reg: Critic regularization weight.
      reward_bonus: Bonus added to the rewards.
      num_augmentations: Number of random crops
      env_name: Env name
      batch_size: batch size
    """
        del num_augmentations, env_name
        assert len(observation_spec.shape) == 1
        state_dim = observation_spec.shape[0]
        self.batch_size = batch_size

        hidden_dims = (256, 256, 256)
        self.actor = policies.DiagGuassianPolicy(state_dim,
                                                 action_spec,
                                                 hidden_dims=hidden_dims)
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)

        self.log_alpha = tf.Variable(tf.math.log(1.0), trainable=True)
        self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=alpha_lr)

        self.target_entropy = target_entropy
        self.discount = discount
        self.tau = tau

        self.bc = behavioral_cloning.BehavioralCloning(observation_spec,
                                                       action_spec,
                                                       mixture=True)

        action_dim = action_spec.shape[0]
        self.critic = critic.Critic(state_dim,
                                    action_dim,
                                    hidden_dims=hidden_dims)
        self.critic_target = critic.Critic(state_dim,
                                           action_dim,
                                           hidden_dims=hidden_dims)
        critic.soft_update(self.critic, self.critic_target, tau=1.0)
        self.critic_optimizer = tf.keras.optimizers.Adam(
            learning_rate=critic_lr)

        self.f_reg = f_reg
        self.reward_bonus = reward_bonus

        self.model_dict = {
            'critic': self.critic,
            'actor': self.actor,
            'critic_target': self.critic_target,
            'actor_optimizer': self.actor_optimizer,
            'critic_optimizer': self.critic_optimizer,
            'alpha_optimizer': self.alpha_optimizer
        }
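
model_dict collects the trackable objects in one place, which is convenient for checkpointing. A minimal sketch of one assumed use, saving the dictionary with tf.train.Checkpoint (the directory and the use of CheckpointManager are illustrative, not taken from this snippet):

import tensorflow as tf

# Stand-in for the model_dict built above; any Keras model, optimizer, tf.Module,
# or tf.Variable can be tracked this way.
model_dict = {'step': tf.Variable(0, dtype=tf.int64)}
checkpoint = tf.train.Checkpoint(**model_dict)
manager = tf.train.CheckpointManager(checkpoint, directory='/tmp/agent_ckpt',
                                     max_to_keep=1)
manager.save()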
Example #7
    def __init__(self,
                 observation_spec,
                 action_spec,
                 actor_lr=3e-4,
                 critic_lr=3e-4,
                 alpha_lr=3e-4,
                 discount=0.99,
                 tau=0.005,
                 target_update_period=1,
                 target_entropy=0.0,
                 cross_norm=False,
                 pcl_actor_update=False):
        """Creates networks.

    Args:
      observation_spec: environment observation spec.
      action_spec: Action spec.
      actor_lr: Actor learning rate.
      critic_lr: Critic learning rate.
      alpha_lr: Temperature learning rate.
      discount: MDP discount.
      tau: Soft target update parameter.
      target_update_period: Target network update period.
      target_entropy: Target entropy.
      cross_norm: Whether to fit cross norm critic.
      pcl_actor_update: Whether to use PCL actor update.
    """
        actor_kwargs = {}
        critic_kwargs = {}

        if len(observation_spec.shape) == 3:  # Image observations.
            # DRQ encoder params.
            # https://github.com/denisyarats/drq/blob/master/config.yaml#L73
            state_dim = 50

            # Actor and critic encoders share conv weights only.
            conv_stack = ConvStack(observation_spec.shape)

            actor_kwargs['encoder'] = ImageEncoder(conv_stack,
                                                   state_dim,
                                                   bprop_conv_stack=False)
            actor_kwargs['hidden_dims'] = (1024, 1024)

            critic_kwargs['encoder'] = ImageEncoder(conv_stack,
                                                    state_dim,
                                                    bprop_conv_stack=True)
            critic_kwargs['hidden_dims'] = (1024, 1024)

            if not cross_norm:
                # Note: the target critic does not share any weights.
                critic_kwargs['encoder_target'] = ImageEncoder(
                    ConvStack(observation_spec.shape),
                    state_dim,
                    bprop_conv_stack=True)

        else:  # 1D state observations.
            assert len(observation_spec.shape) == 1
            state_dim = observation_spec.shape[0]

        if cross_norm:
            beta_1 = 0.0
        else:
            beta_1 = 0.9

        self.actor = policies.DiagGuassianPolicy(state_dim, action_spec,
                                                 **actor_kwargs)
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr,
                                                        beta_1=beta_1)

        self.log_alpha = tf.Variable(tf.math.log(0.1), trainable=True)
        self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=alpha_lr,
                                                        beta_1=beta_1)

        if cross_norm:
            assert 'encoder_target' not in critic_kwargs
            self.critic_learner = critic.CrossNormCriticLearner(
                state_dim, action_spec.shape[0], critic_lr, discount, tau,
                **critic_kwargs)
        else:
            self.critic_learner = critic.CriticLearner(
                state_dim, action_spec.shape[0], critic_lr, discount, tau,
                target_update_period, **critic_kwargs)

        self.target_entropy = target_entropy
        self.discount = discount

        self.pcl_actor_update = pcl_actor_update
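
For image observations, the actor reuses the critic's convolutional stack but does not backpropagate through it (bprop_conv_stack=False). ConvStack and ImageEncoder are defined elsewhere in the codebase; the self-contained sketch below shows the same weight-sharing idea with tf.stop_gradient, where the layers and shapes are assumptions for illustration.

import tensorflow as tf

# Shared convolutional stack; both encoders read its features, only one trains it.
conv_stack = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, 3, strides=2, activation='relu'),
    tf.keras.layers.Flatten(),
])

def encode(obs, bprop_conv_stack):
  feats = conv_stack(obs)
  if not bprop_conv_stack:
    feats = tf.stop_gradient(feats)  # actor path: reuse features, no conv gradients
  return feats

obs = tf.random.uniform((1, 64, 64, 3))
actor_feats = encode(obs, bprop_conv_stack=False)
critic_feats = encode(obs, bprop_conv_stack=True)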
Example #8
          bc_pretraining_steps: Use BC loss instead of CQL loss for N steps.
          min_q_weight: CQL alpha.
          num_augmentations: Number of DrQ-style random crops (unused).
          rep_learn_keywords: Representation learning loss to add (unused).
          batch_size: Batch size.
        """
        del num_augmentations, rep_learn_keywords
        assert len(observation_spec.shape) == 1
        state_dim = observation_spec.shape[0]
        self.batch_size = batch_size

        self.bc = None

        hidden_dims = (256, 256, 256)
        self.actor = policies.DiagGuassianPolicy(state_dim,
                                                 action_spec,
                                                 hidden_dims=hidden_dims)
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)

        self.log_alpha = tf.Variable(tf.math.log(1.0), trainable=True)
        self.log_cql_alpha = self.log_alpha
        self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)

        action_dim = action_spec.shape[0]
        self.critic = critic.Critic(state_dim,
                                    action_dim,
                                    hidden_dims=hidden_dims)
        self.critic_target = critic.Critic(state_dim,
                                           action_dim,
                                           hidden_dims=hidden_dims)
        critic.soft_update(self.critic, self.critic_target, tau=1.0)
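
min_q_weight is described above as the CQL alpha, i.e. the weight on the conservative penalty that pushes Q-values down on actions sampled from the policy and up on dataset actions. A rough, self-contained sketch of that penalty; the exact action sampling and how the penalty is combined with the TD loss are not shown in this snippet and are assumptions here.

import tensorflow as tf

def cql_penalty(q_sampled, q_data, min_q_weight=1.0):
  # q_sampled: [batch, num_sampled_actions] Q-values for actions drawn from the
  #            policy (and/or a uniform proposal); q_data: [batch] Q-values for
  #            the actions actually taken in the dataset.
  logsumexp = tf.reduce_logsumexp(q_sampled, axis=1)
  return min_q_weight * tf.reduce_mean(logsumexp - q_data)

penalty = cql_penalty(tf.random.normal((4, 10)), tf.random.normal((4,)))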