def __init__(self,
             observation_spec,
             action_spec,
             actor_lr=3e-4,
             critic_lr=3e-4,
             discount=0.99,
             tau=0.005,
             f='bin_max',
             temperature=0.05):
  """Creates networks.

  Args:
    observation_spec: environment observation spec.
    action_spec: Action spec.
    actor_lr: Actor learning rate.
    critic_lr: Critic learning rate.
    discount: MDP discount.
    tau: Soft target update parameter.
    f: Advantage transformation.
    temperature: Temperature parameter.
  """
  assert len(observation_spec.shape) == 1
  state_dim = observation_spec.shape[0]

  self.actor = policies.DiagGuassianPolicy(state_dim, action_spec)
  self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)

  self.critic_learner = critic.CriticLearner(state_dim, action_spec.shape[0],
                                             critic_lr, discount, tau)

  self.f = f
  self.temperature = temperature
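# f and temperature above control how critic advantages are converted into
# actor-update weights. A sketch of the common exponential variant (what the
# default 'bin_max' transform does exactly is defined elsewhere in this repo;
# this function is only illustrative):
import tensorflow as tf

def exp_advantage_weights(advantages, temperature, clip_max=100.0):
  """Computes exp(A / temperature), clipped for numerical stability."""
  return tf.minimum(tf.exp(advantages / temperature), clip_max)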
def __init__(self,
             observation_spec,
             action_spec,
             actor_lr=3e-4,
             critic_lr=3e-4,
             alpha_lr=3e-4,
             discount=0.99,
             tau=0.005,
             target_update_period=1,
             target_entropy=0.0,
             use_soft_critic=False):
  """Creates networks.

  Args:
    observation_spec: environment observation spec.
    action_spec: Action spec.
    actor_lr: Actor learning rate.
    critic_lr: Critic learning rate.
    alpha_lr: Temperature learning rate.
    discount: MDP discount.
    tau: Soft target update parameter.
    target_update_period: Target network update period.
    target_entropy: Target entropy.
    use_soft_critic: Whether to use soft critic representation.
  """
  assert len(observation_spec.shape) == 1
  state_dim = observation_spec.shape[0]

  self.actor = policies.DiagGuassianPolicy(state_dim, action_spec)
  self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)

  self.log_alpha = tf.Variable(tf.math.log(0.1), trainable=True)
  self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=alpha_lr)

  self.target_entropy = target_entropy
  self.discount = discount
  self.tau = tau
  self.target_update_period = target_update_period

  self.value = critic.CriticNet(state_dim)
  self.value_target = critic.CriticNet(state_dim)
  critic.soft_update(self.value, self.value_target, tau=1.0)
  self.value_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)

  if use_soft_critic:
    self.critic = critic.SoftCritic(state_dim, action_spec)
  else:
    action_dim = action_spec.shape[0]
    self.critic = critic.Critic(state_dim, action_dim)
  self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)
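# The constructor above hard-copies weights into the fresh target network by
# calling critic.soft_update(..., tau=1.0). A minimal sketch of what such a
# Polyak-averaging helper typically looks like (the actual implementation
# lives in critic.py and may differ):
import tensorflow as tf

def soft_update_sketch(net, target_net, tau=0.005):
  """Moves target weights: w_target <- tau * w + (1 - tau) * w_target."""
  for var, target_var in zip(net.variables, target_net.variables):
    target_var.assign(tau * var + (1.0 - tau) * target_var)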
def __init__(self, state_dim, action_spec, hidden_dims=(256, 256)):
  """Creates networks.

  Args:
    state_dim: State size.
    action_spec: Action specification.
    hidden_dims: List of hidden dimensions.
  """
  super().__init__()
  self.value = CriticNet(state_dim, action_dim=None, hidden_dims=hidden_dims)
  self.advantage = policies.DiagGuassianPolicy(
      state_dim, action_spec, hidden_dims=hidden_dims)
  self.log_alpha = tf.Variable(0.0, dtype=tf.float32, trainable=True)
def __init__(self,
             observation_spec,
             action_spec,
             actor_lr=1e-4,
             critic_lr=3e-4,
             alpha_lr=1e-4,
             discount=0.99,
             tau=0.005,
             target_entropy=0.0):
  """Creates networks.

  Args:
    observation_spec: environment observation spec.
    action_spec: Action spec.
    actor_lr: Actor learning rate.
    critic_lr: Critic learning rate.
    alpha_lr: Temperature learning rate.
    discount: MDP discount.
    tau: Soft target update parameter.
    target_entropy: Target entropy.
  """
  assert len(observation_spec.shape) == 1
  state_dim = observation_spec.shape[0]

  beta_1 = 0.0
  self.actor = policies.DiagGuassianPolicy(state_dim, action_spec)
  self.actor_optimizer = tf.keras.optimizers.Adam(
      learning_rate=actor_lr, beta_1=beta_1)

  self.log_alpha = tf.Variable(tf.math.log(0.1), trainable=True)
  self.alpha_optimizer = tf.keras.optimizers.Adam(
      learning_rate=alpha_lr, beta_1=beta_1)

  self.target_entropy = target_entropy
  self.discount = discount
  self.tau = tau

  action_dim = action_spec.shape[0]
  self.critic = critic.Critic(state_dim, action_dim)
  self.critic_target = critic.Critic(state_dim, action_dim)
  critic.soft_update(self.critic, self.critic_target, tau=1.0)
  self.critic_optimizer = tf.keras.optimizers.Adam(
      learning_rate=critic_lr, beta_1=beta_1)
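# log_alpha above parameterizes the SAC-style entropy temperature
# (alpha = exp(log_alpha), initialized to 0.1), fit so that policy entropy
# tracks target_entropy. A sketch of the usual dual update, not necessarily
# this class's exact loss:
import tensorflow as tf

def update_alpha_sketch(agent, log_probs):
  """One gradient step on alpha; log_probs are log pi(a|s) of sampled actions."""
  with tf.GradientTape() as tape:
    alpha = tf.exp(agent.log_alpha)
    alpha_loss = tf.reduce_mean(
        alpha * tf.stop_gradient(-log_probs - agent.target_entropy))
  grads = tape.gradient(alpha_loss, [agent.log_alpha])
  agent.alpha_optimizer.apply_gradients(zip(grads, [agent.log_alpha]))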
def __init__(self, observation_spec, action_spec, mixture=False, env_name=''):
  """BC class init.

  Args:
    observation_spec: observation space.
    action_spec: action space.
    mixture: whether to use a mixture-of-Gaussians policy.
    env_name: name of the environment (unused).
  """
  del env_name
  assert len(observation_spec.shape) == 1
  state_dim = observation_spec.shape[0]
  self.action_spec = action_spec

  if mixture:
    self.policy = policies.MixtureGuassianPolicy(state_dim, action_spec)
  else:
    self.policy = policies.DiagGuassianPolicy(state_dim, action_spec)

  boundaries = [800_000, 900_000]
  values = [1e-3, 1e-4, 1e-5]
  learning_rate_fn = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
      boundaries, values)

  self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate_fn)

  self.log_alpha = tf.Variable(tf.math.log(1.0), trainable=True)
  self.alpha_optimizer = tf.keras.optimizers.Adam(
      learning_rate=learning_rate_fn)
  self.target_entropy = -action_spec.shape[0]
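# Quick check of the schedule defined above: PiecewiseConstantDecay returns
# values[i] while step <= boundaries[i], so the BC learning rate is 1e-3 up to
# step 800k, 1e-4 up to step 900k, and 1e-5 afterwards.
import tensorflow as tf

schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[800_000, 900_000], values=[1e-3, 1e-4, 1e-5])
for step in (0, 800_000, 850_000, 950_000):
  print(step, float(schedule(step)))  # 1e-3, 1e-3, 1e-4, 1e-5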
def __init__(self,
             observation_spec,
             action_spec,
             actor_lr=3e-4,
             critic_lr=3e-4,
             alpha_lr=3e-4,
             discount=0.99,
             tau=0.005,
             target_entropy=0.0,
             f_reg=1.0,
             reward_bonus=5.0,
             num_augmentations=1,
             env_name='',
             batch_size=256):
  """Creates networks.

  Args:
    observation_spec: environment observation spec.
    action_spec: Action spec.
    actor_lr: Actor learning rate.
    critic_lr: Critic learning rate.
    alpha_lr: Temperature learning rate.
    discount: MDP discount.
    tau: Soft target update parameter.
    target_entropy: Target entropy.
    f_reg: Critic regularization weight.
    reward_bonus: Bonus added to the rewards.
    num_augmentations: Number of random crops (unused).
    env_name: Env name (unused).
    batch_size: Batch size.
  """
  del num_augmentations, env_name
  assert len(observation_spec.shape) == 1
  state_dim = observation_spec.shape[0]
  self.batch_size = batch_size

  hidden_dims = (256, 256, 256)
  self.actor = policies.DiagGuassianPolicy(
      state_dim, action_spec, hidden_dims=hidden_dims)
  self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)

  self.log_alpha = tf.Variable(tf.math.log(1.0), trainable=True)
  self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=alpha_lr)

  self.target_entropy = target_entropy
  self.discount = discount
  self.tau = tau

  self.bc = behavioral_cloning.BehavioralCloning(
      observation_spec, action_spec, mixture=True)

  action_dim = action_spec.shape[0]
  self.critic = critic.Critic(state_dim, action_dim, hidden_dims=hidden_dims)
  self.critic_target = critic.Critic(
      state_dim, action_dim, hidden_dims=hidden_dims)
  critic.soft_update(self.critic, self.critic_target, tau=1.0)
  self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)

  self.f_reg = f_reg
  self.reward_bonus = reward_bonus

  self.model_dict = {
      'critic': self.critic,
      'actor': self.actor,
      'critic_target': self.critic_target,
      'actor_optimizer': self.actor_optimizer,
      'critic_optimizer': self.critic_optimizer,
      'alpha_optimizer': self.alpha_optimizer
  }
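# model_dict above gathers every module and optimizer in one place, which
# makes checkpointing a one-liner. A usage sketch, assuming `agent` is an
# instance of this class (variable name and checkpoint path are hypothetical):
import tensorflow as tf

checkpoint = tf.train.Checkpoint(**agent.model_dict)
manager = tf.train.CheckpointManager(checkpoint, '/tmp/agent_ckpt',
                                     max_to_keep=1)
manager.save()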
def __init__(self,
             observation_spec,
             action_spec,
             actor_lr=3e-4,
             critic_lr=3e-4,
             alpha_lr=3e-4,
             discount=0.99,
             tau=0.005,
             target_update_period=1,
             target_entropy=0.0,
             cross_norm=False,
             pcl_actor_update=False):
  """Creates networks.

  Args:
    observation_spec: environment observation spec.
    action_spec: Action spec.
    actor_lr: Actor learning rate.
    critic_lr: Critic learning rate.
    alpha_lr: Temperature learning rate.
    discount: MDP discount.
    tau: Soft target update parameter.
    target_update_period: Target network update period.
    target_entropy: Target entropy.
    cross_norm: Whether to fit cross norm critic.
    pcl_actor_update: Whether to use PCL actor update.
  """
  actor_kwargs = {}
  critic_kwargs = {}

  if len(observation_spec.shape) == 3:  # Image observations.
    # DRQ encoder params.
    # https://github.com/denisyarats/drq/blob/master/config.yaml#L73
    state_dim = 50

    # Actor and critic encoders share conv weights only.
    conv_stack = ConvStack(observation_spec.shape)

    actor_kwargs['encoder'] = ImageEncoder(
        conv_stack, state_dim, bprop_conv_stack=False)
    actor_kwargs['hidden_dims'] = (1024, 1024)

    critic_kwargs['encoder'] = ImageEncoder(
        conv_stack, state_dim, bprop_conv_stack=True)
    critic_kwargs['hidden_dims'] = (1024, 1024)

    if not cross_norm:
      # Note: the target critic does not share any weights.
      critic_kwargs['encoder_target'] = ImageEncoder(
          ConvStack(observation_spec.shape), state_dim, bprop_conv_stack=True)
  else:  # 1D state observations.
    assert len(observation_spec.shape) == 1
    state_dim = observation_spec.shape[0]

  if cross_norm:
    beta_1 = 0.0
  else:
    beta_1 = 0.9

  self.actor = policies.DiagGuassianPolicy(
      state_dim, action_spec, **actor_kwargs)
  self.actor_optimizer = tf.keras.optimizers.Adam(
      learning_rate=actor_lr, beta_1=beta_1)

  self.log_alpha = tf.Variable(tf.math.log(0.1), trainable=True)
  self.alpha_optimizer = tf.keras.optimizers.Adam(
      learning_rate=alpha_lr, beta_1=beta_1)

  if cross_norm:
    assert 'encoder_target' not in critic_kwargs
    self.critic_learner = critic.CrossNormCriticLearner(
        state_dim, action_spec.shape[0], critic_lr, discount, tau,
        **critic_kwargs)
  else:
    self.critic_learner = critic.CriticLearner(
        state_dim, action_spec.shape[0], critic_lr, discount, tau,
        target_update_period, **critic_kwargs)

  self.target_entropy = target_entropy
  self.discount = discount
  self.pcl_actor_update = pcl_actor_update
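# Design note on the encoders above: actor and critic share one ConvStack, but
# bprop_conv_stack=False on the actor side means only critic losses train the
# shared conv weights (the DrQ recipe). A hypothetical sketch of how that flag
# could be implemented; the repo's real ImageEncoder may differ:
import tensorflow as tf

class ImageEncoderSketch(tf.keras.Model):
  """Projects conv features to a latent state, optionally blocking gradients."""

  def __init__(self, conv_stack, feature_dim, bprop_conv_stack):
    super().__init__()
    self.conv_stack = conv_stack
    self.fc = tf.keras.layers.Dense(feature_dim)
    self.bprop_conv_stack = bprop_conv_stack

  def call(self, obs):
    features = self.conv_stack(obs)
    if not self.bprop_conv_stack:
      # Gradients stop here, so actor updates leave the conv weights untouched.
      features = tf.stop_gradient(features)
    return self.fc(features)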
    bc_pretraining_steps: Use BC loss instead of CQL loss for N steps.
    min_q_weight: CQL alpha.
    num_augmentations: Number of DrQ-style random crops (unused).
    rep_learn_keywords: Representation learning loss to add (unused).
    batch_size: Batch size.
  """
  del num_augmentations, rep_learn_keywords
  assert len(observation_spec.shape) == 1
  state_dim = observation_spec.shape[0]
  self.batch_size = batch_size
  self.bc = None

  hidden_dims = (256, 256, 256)
  self.actor = policies.DiagGuassianPolicy(
      state_dim, action_spec, hidden_dims=hidden_dims)
  self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)

  self.log_alpha = tf.Variable(tf.math.log(1.0), trainable=True)
  self.log_cql_alpha = self.log_alpha
  self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)

  action_dim = action_spec.shape[0]
  self.critic = critic.Critic(state_dim, action_dim, hidden_dims=hidden_dims)
  self.critic_target = critic.Critic(
      state_dim, action_dim, hidden_dims=hidden_dims)
  critic.soft_update(self.critic, self.critic_target, tau=1.0)
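# log_cql_alpha above aliases log_alpha, so a single temperature serves both
# the entropy bonus and the CQL penalty weight. For reference, the standard
# CQL(H) regularizer that min_q_weight scales -- a sketch, assuming q_sampled
# holds Q-values of actions sampled per state and q_data those of the dataset
# actions:
import tensorflow as tf

def cql_penalty_sketch(q_sampled, q_data, min_q_weight):
  """Pushes down logsumexp of sampled Q, pushes up Q on dataset actions."""
  return min_q_weight * (
      tf.reduce_mean(tf.reduce_logsumexp(q_sampled, axis=1)) -
      tf.reduce_mean(q_data))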