# Shared imports for the snippets below; `nn` is the project's own torch-like
# module wrapper over TensorFlow 1.x, not a standard import.
from typing import List

import numpy as np
import tensorflow as tf


def __init__(self, name: str, shape: List[int], eps=1e-8, verbose=False):
    # `shape` excludes the batch dimension: inputs are [batch_size, *shape].
    super().__init__()
    self.name = name
    self.shape = shape
    self.eps = eps
    self._verbose = verbose
    with self.scope:
        # Running statistics; updated manually rather than by an optimizer,
        # hence non-trainable.
        self.op_mean = nn.Parameter(tf.zeros(shape, dtype=tf.float32), name='mean', trainable=False)
        self.op_std = nn.Parameter(tf.ones(shape, dtype=tf.float32), name='std', trainable=False)
        self.op_n = nn.Parameter(tf.zeros([], dtype=tf.int64), name='n', trainable=False)
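# A minimal sketch (not the library's actual update method) of how the running
# statistics above could be refreshed from a batch, assuming `nn.Parameter`
# behaves like a `tf.Variable` and a live `tf.Session`. It merges stored and
# batch moments via the parallel-variance formula (Chan et al.). In practice
# one would pre-build the assign ops with placeholders instead of creating
# them on every call.
def update_normalizer_sketch(norm, samples, sess):
    m = samples.shape[0]
    n, mean, std = sess.run([norm.op_n, norm.op_mean, norm.op_std])
    batch_mean = samples.mean(axis=0)
    batch_var = samples.var(axis=0)
    new_n = n + m
    new_mean = (n * mean + m * batch_mean) / new_n
    # Merge variances, accounting for the shift of each mean from the new mean.
    new_var = (n * (std ** 2 + (mean - new_mean) ** 2)
               + m * (batch_var + (batch_mean - new_mean) ** 2)) / new_n
    sess.run([tf.assign(norm.op_mean, new_mean.astype(np.float32)),
              tf.assign(norm.op_std, np.sqrt(new_var).astype(np.float32)),
              tf.assign(norm.op_n, new_n)])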
def __init__(self, nin, nf, rf, stride, padding='VALID', init_scale=1.0):
    super().__init__()
    self.strides = [1, stride, stride, 1]  # NHWC strides for tf.nn.conv2d.
    self.padding = padding
    w_shape = [rf, rf, nin, nf]  # [filter_height, filter_width, in_channels, out_channels]
    b_shape = [1, 1, 1, nf]      # broadcasts over batch and spatial dimensions
    self.w = nn.Parameter(ortho_initializer(init_scale)(w_shape, np.float32), dtype=tf.float32, name="w")
    self.b = nn.Parameter(tf.constant_initializer(0.0)(b_shape), dtype=tf.float32, name="b")
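# A hypothetical forward pass for the conv layer above (the helper name and
# the NHWC layout are assumptions about the surrounding nn wrapper, not taken
# from this codebase):
def conv_forward_sketch(layer, x):
    # x: [batch, height, width, nin]; the [1, 1, 1, nf] bias broadcasts
    # across the batch and spatial dimensions.
    return tf.nn.conv2d(x, layer.w, strides=layer.strides, padding=layer.padding) + layer.b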
def __init__(self, dim_state: int, dim_action: int, hidden_sizes: List[int],
             normalizer: GaussianNormalizer, init_std=1.):
    super().__init__()
    self.dim_state = dim_state
    self.dim_action = dim_action
    self.hidden_sizes = hidden_sizes
    self.init_std = init_std
    self.normalizer = normalizer
    with self.scope:
        self.op_states = tf.placeholder(tf.float32, shape=[None, dim_state], name='states')
        self.op_actions_ = tf.placeholder(tf.float32, shape=[None, dim_action], name='actions')

        # MLP with tanh hidden layers; the output layer uses a small init
        # scale so initial actions stay near zero.
        layers = []
        all_sizes = [dim_state, *self.hidden_sizes]
        for in_features, out_features in zip(all_sizes[:-1], all_sizes[1:]):
            layers.append(nn.Linear(in_features, out_features, weight_initializer=normc_initializer(1)))
            layers.append(nn.Tanh())
        layers.append(nn.Linear(all_sizes[-1], dim_action, weight_initializer=normc_initializer(0.01)))
        self.net = nn.Sequential(*layers)

        # State-independent log-std, shared across the batch.
        self.op_log_std = nn.Parameter(
            tf.constant(np.log(self.init_std), shape=[self.dim_action], dtype=tf.float32),
            name='log_std')

        self.distribution = self(self.op_states)
        self.op_actions = self.distribution.sample()
        self.op_actions_mean = self.distribution.mean()
        self.op_actions_std = self.distribution.stddev()
        self.op_nlls_ = -self.distribution.log_prob(self.op_actions_).reduce_sum(axis=1)
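# Illustrative use of the ops built above: per-sample negative log-likelihoods
# of given actions under the policy (e.g. for behavior cloning). The helper
# name is hypothetical and a live tf.Session is assumed.
def action_nll_sketch(policy, states, actions, sess):
    return sess.run(policy.op_nlls_, feed_dict={policy.op_states: states,
                                                policy.op_actions_: actions})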
def __init__(self, qfns: List[lz.rl.BaseNNQFunction], policy: lz.rl.BaseNNPolicy,
             dim_state: int, dim_action: int, *, alpha):
    super().__init__()
    self.qfns = qfns
    self.qfns_target = [qfn.copy() for qfn in qfns]
    self.policy = policy
    self.dim_action = dim_action
    with self.scope:
        self.op_states = tf.placeholder(tf.float32, [None, dim_state])
        self.op_actions = tf.placeholder(tf.float32, [None, dim_action])
        self.op_rewards = tf.placeholder(tf.float32, [None])
        self.op_next_states = tf.placeholder(tf.float32, [None, dim_state])
        self.op_dones = tf.placeholder(tf.float32, [None])

        if alpha:
            # A truthy alpha fixes the entropy temperature.
            self.auto_entropy = False
            self.op_alpha = tf.constant(alpha, dtype=tf.float32)
        else:
            # Otherwise the temperature is learned; alpha = exp(log_alpha) keeps it positive.
            self.auto_entropy = True
            self.log_alpha = nn.Parameter(0.0, name='alpha', dtype=tf.float32)
            self.op_alpha = tf.exp(self.log_alpha)

        self.op_qfn_losses, self.op_train_qfn = self.train_qfn(
            self.op_states, self.op_actions, self.op_rewards, self.op_next_states, self.op_dones)
        # Update the policy (and alpha) only after the Q-function step has run.
        with tf.control_dependencies([self.op_train_qfn]):
            self.op_train_policy, self.op_train_alpha = self.train_policy(self.op_states)
        self.op_update_targets = self.update_targets()
    self._n_updates = 0
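# Illustrative training step for the trainer above (the helper name and the
# batch field names are hypothetical; assumes a live tf.Session). With
# auto_entropy enabled one would also fetch op_train_alpha.
def training_step_sketch(trainer, batch, sess):
    losses, _, _, _ = sess.run(
        [trainer.op_qfn_losses, trainer.op_train_qfn,
         trainer.op_train_policy, trainer.op_update_targets],
        feed_dict={
            trainer.op_states: batch['states'],
            trainer.op_actions: batch['actions'],
            trainer.op_rewards: batch['rewards'],
            trainer.op_next_states: batch['next_states'],
            trainer.op_dones: batch['dones'],
        })
    trainer._n_updates += 1
    return losses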
def __init__(self, x, n_total_blocks):
    super().__init__()
    # Fixup-style residual block (cf. Zhang et al., 2019): scalar biases and a
    # scalar scale stand in for normalization layers; the first linear layer
    # gets a normal init shrunk by the total number of blocks, and the second
    # is zero-initialized so the block starts as the identity.
    std = np.sqrt(2. / x / n_total_blocks)
    self.bias1a = nn.Parameter(tf.zeros(1), name='bias1a')
    self.fc1 = nn.Linear(x, x, bias=False,
                         weight_initializer=tf.initializers.random_normal(0, stddev=std))
    self.bias1b = nn.Parameter(tf.zeros(1), name='bias1b')
    self.relu = nn.ReLU()
    self.bias2a = nn.Parameter(tf.zeros(1), name='bias2a')
    self.fc2 = nn.Linear(x, x, bias=False, weight_initializer=tf.initializers.zeros())
    self.scale = nn.Parameter(tf.ones(1), name='scale')
    self.bias2b = nn.Parameter(tf.zeros(1), name='bias2b')
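# A hypothetical forward pass matching the Fixup convention (assumed, not
# taken from this codebase): a bias is added before each layer, the zero-init
# second layer's output is scaled, and the input is added back as a residual.
def fixup_block_forward_sketch(block, x):
    out = block.fc1(x + block.bias1a)
    out = block.relu(out + block.bias1b)
    out = block.fc2(out + block.bias2a)
    return x + out * block.scale + block.bias2b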
def __init__(self, dim_state: int, dim_action: int, actor: Actor, critic: Critic,
             init_alpha: float, gamma: float, target_entropy: float,
             actor_lr: float, critic_lr: float, alpha_lr: float, tau: float,
             actor_update_freq: int, target_update_freq: int, learn_alpha: bool):
    super().__init__()
    self.actor = actor
    self.critic = critic
    self.critic_target = self.critic.clone()
    self.gamma = gamma
    self.target_entropy = target_entropy
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.alpha_lr = alpha_lr
    self.tau = tau
    self.actor_update_freq = actor_update_freq
    self.target_update_freq = target_update_freq
    self.learn_alpha = learn_alpha
    with self.scope:
        self.op_states = tf.placeholder(tf.float32, [None, dim_state], 'states')
        self.op_actions = tf.placeholder(tf.float32, [None, dim_action], 'actions')
        self.op_next_states = tf.placeholder(tf.float32, [None, dim_state], 'next_states')
        self.op_rewards = tf.placeholder(tf.float32, [None], 'rewards')
        self.op_terminals = tf.placeholder(tf.float32, [None], 'terminals')
        self.op_tau = tf.placeholder(tf.float32, [], 'tau')
        self.op_log_alpha = nn.Parameter(tf.log(init_alpha), name="log_alpha")

        # Polyak averaging for the target critic:
        # target <- tau * target + (1 - tau) * source.
        target_params, source_params = self.critic_target.parameters(), self.critic.parameters()
        self.op_update_critic_target = tf.group(
            *[tf.assign(v_t, self.op_tau * v_t + (1 - self.op_tau) * v_s)
              for v_t, v_s in zip(target_params, source_params)])

        (self.op_actor_loss, self.op_critic_loss, self.op_alpha_loss, self.op_entropy,
         self.op_q_value, self.op_dist_mean, self.op_dist_std, self.op_a1, self.op_a2,
         self.op_log_prob_a1) = self(
            states=self.op_states,
            actions=self.op_actions,
            next_states=self.op_next_states,
            rewards=self.op_rewards,
            terminals=self.op_terminals,
            log_alpha=self.op_log_alpha,
        )

        # Separate Adam optimizers so each loss updates only its own variables.
        actor_optimizer = tf.train.AdamOptimizer(learning_rate=self.actor_lr)
        critic_optimizer = tf.train.AdamOptimizer(learning_rate=self.critic_lr)
        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=self.alpha_lr)
        self.op_actor_train = actor_optimizer.minimize(self.op_actor_loss, var_list=self.actor.parameters())
        self.op_critic_train = critic_optimizer.minimize(self.op_critic_loss, var_list=self.critic.parameters())
        self.op_alpha_train = alpha_optimizer.minimize(self.op_alpha_loss, var_list=[self.op_log_alpha])

        self.op_actor_norm = tf.global_norm(self.actor.parameters())
        self.op_critic_norm = tf.global_norm(self.critic.parameters())
        self.op_alpha = tf.exp(self.op_log_alpha)
    self.iterations = 0
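# op_alpha_loss is produced inside self(...), which is not shown here. For
# reference, the standard SAC temperature objective (Haarnoja et al., 2018)
# is sketched below; whether this matches the class's actual definition is an
# assumption.
def alpha_loss_sketch(log_alpha, log_prob_a1, target_entropy):
    # The gradient w.r.t. log_alpha pushes the policy's entropy toward
    # target_entropy; stop_gradient keeps the actor out of this objective.
    return -tf.reduce_mean(tf.exp(log_alpha) * tf.stop_gradient(log_prob_a1 + target_entropy))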
def __init__(self, nin, nh, init_scale=1., init_bias=0.):
    super().__init__()
    self.w = nn.Parameter(ortho_initializer(init_scale)([nin, nh], np.float32), "w")
    self.b = nn.Parameter(tf.constant_initializer(init_bias)([nh]), "b")
def __init__(self, dim_state: int, dim_action: int, hidden_sizes: List[int],
             normalizers: Normalizers, output_diff=False, init_std=1.):
    super().__init__()
    self.dim_state = dim_state
    self.dim_action = dim_action
    self.hidden_sizes = hidden_sizes
    self.output_diff = output_diff
    self.init_std = init_std
    self.normalizers = normalizers
    with self.scope:
        self.op_states = tf.placeholder(tf.float32, shape=[None, dim_state], name='states')
        self.op_actions = tf.placeholder(tf.float32, shape=[None, dim_action], name='actions')
        self.op_next_states_ = tf.placeholder(tf.float32, shape=[None, dim_state], name='next_states')

        layers = []
        all_sizes = [dim_state + dim_action, *self.hidden_sizes]
        for in_features, out_features in zip(all_sizes[:-1], all_sizes[1:]):
            layers.append(FCLayer(in_features, out_features))
            layers.append(nn.Tanh())
        layers.append(FCLayer(all_sizes[-1], dim_state, init_scale=0.01))
        self.net = nn.Sequential(*layers)

        self.op_log_std = nn.Parameter(
            tf.constant(np.log(self.init_std), shape=[self.dim_state], dtype=tf.float32),
            name='log_std')

        self.distribution = self(self.op_states, self.op_actions)
        self.op_next_states_std = self.distribution.stddev()
        # Clip samples to mean +/- 3 std before de-normalizing, to avoid
        # occasional extreme model rollouts.
        clipped_sample = tf.clip_by_value(
            self.distribution.sample(),
            self.distribution.mean() - 3 * self.distribution.stddev(),
            self.distribution.mean() + 3 * self.distribution.stddev())
        if self.output_diff:
            # The network predicts the normalized state difference.
            self.op_next_states_mean = self.op_states + self.normalizers.diff(
                self.distribution.mean(), inverse=True)
            self.op_next_states = self.op_states + self.normalizers.diff(clipped_sample, inverse=True)
        else:
            # The network predicts the normalized next state directly.
            self.op_next_states_mean = self.normalizers.state(self.distribution.mean(), inverse=True)
            self.op_next_states = self.normalizers.state(clipped_sample, inverse=True)

        # MSE in normalized state space.
        self.op_mse_loss = tf.reduce_mean(tf.square(
            self.normalizers.state(self.op_next_states_)
            - self.normalizers.state(self.op_next_states_mean)))
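# Illustrative one-step model rollout using the ops built above (the helper
# name is hypothetical; assumes a live tf.Session). Samples are already
# clipped to mean +/- 3 std inside the graph.
def rollout_step_sketch(model, states, actions, sess):
    return sess.run(model.op_next_states,
                    feed_dict={model.op_states: states, model.op_actions: actions})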