def __init__(self, dim_state: int, dim_action: int, hidden_sizes: List[int],
             normalizer: GaussianNormalizer, init_std=1.):
    """Create the placeholders, network, ``log_std`` parameter and output ops.

    The network is a sequence of ``FCLayer`` + ``Tanh`` pairs, one pair per
    entry of ``hidden_sizes``, followed by a final ``FCLayer`` with
    ``init_scale=0.01`` that has ``dim_action`` outputs.  Calling the module on
    the ``states`` placeholder yields ``self.distribution``, from which the
    sample / mean / stddev ops and the two losses are derived.

    Args:
        dim_state: Width of the ``states`` placeholder and of the first layer's input.
        dim_action: Width of the ``actions`` placeholder and of the final layer's output.
        hidden_sizes: Output width of each hidden ``FCLayer``, in order.
        normalizer: Stored as ``self.normalizer``; not used further in this method.
        init_std: Value whose natural log fills the ``log_std`` parameter.
    """
    super().__init__()
    self.dim_state = dim_state
    self.dim_action = dim_action
    self.hidden_sizes = hidden_sizes
    self.init_std = init_std
    self.normalizer = normalizer

    # NOTE(review): indentation was unrecoverable in the reviewed text; the
    # end of this `with` block is assumed -- confirm against the real layout.
    with self.scope:
        self.op_states = tf.placeholder(tf.float32, shape=[None, dim_state], name='states')
        self.op_actions_ = tf.placeholder(tf.float32, shape=[None, dim_action], name='actions')

        widths = [dim_state, *self.hidden_sizes]
        # zip() stops at the shorter argument, so this pairs each width with its successor.
        hidden = [
            module
            for n_in, n_out in zip(widths, widths[1:])
            for module in (FCLayer(n_in, n_out), nn.Tanh())
        ]
        self.net = nn.Sequential(*hidden, FCLayer(widths[-1], dim_action, init_scale=0.01))

        log_std_init = tf.constant(
            np.log(self.init_std), shape=[self.dim_action], dtype=tf.float32)
        self.op_log_std = nn.Parameter(log_std_init, name='log_std')

    self.distribution = self(self.op_states)
    dist = self.distribution
    self.op_actions = dist.sample()
    self.op_actions_mean = dist.mean()
    self.op_actions_std = dist.stddev()

    # Mean squared error between the fed actions and the distribution mean.
    squared_error = tf.square(self.op_actions_ - self.op_actions_mean)
    self.op_mse_loss = tf.reduce_mean(squared_error)
    # Negative log-probability of the fed actions, summed over axis 1.
    self.op_nlls_ = -dist.log_prob(self.op_actions_).reduce_sum(axis=1)
def __init__(self, state_spec, action_spec, hidden_sizes=(64, 64)):
    """Create placeholders, an MLP trunk with two linear heads, and output ops.

    The trunk is a sequence of ``FCLayer(init_scale=sqrt(2))`` + ``Tanh``
    pairs.  ``pi_net`` and ``q_net`` are ``FCLayer`` heads with
    ``action_spec.n`` outputs each.  ``self.forward`` is expected to return
    the pair ``(pi_logits, q_values)`` for the ``states`` placeholder.

    Args:
        state_spec: Provides ``dtype`` and ``shape`` for the ``states``
            placeholder; ``shape[0]`` is the trunk's input width.
        action_spec: Provides ``dtype`` and ``shape`` for the ``actions``
            placeholder and ``n`` for the head widths.
        hidden_sizes: Output width of each hidden ``FCLayer``, in order.
    """
    super().__init__()
    self.state_spec = state_spec
    self.action_spec = action_spec
    self.hidden_sizes = hidden_sizes

    self.op_states = tf.placeholder(state_spec.dtype, [None, *state_spec.shape], 'states')
    self.op_actions_ = tf.placeholder(action_spec.dtype, [None, *action_spec.shape], 'actions')

    widths = [state_spec.shape[0], *hidden_sizes]
    trunk = []
    # zip() stops at the shorter argument, so this pairs each width with its successor.
    for n_in, n_out in zip(widths, widths[1:]):
        trunk += [FCLayer(n_in, n_out, init_scale=np.sqrt(2)), nn.Tanh()]
    self.mlp_net = nn.Sequential(*trunk)

    self.pi_net = FCLayer(widths[-1], action_spec.n, init_scale=0.01)
    self.q_net = FCLayer(widths[-1], action_spec.n)

    pi_logits, q_values = self.forward(self.op_states)

    self.pd = CategoricalPd(pi_logits)
    self.op_actions = self.pd.sample()
    self.op_actions_mean = self.pd.mode()
    self.op_mus = tf.nn.softmax(pi_logits)
    # Softmax-weighted sum of the Q-values over the last axis.
    self.op_v_values = tf.reduce_sum(self.op_mus * q_values, axis=-1)
    self.op_nlls = self.pd.neglogp(self.op_actions)
    # Q-values selected by the sampled actions and by the fed actions.
    self.op_q_values = get_by_index(q_values, self.op_actions)
    self.op_q_values_ = get_by_index(q_values, self.op_actions_)
def __init__(self, dim_state, dim_action, hidden_sizes):
    """Create state/action placeholders and two identically shaped networks.

    ``net1`` and ``net2`` each take ``dim_state + dim_action`` inputs, pass
    them through ``FCLayer`` + ``ReLU`` pairs (one per entry of
    ``hidden_sizes``) and end in an ``FCLayer`` with a single output.
    ``self.forward`` is expected to return the pair ``(op_q1, op_q2)``.

    Args:
        dim_state: Width of the state placeholder.
        dim_action: Width of the action placeholder.
        hidden_sizes: Output width of each hidden ``FCLayer``, in order.
    """
    super().__init__()
    self.dim_state = dim_state
    self.dim_action = dim_action
    self.hidden_sizes = hidden_sizes

    def build_net():
        # One network: (FCLayer, ReLU) per hidden width, then a 1-output FCLayer.
        widths = [dim_state + dim_action, *self.hidden_sizes]
        stack = []
        for n_in, n_out in zip(widths, widths[1:]):
            stack.extend((FCLayer(n_in, n_out), nn.ReLU()))
        stack.append(FCLayer(widths[-1], 1))
        return nn.Sequential(*stack)

    # NOTE(review): indentation was unrecoverable in the reviewed text; the
    # end of this `with` block is assumed -- confirm against the real layout.
    with self.scope:
        self.op_states = tf.placeholder(tf.float32, shape=[None, dim_state])
        self.op_actions = tf.placeholder(tf.float32, shape=[None, dim_action])

        # Built one after the other so layer creation order matches the inline version.
        self.net1 = build_net()
        self.net2 = build_net()

    self.op_q1, self.op_q2 = self.forward(self.op_states, self.op_actions)
def __init__(self, dim_state: int, dim_action: int, hidden_sizes: List[int],
             normalizer: Normalizers, save_normalizer=False):
    """Create placeholders, a (state, action) network and logit / reward ops.

    The network takes ``dim_state + dim_action`` inputs, applies ``FCLayer`` +
    ``ReLU`` pairs (one per entry of ``hidden_sizes``) and ends in an
    ``FCLayer`` with a single output.

    Args:
        dim_state: Width of the ``state`` placeholder.
        dim_action: Width of the ``action`` placeholder.
        hidden_sizes: Output width of each hidden ``FCLayer``, in order.
        normalizer: Object whose ``state`` method is wrapped by
            ``self.state_process_fn``.
        save_normalizer: When truthy, ``normalizer`` is additionally stored as
            ``self.normalizer``.
    """
    super().__init__()
    self.dim_state = dim_state
    self.dim_action = dim_action
    self.hidden_sizes = hidden_sizes

    # Capturing the normalizer in a closure, rather than storing it as an
    # attribute, avoids saving it into self.state_dict.
    self.state_process_fn = lambda states_: normalizer.state(states_)
    self.action_process_fn = lambda actions_: actions_
    if save_normalizer:
        self.normalizer = normalizer

    # NOTE(review): indentation was unrecoverable in the reviewed text; the
    # end of this `with` block is assumed -- confirm against the real layout.
    with self.scope:
        self.op_states = tf.placeholder(tf.float32, [None, dim_state], "state")
        self.op_actions = tf.placeholder(tf.float32, [None, dim_action], "action")

        widths = [dim_state + dim_action, *self.hidden_sizes]
        hidden = [
            module
            for n_in, n_out in zip(widths, widths[1:])
            for module in (FCLayer(n_in, n_out), nn.ReLU())
        ]
        self.net = nn.Sequential(*hidden, FCLayer(widths[-1], 1))

    self.op_logits = self(self.op_states, self.op_actions)
    # reward = -log(1 - sigmoid(logits) + 1e-6); the 1e-6 keeps the log argument above zero.
    one_minus_prob = 1 - tf.nn.sigmoid(self.op_logits)
    self.op_rewards = -tf.log(one_minus_prob + 1e-6)
def __init__(self, state_spec, action_spec):
    """Create placeholders, a ``NatureCNN`` trunk with two linear heads, and output ops.

    ``pi_net`` and ``q_net`` are ``FCLayer`` heads with 512 inputs and
    ``action_spec.n`` outputs.  ``self.forward`` is expected to return the
    pair ``(pi_logits, q_values)`` for the ``states`` placeholder.

    Args:
        state_spec: Provides ``dtype`` and ``shape`` for the ``states``
            placeholder; ``shape[-1]`` is passed to ``NatureCNN``.
        action_spec: Provides ``dtype`` and ``shape`` for the ``actions``
            placeholder and ``n`` for the head widths.
    """
    super().__init__()
    self.state_spec = state_spec
    self.action_spec = action_spec

    self.op_states = tf.placeholder(state_spec.dtype, [None, *state_spec.shape], 'states')
    self.op_actions_ = tf.placeholder(action_spec.dtype, [None, *action_spec.shape], 'actions')

    self.cnn_net = NatureCNN(state_spec.shape[-1])
    # Input width of both heads; presumably the NatureCNN output width -- confirm.
    feature_dim = 512
    self.pi_net = FCLayer(nin=feature_dim, nh=self.action_spec.n, init_scale=0.01)
    self.q_net = FCLayer(nin=feature_dim, nh=self.action_spec.n)

    pi_logits, q_values = self.forward(self.op_states)

    self.pd = CategoricalPd(pi_logits)
    self.op_actions = self.pd.sample()
    self.op_actions_mean = self.pd.mode()
    self.op_mus = tf.nn.softmax(pi_logits)
    # Softmax-weighted sum of the Q-values over the last axis.
    self.op_v_values = tf.reduce_sum(self.op_mus * q_values, axis=-1)
    self.op_nlls = self.pd.neglogp(self.op_actions)
    # Q-values selected by the sampled actions and by the fed actions.
    self.op_q_values = get_by_index(q_values, self.op_actions)
    self.op_q_values_ = get_by_index(q_values, self.op_actions_)
def __init__(self, dim_state, hidden_sizes, normalizer=None):
    """Create the network, the state placeholder and the value op.

    The network applies ``FCLayer`` + ``Tanh`` pairs (one per entry of
    ``hidden_sizes``) and ends in an ``FCLayer`` with a single output.

    Args:
        dim_state: Width of the state placeholder and of the first layer's input.
        hidden_sizes: Output width of each hidden ``FCLayer``, in order.
        normalizer: Stored as ``self.normalizer``; not used further in this method.
    """
    super().__init__()
    self.hidden_sizes = hidden_sizes

    widths = [dim_state, *self.hidden_sizes]
    # zip() stops at the shorter argument, so this pairs each width with its successor.
    hidden = [
        module
        for n_in, n_out in zip(widths, widths[1:])
        for module in (FCLayer(n_in, n_out), nn.Tanh())
    ]
    self.net = nn.Sequential(*hidden, FCLayer(widths[-1], 1))

    self.normalizer = normalizer
    self.op_states = tf.placeholder(tf.float32, shape=[None, dim_state])
    self.op_values = self.forward(self.op_states)
def __init__(self, dim_state: int, dim_action: int, hidden_sizes: List[int],
             state_process_fn, action_process_fn, activ_fn='none'):
    """Create placeholders, a (state, action, next_state) network and logit / reward ops.

    The network takes ``dim_state * 2 + dim_action`` inputs, applies
    ``FCLayer`` + ``ReLU`` pairs (one per entry of ``hidden_sizes``), ends in
    an ``FCLayer`` with a single output, and optionally appends an output
    activation selected by ``activ_fn``.

    Args:
        dim_state: Width of the ``state`` and ``next_state`` placeholders.
        dim_action: Width of the ``action`` placeholder.
        hidden_sizes: Output width of each hidden ``FCLayer``, in order.
        state_process_fn: Stored as ``self.state_process_fn``.
        action_process_fn: Stored as ``self.action_process_fn``.
        activ_fn: ``'none'`` (no output activation), ``'sigmoid'`` or ``'tanh'``.

    Raises:
        ValueError: If ``activ_fn`` is not one of the supported names.
    """
    # Validate first so that an unsupported value fails before any
    # placeholder or layer has been created, instead of half-way through.
    if activ_fn not in ('none', 'sigmoid', 'tanh'):
        raise ValueError('%s is not supported' % activ_fn)

    super().__init__()
    self.dim_state = dim_state
    self.dim_action = dim_action
    self.hidden_sizes = hidden_sizes

    # The processing functions are received as plain callables; per the
    # original note, this avoids saving a normalizer into self.state_dict.
    self.state_process_fn = state_process_fn
    self.action_process_fn = action_process_fn

    # NOTE(review): indentation was unrecoverable in the reviewed text; the
    # end of this `with` block is assumed -- confirm against the real layout.
    with self.scope:
        self.op_states = tf.placeholder(tf.float32, [None, dim_state], "state")
        self.op_actions = tf.placeholder(tf.float32, [None, dim_action], "action")
        self.op_next_states = tf.placeholder(tf.float32, [None, dim_state], "next_state")

        widths = [dim_state * 2 + dim_action, *self.hidden_sizes]
        layers = []
        # zip() stops at the shorter argument, so this pairs each width with its successor.
        for n_in, n_out in zip(widths, widths[1:]):
            layers.append(FCLayer(n_in, n_out))
            layers.append(nn.ReLU())
        layers.append(FCLayer(widths[-1], 1))

        # 'none' appends nothing; the value was validated above.
        if activ_fn == 'sigmoid':
            layers.append(nn.Sigmoid())
        elif activ_fn == 'tanh':
            layers.append(nn.Tanh())
        self.net = nn.Sequential(*layers)

    self.op_logits = self(self.op_states, self.op_actions, self.op_next_states)
    # NOTE(review): with activ_fn='sigmoid' the network already ends in
    # nn.Sigmoid, and tf.nn.sigmoid is applied again here (assuming forward
    # returns the network output) -- confirm this is intended.
    self.op_rewards = -tf.log(1 - tf.nn.sigmoid(self.op_logits) + 1e-6)
def __init__(self, dim_state, dim_action, hidden_sizes: List[int]):
    """Create the state placeholder, the network and the action op.

    The network applies ``FCLayer`` + ``ReLU`` pairs (one per entry of
    ``hidden_sizes``), then an ``FCLayer`` with ``init_scale=0.01`` and
    ``dim_action`` outputs, then a final ``Tanh``.

    Args:
        dim_state: Width of the ``states`` placeholder.
        dim_action: Output width of the final ``FCLayer``.
        hidden_sizes: Output width of each hidden ``FCLayer``, in order.
    """
    super().__init__()
    self.dim_state = dim_state
    self.dim_action = dim_action
    self.hidden_sizes = hidden_sizes

    # NOTE(review): indentation was unrecoverable in the reviewed text; the
    # end of this `with` block is assumed -- confirm against the real layout.
    with self.scope:
        self.op_states = tf.placeholder(tf.float32, shape=[None, dim_state], name='states')

        widths = [dim_state, *self.hidden_sizes]
        stack = []
        # zip() stops at the shorter argument, so this pairs each width with its successor.
        for n_in, n_out in zip(widths, widths[1:]):
            stack += [FCLayer(n_in, n_out), nn.ReLU()]
        # Output head: small-init linear layer followed by Tanh.
        stack += [FCLayer(widths[-1], dim_action, init_scale=0.01), nn.Tanh()]
        self.net = nn.Sequential(*stack)

    self.op_actions = self(self.op_states)
def __init__(self, dim_state: int, dim_action: int, hidden_sizes: List[int]):
    """Create placeholders, the network, and sampling / log-density ops.

    The network applies ``FCLayer`` + ``ReLU`` pairs (one per entry of
    ``hidden_sizes``) and ends in an ``FCLayer`` with ``dim_action * 2``
    outputs.  Calling the module on the ``states`` placeholder is expected to
    return ``(actions, log_density, distribution, mean, log_std)``.

    Args:
        dim_state: Width of the ``states`` placeholder.
        dim_action: Width of the ``actions`` placeholder; the final layer has
            twice this many outputs.
        hidden_sizes: Output width of each hidden ``FCLayer``, in order.
    """
    super().__init__()
    self.dim_state = dim_state
    self.dim_action = dim_action
    self.hidden_sizes = hidden_sizes

    # NOTE(review): indentation was unrecoverable in the reviewed text; the
    # end of this `with` block is assumed -- confirm against the real layout.
    with self.scope:
        self.op_states = tf.placeholder(tf.float32, shape=[None, dim_state], name='states')
        self.op_actions_ = tf.placeholder(tf.float32, shape=[None, dim_action], name='actions')

        widths = [dim_state, *self.hidden_sizes]
        hidden = [
            module
            for n_in, n_out in zip(widths, widths[1:])
            for module in (FCLayer(n_in, n_out), nn.ReLU())
        ]
        self.net = nn.Sequential(*hidden, FCLayer(widths[-1], dim_action * 2))

    (self.op_actions,
     self.op_log_density,
     pd,
     self.op_dist_mean,
     self.op_dist_log_std) = self(self.op_states)
    self.op_actions_mean = tf.tanh(self.op_dist_mean)

    # Log-density of the fed actions: map them back through atanh (clipped
    # away from +/-1 by EPS), score them under `pd`, then subtract the
    # sum over axis 1 of log(1 - a^2 + EPS).
    pre_tanh_actions = tf.atanh(clip_but_pass_gradient(self.op_actions_, -1 + EPS, 1 - EPS))
    base_log_prob = pd.log_prob(pre_tanh_actions).reduce_sum(axis=1)
    squash_correction = tf.reduce_sum(tf.log(1 - self.op_actions_ ** 2 + EPS), axis=1)
    self.op_log_density_ = base_log_prob - squash_correction
def __init__(self, dim_state: int, dim_action: int, hidden_sizes: List[int],
             normalizers: Normalizers, output_diff=False, init_std=1.):
    """Create placeholders, the network, ``log_std`` and next-state ops.

    The network takes ``dim_state + dim_action`` inputs, applies ``FCLayer`` +
    ``Tanh`` pairs (one per entry of ``hidden_sizes``) and ends in an
    ``FCLayer`` with ``init_scale=0.01`` and ``dim_state`` outputs.  Calling
    the module on the state / action placeholders yields
    ``self.distribution``; samples are clipped to mean +/- 3 stddev before
    being mapped back through the inverse normalizer.

    Args:
        dim_state: Width of the ``states`` / ``next_states`` placeholders and
            of the network output.
        dim_action: Width of the ``actions`` placeholder.
        hidden_sizes: Output width of each hidden ``FCLayer``, in order.
        normalizers: Provides ``state`` and ``diff`` transforms (both called
            with ``inverse=True`` here; ``state`` also without).
        output_diff: When truthy, the distribution output is passed through
            the inverse ``diff`` transform and added to the current states;
            otherwise it is passed through the inverse ``state`` transform.
        init_std: Value whose natural log fills the ``log_std`` parameter.
    """
    super().__init__()
    self.dim_state = dim_state
    self.dim_action = dim_action
    self.hidden_sizes = hidden_sizes
    self.output_diff = output_diff
    self.init_std = init_std
    self.normalizers = normalizers

    # NOTE(review): indentation was unrecoverable in the reviewed text; the
    # end of this `with` block is assumed -- confirm against the real layout.
    with self.scope:
        self.op_states = tf.placeholder(tf.float32, shape=[None, dim_state], name='states')
        self.op_actions = tf.placeholder(tf.float32, shape=[None, dim_action], name='actions')
        self.op_next_states_ = tf.placeholder(
            tf.float32, shape=[None, dim_state], name='next_states')

        widths = [dim_state + dim_action, *self.hidden_sizes]
        stack = []
        # zip() stops at the shorter argument, so this pairs each width with its successor.
        for n_in, n_out in zip(widths, widths[1:]):
            stack += [FCLayer(n_in, n_out), nn.Tanh()]
        stack.append(FCLayer(widths[-1], dim_state, init_scale=0.01))
        self.net = nn.Sequential(*stack)

        log_std_init = tf.constant(
            np.log(self.init_std), shape=[self.dim_state], dtype=tf.float32)
        self.op_log_std = nn.Parameter(log_std_init, name='log_std')

    self.distribution = self(self.op_states, self.op_actions)
    self.op_next_states_std = self.distribution.stddev()

    def clipped_sample():
        # A sample limited to [mean - 3*std, mean + 3*std]; the distribution
        # methods are called in the same order as in the inline expression.
        dist = self.distribution
        return tf.clip_by_value(
            dist.sample(),
            dist.mean() - 3 * dist.stddev(),
            dist.mean() + 3 * dist.stddev())

    if self.output_diff:
        self.op_next_states_mean = self.op_states + self.normalizers.diff(
            self.distribution.mean(), inverse=True)
        self.op_next_states = self.op_states + self.normalizers.diff(
            clipped_sample(), inverse=True)
    else:
        self.op_next_states_mean = self.normalizers.state(
            self.distribution.mean(), inverse=True)
        self.op_next_states = self.normalizers.state(clipped_sample(), inverse=True)

    # MSE between the fed next states and the predicted mean, both passed
    # through the (forward) state normalizer.
    normalized_target = self.normalizers.state(self.op_next_states_)
    normalized_prediction = self.normalizers.state(self.op_next_states_mean)
    self.op_mse_loss = tf.reduce_mean(tf.square(normalized_target - normalized_prediction))