def __init__(self, shape, min_value, max_value, alpha=0.0, beta=0.0, scope='beta', summary_labels=()):
    """
    Beta distribution used for continuous actions. In particular, the Beta distribution
    allows bounding action values between a min and a max value.

    Args:
        shape: Shape of actions.
        min_value: Min value of all actions for the given shape.
        max_value: Max value of all actions for the given shape.
        alpha: Concentration parameter of the Beta distribution.
        beta: Concentration parameter of the Beta distribution.
    """
    assert min_value is None or max_value > min_value
    self.shape = shape
    self.min_value = min_value
    self.max_value = max_value

    action_size = util.prod(self.shape)
    self.alpha = Linear(size=action_size, bias=alpha, scope='alpha')
    self.beta = Linear(size=action_size, bias=beta, scope='beta')

    super(Beta, self).__init__(scope, summary_labels)
def __init__(self, shape, mean=0.0, log_stddev=0.0, scope='gaussian', summary_labels=()):
    """
    Gaussian distribution.

    Args:
        shape: Action shape.
        mean: Optional distribution bias for the mean.
        log_stddev: Optional distribution bias for the standard deviation.
    """
    self.shape = shape
    action_size = util.prod(self.shape)

    self.mean = Linear(size=action_size, bias=mean, scope='mean', summary_labels=summary_labels)
    self.log_stddev = Linear(size=action_size, bias=log_stddev, scope='log-stddev', summary_labels=summary_labels)

    super(Gaussian, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels)
class NetworkBaseline(Baseline):
    """
    Baseline based on a TensorForce network, used when parameters are shared between
    the value function and the baseline.
    """

    def __init__(self, network_spec, scope='network-baseline', summary_labels=()):
        """
        Network baseline.

        Args:
            network_spec: Network specification dict
        """
        with tf.name_scope(name=scope):
            self.network = Network.from_spec(spec=network_spec)
            assert len(self.network.internal_inputs()) == 0

            self.linear = Linear(size=1, bias=0.0, scope='prediction')

        super(NetworkBaseline, self).__init__(scope, summary_labels)

    def tf_predict(self, states):
        embedding = self.network.apply(x=states)
        prediction = self.linear.apply(x=embedding)
        return tf.squeeze(input=prediction, axis=1)

    def tf_regularization_loss(self):
        """
        Creates the TensorFlow operations for the baseline regularization loss.

        Returns:
            Regularization loss tensor
        """
        regularization_loss = super(NetworkBaseline, self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        if self.network.get_regularization_loss() is not None:
            losses.append(self.network.get_regularization_loss())

        if self.linear.get_regularization_loss() is not None:
            losses.append(self.linear.get_regularization_loss())

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_non_trainable=False):
        baseline_variables = super(NetworkBaseline, self).get_variables(
            include_non_trainable=include_non_trainable
        )
        network_variables = self.network.get_variables(include_non_trainable=include_non_trainable)
        layer_variables = self.linear.get_variables(include_non_trainable=include_non_trainable)

        return baseline_variables + network_variables + layer_variables
def setup_components_and_tf_funcs(self, custom_getter=None):
    super(QNAFModel, self).setup_components_and_tf_funcs(custom_getter)

    self.state_values = dict()
    self.l_entries = dict()
    for name, action in self.actions_spec.items():
        num_action = util.prod(action['shape'])
        self.state_values[name] = Linear(size=num_action, scope='state-value')
        self.l_entries[name] = Linear(size=(num_action * (num_action - 1) // 2), scope='l-entries')
def initialize(self, custom_getter):
    super(QNAFModel, self).initialize(custom_getter)

    self.state_values = dict()
    self.l_entries = dict()
    for name, action in self.actions_spec.items():
        num_action = util.prod(action['shape'])
        self.state_values[name] = Linear(size=num_action, scope='state-value')
        self.l_entries[name] = Linear(size=(num_action * (num_action - 1) // 2), scope='l-entries')
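# Example (not from the original source): a minimal numpy sketch of why the
# 'l-entries' layer above has size num_action * (num_action - 1) // 2. NAF
# parameterizes the advantage term through a lower-triangular matrix L whose
# diagonal is produced by a separate (positive) head, so only the strictly
# lower-triangular entries are predicted. Names and values are illustrative.
import numpy as np

n = 4                                    # hypothetical action dimensionality
flat = np.arange(n * (n - 1) // 2)       # stand-in for the 'l-entries' output
L = np.zeros((n, n))
L[np.tril_indices(n, k=-1)] = flat       # scatter into the strict lower triangle
np.fill_diagonal(L, 1.0)                 # diagonal comes from the separate head
P = L @ L.T                              # positive (semi-)definite matrix used by NAF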
class NetworkBaseline(Baseline):
    """
    Baseline based on a TensorForce network, used when parameters are shared between
    the value function and the baseline.
    """

    def __init__(self, network, scope='network-baseline', summary_labels=()):
        """
        Network baseline.

        Args:
            network: Network specification dict
        """
        self.network = Network.from_spec(
            spec=network,
            kwargs=dict(summary_labels=summary_labels)
        )
        assert len(self.network.internals_spec()) == 0

        self.linear = Linear(size=1, bias=0.0, scope='prediction', summary_labels=summary_labels)

        super(NetworkBaseline, self).__init__(scope=scope, summary_labels=summary_labels)

    def tf_predict(self, states, internals, update):
        embedding = self.network.apply(x=states, internals=internals, update=update)
        prediction = self.linear.apply(x=embedding)
        return tf.squeeze(input=prediction, axis=1)

    def tf_regularization_loss(self):
        regularization_loss = super(NetworkBaseline, self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        regularization_loss = self.network.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        regularization_loss = self.linear.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_nontrainable=False):
        baseline_variables = super(NetworkBaseline, self).get_variables(include_nontrainable=include_nontrainable)
        network_variables = self.network.get_variables(include_nontrainable=include_nontrainable)
        layer_variables = self.linear.get_variables(include_nontrainable=include_nontrainable)

        return baseline_variables + network_variables + layer_variables
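# Example (not from the original source): a hedged usage sketch for the
# NetworkBaseline above, assuming TensorForce's list-of-layer-dicts network spec
# format; the layer sizes are illustrative, and tf_predict is shown only as a
# comment since it runs inside the model's graph-construction machinery.
baseline = NetworkBaseline(
    network=[
        dict(type='dense', size=32),
        dict(type='dense', size=32)
    ]
)
# value = baseline.tf_predict(states=states, internals=[], update=tf.constant(False))
# tf_predict squeezes the Linear(size=1) output to one scalar value per batch entry.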
def __init__(self, shape, num_actions, probabilities=None, scope='categorical', summary_labels=()):
    self.shape = shape
    self.num_actions = num_actions

    if probabilities is None:
        logits = 0.0
    else:
        logits = [log(prob) for _ in range(util.prod(shape)) for prob in probabilities]
    action_size = util.prod(self.shape) * self.num_actions
    self.logits = Linear(size=action_size, bias=logits, scope='logits')

    super(Categorical, self).__init__(scope, summary_labels)
def __init__(self, network, scope='network-baseline', summary_labels=()):
    """
    Network baseline.

    Args:
        network: Network specification dict
    """
    self.network = Network.from_spec(
        spec=network,
        kwargs=dict(summary_labels=summary_labels)
    )
    assert len(self.network.internals_spec()) == 0

    self.linear = Linear(size=1, bias=0.0, scope='prediction')

    super(NetworkBaseline, self).__init__(scope=scope, summary_labels=summary_labels)
def __init__(self, baselines, scope='aggregated-baseline', summary_labels=()):
    """
    Aggregated baseline.

    Args:
        baselines: Dict of per-state baseline specification dicts
    """
    self.baselines = dict()
    for name in sorted(baselines):
        self.baselines[name] = Baseline.from_spec(
            spec=baselines[name],
            kwargs=dict(summary_labels=summary_labels)
        )

    self.linear = Linear(size=1, bias=0.0, scope='prediction', summary_labels=summary_labels)

    super(AggregatedBaseline, self).__init__(scope, summary_labels)
def __init__(self, states_spec, actions_spec, network_spec, config):
    if any(action['type'] != 'float' or 'min_value' in action or 'max_value' in action
           for action in actions_spec.values()):
        raise TensorForceError("Only unconstrained float actions valid for NAFModel.")

    with tf.name_scope(name=config.scope):
        self.state_values = dict()
        self.l_entries = dict()
        for name, action in actions_spec.items():
            num_action = util.prod(action['shape'])
            self.state_values[name] = Linear(size=num_action, scope=(name + '-state-value'))
            self.l_entries[name] = Linear(size=(num_action * (num_action - 1) // 2), scope=(name + '-l-entries'))

    super(QNAFModel, self).__init__(
        states_spec=states_spec,
        actions_spec=actions_spec,
        network_spec=network_spec,
        config=config
    )
def __init__(self, shape, probability=0.5, scope='bernoulli', summary_labels=()):
    """
    Bernoulli distribution.

    Args:
        shape: Action shape.
        probability: Optional distribution bias.
    """
    self.shape = shape
    action_size = util.prod(self.shape)

    self.logit = Linear(size=action_size, bias=log(probability), scope='logit')

    super(Bernoulli, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels)
class Bernoulli(Distribution):
    """
    Bernoulli distribution, for binary actions.
    """

    def __init__(self, shape, probability=0.5, scope='bernoulli', summary_labels=()):
        self.shape = shape
        action_size = util.prod(self.shape)

        with tf.name_scope(name=scope):
            self.logit = Linear(size=action_size, bias=log(probability), scope='logit')

        super(Bernoulli, self).__init__(scope, summary_labels)

    def tf_parameterize(self, x):
        # Flat logit
        logit = self.logit.apply(x=x)

        # Reshape logit to action shape
        shape = (-1,) + self.shape
        logit = tf.reshape(tensor=logit, shape=shape)

        # TODO: rename
        state_value = logit

        # Sigmoid for corresponding probability
        probability = tf.sigmoid(x=logit)

        # Min epsilon probability for numerical stability
        probability = tf.clip_by_value(t=probability, clip_value_min=util.epsilon, clip_value_max=(1.0 - util.epsilon))

        # "Normalized" logits
        true_logit = tf.log(x=probability)
        false_logit = tf.log(x=(1.0 - probability))

        return true_logit, false_logit, probability, state_value

    def state_value(self, distr_params):
        _, _, _, state_value = distr_params
        return state_value

    def state_action_value(self, distr_params, action):
        true_logit, false_logit, _, state_value = distr_params
        logit = tf.where(condition=action, x=true_logit, y=false_logit)
        return logit + state_value

    def tf_sample(self, distr_params, deterministic):
        _, _, probability, _ = distr_params

        # Deterministic: true if >= 0.5
        definite = tf.greater_equal(x=probability, y=0.5)

        # Non-deterministic: sample true if >= uniform distribution
        uniform = tf.random_uniform(shape=tf.shape(probability))
        sampled = tf.greater_equal(x=probability, y=uniform)

        return tf.where(condition=deterministic, x=definite, y=sampled)

    def tf_log_probability(self, distr_params, action):
        true_logit, false_logit, _, _ = distr_params
        return tf.where(condition=action, x=true_logit, y=false_logit)

    def tf_entropy(self, distr_params):
        true_logit, false_logit, probability, _ = distr_params
        return -probability * true_logit - (1.0 - probability) * false_logit

    def tf_kl_divergence(self, distr_params1, distr_params2):
        true_logit1, false_logit1, probability1, _ = distr_params1
        true_logit2, false_logit2, _, _ = distr_params2
        true_log_prob_ratio = true_logit1 - true_logit2
        false_log_prob_ratio = false_logit1 - false_logit2
        return probability1 * true_log_prob_ratio + (1.0 - probability1) * false_log_prob_ratio

    def tf_regularization_loss(self):
        regularization_loss = super(Bernoulli, self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        regularization_loss = self.logit.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_non_trainable=False):
        distribution_variables = super(Bernoulli, self).get_variables(include_non_trainable=include_non_trainable)
        logit_variables = self.logit.get_variables(include_non_trainable=include_non_trainable)

        return distribution_variables + logit_variables

    def get_summaries(self):
        distribution_summaries = super(Bernoulli, self).get_summaries()
        logit_summaries = self.logit.get_summaries()

        return distribution_summaries + logit_summaries
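# Example (not from the original source): a pure-Python check of the "normalized"
# logits computed in tf_parameterize above. The probability is clipped to
# [eps, 1 - eps], the true/false logits are log(p) and log(1 - p), and the
# entropy expression matches tf_entropy. The epsilon value is illustrative.
from math import log, log1p

eps = 1e-6                                            # stand-in for util.epsilon
p = min(max(0.73, eps), 1.0 - eps)                    # mirrors tf.clip_by_value
true_logit, false_logit = log(p), log1p(-p)
entropy = -p * true_logit - (1.0 - p) * false_logit   # matches tf_entropy
assert entropy > 0.0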
class Categorical(Distribution):
    """
    Categorical distribution, for discrete actions.
    """

    def __init__(self, shape, num_actions, probabilities=None, scope='categorical', summary_labels=()):
        """
        Categorical distribution.

        Args:
            shape: Action shape.
            num_actions: Number of discrete action alternatives.
            probabilities: Optional distribution bias.
        """
        self.num_actions = num_actions

        action_size = util.prod(shape) * self.num_actions
        if probabilities is None:
            logits = 0.0
        else:
            logits = [log(prob) for _ in range(util.prod(shape)) for prob in probabilities]
        self.logits = Linear(size=action_size, bias=logits, scope='logits')

        super(Categorical, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels)

    def tf_parameterize(self, x):
        # Flat logits
        logits = self.logits.apply(x=x)

        # Reshape logits to action shape
        shape = (-1,) + self.shape + (self.num_actions,)
        logits = tf.reshape(tensor=logits, shape=shape)

        # Implied state value: log of the softmax normalizer over the action logits
        state_value = tf.reduce_logsumexp(input_tensor=logits, axis=-1)

        # Softmax for corresponding probabilities
        probabilities = tf.nn.softmax(logits=logits, axis=-1)

        # Min epsilon probability for numerical stability
        probabilities = tf.maximum(x=probabilities, y=util.epsilon)

        # "Normalized" logits
        logits = tf.log(x=probabilities)

        return logits, probabilities, state_value

    def state_value(self, distr_params):
        _, _, state_value = distr_params
        return state_value

    def state_action_value(self, distr_params, action=None):
        logits, _, state_value = distr_params
        if action is None:
            state_value = tf.expand_dims(input=state_value, axis=-1)
        else:
            one_hot = tf.one_hot(indices=action, depth=self.num_actions)
            logits = tf.reduce_sum(input_tensor=(logits * one_hot), axis=-1)
        return state_value + logits

    def tf_sample(self, distr_params, deterministic):
        logits, _, _ = distr_params

        # Deterministic: maximum likelihood action
        definite = tf.argmax(input=logits, axis=-1, output_type=util.tf_dtype('int'))

        # Non-deterministic: sample action using Gumbel distribution
        uniform_distribution = tf.random_uniform(
            shape=tf.shape(input=logits),
            minval=util.epsilon,
            maxval=(1.0 - util.epsilon)
        )
        gumbel_distribution = -tf.log(x=-tf.log(x=uniform_distribution))
        sampled = tf.argmax(input=(logits + gumbel_distribution), axis=-1, output_type=util.tf_dtype('int'))

        return tf.where(condition=deterministic, x=definite, y=sampled)

    def tf_log_probability(self, distr_params, action):
        logits, _, _ = distr_params
        one_hot = tf.one_hot(indices=action, depth=self.num_actions)
        return tf.reduce_sum(input_tensor=(logits * one_hot), axis=-1)

    def tf_entropy(self, distr_params):
        logits, probabilities, _ = distr_params
        return -tf.reduce_sum(input_tensor=(probabilities * logits), axis=-1)

    def tf_kl_divergence(self, distr_params1, distr_params2):
        logits1, probabilities1, _ = distr_params1
        logits2, _, _ = distr_params2
        log_prob_ratio = logits1 - logits2
        return tf.reduce_sum(input_tensor=(probabilities1 * log_prob_ratio), axis=-1)

    def tf_regularization_loss(self):
        regularization_loss = super(Categorical, self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        regularization_loss = self.logits.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_nontrainable=False):
        distribution_variables = super(Categorical, self).get_variables(include_nontrainable=include_nontrainable)
        logits_variables = self.logits.get_variables(include_nontrainable=include_nontrainable)

        return distribution_variables + logits_variables

    def get_summaries(self):
        distribution_summaries = super(Categorical, self).get_summaries()
        logits_summaries = self.logits.get_summaries()

        return distribution_summaries + logits_summaries
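# Example (not from the original source): a pure-Python sketch of the bias
# construction in Categorical.__init__ above. The log of each prior probability
# is tiled once per action component, matching the flattened logits layer of
# size prod(shape) * num_actions. The shape and prior are illustrative.
from math import log

shape, probabilities = (2,), [0.2, 0.3, 0.5]
prod_shape = 2                              # util.prod(shape) for shape == (2,)
logits_bias = [log(prob) for _ in range(prod_shape) for prob in probabilities]
assert len(logits_bias) == prod_shape * len(probabilities)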
class Beta(Distribution):
    """
    Beta distribution, for bounded continuous actions.
    """

    def __init__(self, shape, min_value, max_value, alpha=0.0, beta=0.0, scope='beta', summary_labels=()):
        """
        Beta distribution.

        Args:
            shape: Action shape.
            min_value: Minimum value of continuous actions.
            max_value: Maximum value of continuous actions.
            alpha: Optional distribution bias for the alpha value.
            beta: Optional distribution bias for the beta value.
        """
        assert min_value is None or max_value > min_value
        self.shape = shape
        self.min_value = min_value
        self.max_value = max_value

        action_size = util.prod(self.shape)
        self.alpha = Linear(size=action_size, bias=alpha, scope='alpha')
        self.beta = Linear(size=action_size, bias=beta, scope='beta')

        super(Beta, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels)

    def tf_parameterize(self, x):
        # Softplus to ensure alpha and beta >= 1
        # epsilon < 1.0, hence negative
        log_eps = log(util.epsilon)

        alpha = self.alpha.apply(x=x)
        alpha = tf.clip_by_value(t=alpha, clip_value_min=log_eps, clip_value_max=-log_eps)
        alpha = tf.log(x=(tf.exp(x=alpha) + 1.0)) + 1.0

        beta = self.beta.apply(x=x)
        beta = tf.clip_by_value(t=beta, clip_value_min=log_eps, clip_value_max=-log_eps)
        beta = tf.log(x=(tf.exp(x=beta) + 1.0)) + 1.0

        shape = (-1,) + self.shape
        alpha = tf.reshape(tensor=alpha, shape=shape)
        beta = tf.reshape(tensor=beta, shape=shape)

        alpha_beta = tf.maximum(x=(alpha + beta), y=util.epsilon)
        log_norm = tf.lgamma(x=alpha) + tf.lgamma(x=beta) - tf.lgamma(x=alpha_beta)

        return alpha, beta, alpha_beta, log_norm

    def tf_sample(self, distr_params, deterministic):
        alpha, beta, alpha_beta, _ = distr_params

        # Deterministic: mean as action
        definite = beta / alpha_beta

        # Non-deterministic: sample action using gamma distribution
        alpha_sample = tf.random_gamma(shape=(), alpha=alpha)
        beta_sample = tf.random_gamma(shape=(), alpha=beta)
        sampled = beta_sample / tf.maximum(x=(alpha_sample + beta_sample), y=util.epsilon)

        return self.min_value + (self.max_value - self.min_value) * \
            tf.where(condition=deterministic, x=definite, y=sampled)

    def tf_log_probability(self, distr_params, action):
        alpha, beta, _, log_norm = distr_params
        action = (action - self.min_value) / (self.max_value - self.min_value)
        action = tf.minimum(x=action, y=(1.0 - util.epsilon))
        return (beta - 1.0) * tf.log(x=tf.maximum(x=action, y=util.epsilon)) + \
            (alpha - 1.0) * tf.log1p(x=-action) - log_norm

    def tf_entropy(self, distr_params):
        alpha, beta, alpha_beta, log_norm = distr_params
        return log_norm - (beta - 1.0) * tf.digamma(x=beta) - (alpha - 1.0) * tf.digamma(x=alpha) + \
            (alpha_beta - 2.0) * tf.digamma(x=alpha_beta)

    def tf_kl_divergence(self, distr_params1, distr_params2):
        alpha1, beta1, alpha_beta1, log_norm1 = distr_params1
        alpha2, beta2, alpha_beta2, log_norm2 = distr_params2
        return log_norm2 - log_norm1 - tf.digamma(x=beta1) * (beta2 - beta1) - \
            tf.digamma(x=alpha1) * (alpha2 - alpha1) + tf.digamma(x=alpha_beta1) * (alpha_beta2 - alpha_beta1)

    def tf_regularization_loss(self):
        regularization_loss = super(Beta, self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        regularization_loss = self.alpha.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        regularization_loss = self.beta.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_nontrainable=False):
        distribution_variables = super(Beta, self).get_variables(include_nontrainable=include_nontrainable)
        alpha_variables = self.alpha.get_variables(include_nontrainable=include_nontrainable)
        beta_variables = self.beta.get_variables(include_nontrainable=include_nontrainable)

        return distribution_variables + alpha_variables + beta_variables
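# Example (not from the original source): a pure-Python check of the clipped
# softplus transform in Beta.tf_parameterize above. log(exp(x) + 1) + 1 maps any
# clipped pre-activation to a value >= 1, keeping alpha, beta >= 1 and hence the
# Beta distribution unimodal. The epsilon value is illustrative.
from math import exp, log

log_eps = log(1e-6)                          # stand-in for log(util.epsilon)
for x in (-100.0, log_eps, 0.0, -log_eps, 100.0):
    x = min(max(x, log_eps), -log_eps)       # mirrors tf.clip_by_value
    alpha = log(exp(x) + 1.0) + 1.0          # softplus(x) + 1
    assert alpha >= 1.0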
class Gaussian(Distribution):
    """
    Gaussian distribution, for unbounded continuous actions.
    """

    def __init__(self, shape, mean=0.0, log_stddev=0.0, scope='gaussian', summary_labels=()):
        self.shape = shape
        action_size = util.prod(self.shape)

        with tf.name_scope(name=scope):
            self.mean = Linear(size=action_size, bias=mean, scope='mean')
            self.log_stddev = Linear(size=action_size, bias=log_stddev, scope='log-stddev')

        super(Gaussian, self).__init__(scope, summary_labels)

    def tf_parameters(self, x):
        # Flat mean and log standard deviation
        mean = self.mean.apply(x=x)
        log_stddev = self.log_stddev.apply(x=x)

        # Reshape mean and log stddev to action shape
        shape = (-1,) + self.shape
        mean = tf.reshape(tensor=mean, shape=shape)
        log_stddev = tf.reshape(tensor=log_stddev, shape=shape)

        # Clip log stddev for numerical stability
        log_eps = log(util.epsilon)  # epsilon < 1.0, hence negative
        log_stddev = tf.clip_by_value(t=log_stddev, clip_value_min=log_eps, clip_value_max=-log_eps)

        # Standard deviation
        stddev = tf.exp(x=log_stddev)

        return mean, stddev, log_stddev

    def state_value(self, distr_params):
        _, _, log_stddev = distr_params
        return -log_stddev - 0.5 * log(2.0 * pi)

    def state_action_value(self, distr_params, action):
        mean, stddev, log_stddev = distr_params
        sq_mean_distance = tf.square(x=(action - mean))
        sq_stddev = tf.maximum(x=tf.square(x=stddev), y=util.epsilon)
        return -0.5 * sq_mean_distance / sq_stddev - 2.0 * log_stddev - log(2.0 * pi)

    def tf_sample(self, distr_params, deterministic):
        mean, stddev, _ = distr_params

        # Deterministic: mean as action
        definite = mean

        # Non-deterministic: sample action using default normal distribution
        normal_distribution = tf.random_normal(shape=tf.shape(input=mean))
        sampled = mean + stddev * normal_distribution

        return tf.where(condition=deterministic, x=definite, y=sampled)

    def tf_log_probability(self, distr_params, action):
        mean, stddev, log_stddev = distr_params
        sq_mean_distance = tf.square(x=(action - mean))
        sq_stddev = tf.maximum(x=tf.square(x=stddev), y=util.epsilon)
        return -0.5 * sq_mean_distance / sq_stddev - log_stddev - 0.5 * log(2.0 * pi)

    def tf_entropy(self, distr_params):
        _, _, log_stddev = distr_params
        return log_stddev + 0.5 * log(2.0 * pi * e)

    def tf_kl_divergence(self, distr_params1, distr_params2):
        mean1, stddev1, log_stddev1 = distr_params1
        mean2, stddev2, log_stddev2 = distr_params2
        log_stddev_ratio = log_stddev2 - log_stddev1
        sq_mean_distance = tf.square(x=(mean1 - mean2))
        sq_stddev1 = tf.square(x=stddev1)
        sq_stddev2 = tf.maximum(x=tf.square(x=stddev2), y=util.epsilon)
        return log_stddev_ratio + 0.5 * (sq_stddev1 + sq_mean_distance) / sq_stddev2 - 0.5

    def tf_regularization_loss(self):
        regularization_loss = super(Gaussian, self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        regularization_loss = self.mean.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        regularization_loss = self.log_stddev.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_non_trainable=False):
        distribution_variables = super(Gaussian, self).get_variables(include_non_trainable=include_non_trainable)
        mean_variables = self.mean.get_variables(include_non_trainable=include_non_trainable)
        log_stddev_variables = self.log_stddev.get_variables(include_non_trainable=include_non_trainable)

        return distribution_variables + mean_variables + log_stddev_variables
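# Example (not from the original source): a pure-Python cross-check of
# Gaussian.tf_log_probability above against the closed-form normal log-density.
# All numbers are illustrative.
from math import exp, log, pi

mean, log_stddev, action = 0.3, -0.5, 1.1
stddev = exp(log_stddev)
log_prob = -0.5 * (action - mean) ** 2 / stddev ** 2 - log_stddev - 0.5 * log(2.0 * pi)
density = exp(-0.5 * ((action - mean) / stddev) ** 2) / (stddev * (2.0 * pi) ** 0.5)
assert abs(log_prob - log(density)) < 1e-9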
class AggregatedBaseline(Baseline):
    """
    Baseline which aggregates per-state baselines.
    """

    def __init__(self, baselines, scope='aggregated-baseline', summary_labels=()):
        """
        Aggregated baseline.

        Args:
            baselines: Dict of per-state baseline specification dicts
        """
        with tf.name_scope(name=scope):
            self.baselines = dict()
            for name, baseline_spec in baselines.items():
                with tf.name_scope(name=(name + '-baseline')):
                    self.baselines[name] = Baseline.from_spec(
                        spec=baseline_spec,
                        kwargs=dict(summary_labels=summary_labels)
                    )

            self.linear = Linear(size=1, bias=0.0, scope='prediction')

        super(AggregatedBaseline, self).__init__(scope, summary_labels)

    def tf_predict(self, states):
        predictions = list()
        for name, state in states.items():
            prediction = self.baselines[name].predict(states=state)
            predictions.append(prediction)
        predictions = tf.stack(values=predictions, axis=1)
        prediction = self.linear.apply(x=predictions)
        return tf.squeeze(input=prediction, axis=1)

    def tf_regularization_loss(self):
        regularization_loss = super(AggregatedBaseline, self).tf_regularization_loss()
        if regularization_loss is None:
            losses = list()
        else:
            losses = [regularization_loss]

        for baseline in self.baselines.values():
            regularization_loss = baseline.regularization_loss()
            if regularization_loss is not None:
                losses.append(regularization_loss)

        regularization_loss = self.linear.regularization_loss()
        if regularization_loss is not None:
            losses.append(regularization_loss)

        if len(losses) > 0:
            return tf.add_n(inputs=losses)
        else:
            return None

    def get_variables(self, include_non_trainable=False):
        baseline_variables = super(AggregatedBaseline, self).get_variables(
            include_non_trainable=include_non_trainable
        )
        baselines_variables = [
            variable
            for name in sorted(self.baselines)
            for variable in self.baselines[name].get_variables(include_non_trainable=include_non_trainable)
        ]
        linear_variables = self.linear.get_variables(include_non_trainable=include_non_trainable)

        return baseline_variables + baselines_variables + linear_variables
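# Example (not from the original source): a hedged usage sketch for the
# AggregatedBaseline above. The per-state baseline types and their kwargs are
# assumptions about what Baseline.from_spec accepts, not confirmed by this code.
baseline = AggregatedBaseline(baselines=dict(
    camera=dict(type='cnn', conv_sizes=[32], dense_sizes=[32]),
    sensors=dict(type='mlp', sizes=[32, 32])
))
# tf_predict runs each per-state baseline, stacks the per-state predictions along
# axis 1, and feeds them through the final Linear(size=1) to get one value per
# batch entry.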
class DDPGCriticNetwork(LayerBasedNetwork):

    def __init__(self, scope='ddpg-critic-network', summary_labels=(), size_t0=400, size_t1=300):
        super(DDPGCriticNetwork, self).__init__(scope=scope, summary_labels=summary_labels)

        self.t0l = Linear(size=size_t0, scope='linear0')
        self.t0b = TFLayer(layer='batch_normalization', scope='batchnorm0', center=True, scale=True)
        self.t0n = Nonlinearity(name='relu', scope='relu0')

        self.t1l = Linear(size=size_t1, scope='linear1')
        self.t1b = TFLayer(layer='batch_normalization', scope='batchnorm1', center=True, scale=True)
        self.t1n = Nonlinearity(name='relu', scope='relu1')

        self.t2d = Dense(
            size=1,
            activation='tanh',
            scope='dense0',
            weights=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)
        )

        self.add_layer(self.t0l)
        self.add_layer(self.t0b)
        self.add_layer(self.t0n)
        self.add_layer(self.t1l)
        self.add_layer(self.t1b)
        self.add_layer(self.t1n)
        self.add_layer(self.t2d)

    def tf_apply(self, x, internals, update, return_internals=False):
        assert 'states' in x and 'actions' in x

        if isinstance(x['states'], dict):
            if len(x['states']) != 1:
                raise TensorForceError(
                    'DDPG critic network must have only one state input, but {} given.'.format(len(x['states']))
                )
            x_states = x['states'][next(iter(sorted(x['states'])))]
        else:
            x_states = x['states']

        if isinstance(x['actions'], dict):
            if len(x['actions']) != 1:
                raise TensorForceError(
                    'DDPG critic network must have only one action input, but {} given.'.format(len(x['actions']))
                )
            x_actions = x['actions'][next(iter(sorted(x['actions'])))]
        else:
            x_actions = x['actions']

        out = self.t0l.apply(x=x_states, update=update)
        out = self.t0b.apply(x=out, update=update)
        out = self.t0n.apply(x=out, update=update)

        # Actions are concatenated in after the first block
        out = self.t1l.apply(x=tf.concat([out, x_actions], axis=1), update=update)
        out = self.t1b.apply(x=out, update=update)
        out = self.t1n.apply(x=out, update=update)

        out = self.t2d.apply(x=out, update=update)

        # Remove last dimension because we only return Q values for one state and action
        # out = tf.squeeze(out)

        if return_internals:
            # TODO: internals management
            return out, None
        else:
            return out
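# Example (not from the original source): a hedged usage sketch for the
# DDPGCriticNetwork above under the TF1-style API it assumes; placeholder shapes
# are illustrative, and tf_apply is normally invoked through the network's
# template machinery rather than called directly.
import tensorflow as tf

critic = DDPGCriticNetwork(size_t0=400, size_t1=300)
states = tf.placeholder(dtype=tf.float32, shape=(None, 8))
actions = tf.placeholder(dtype=tf.float32, shape=(None, 2))
q_values = critic.tf_apply(
    x=dict(states=states, actions=actions),
    internals=[],
    update=tf.constant(False)
)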