def sample(self, *, parameters, temperature):
    logits, probabilities, action_values = parameters.get(
        ('logits', 'probabilities', 'action_values'))

    # Distribution parameter summaries
    def fn_summary():
        axis = range(self.action_spec.rank + 1)
        probs = tf.math.reduce_mean(input_tensor=probabilities, axis=axis)
        return [probs[n] for n in range(self.action_spec.num_values)]

    prefix = 'distributions/' + self.name + '-probability'
    names = [prefix + str(n) for n in range(self.action_spec.num_values)]
    dependencies = self.summary(
        label='distribution', name=names, data=fn_summary, step='timesteps')

    # Entropy summary
    def fn_summary():
        entropy = -tf.reduce_sum(input_tensor=(probabilities * logits), axis=-1)
        return tf.math.reduce_mean(input_tensor=entropy)

    name = 'entropies/' + self.name
    dependencies.extend(
        self.summary(label='entropy', name=name, data=fn_summary, step='timesteps'))

    one = tf_util.constant(value=1.0, dtype='float')
    epsilon = tf_util.constant(value=util.epsilon, dtype='float')

    # Deterministic: maximum likelihood action
    definite = tf.argmax(input=action_values, axis=-1)
    definite = tf_util.cast(x=definite, dtype='int')

    # Set logits to minimal value
    min_float = tf.fill(
        dims=tf.shape(input=logits), value=tf_util.get_dtype(type='float').min)
    logits = logits / temperature
    logits = tf.where(condition=(probabilities < epsilon), x=min_float, y=logits)

    # Non-deterministic: sample action using Gumbel distribution
    uniform_distribution = tf.random.uniform(
        shape=tf.shape(input=logits), minval=epsilon, maxval=(one - epsilon),
        dtype=tf_util.get_dtype(type='float'))
    gumbel_distribution = -tf.math.log(x=-tf.math.log(x=uniform_distribution))
    sampled = tf.argmax(input=(logits + gumbel_distribution), axis=-1)
    sampled = tf_util.cast(x=sampled, dtype='int')

    with tf.control_dependencies(control_inputs=dependencies):
        return tf.where(condition=(temperature < epsilon), x=definite, y=sampled)
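# Illustrative sketch (not part of the library): the sampling branch above relies on the
# Gumbel-max trick, i.e. argmax(logits + Gumbel noise) is distributed according to
# softmax(logits). The helper name and the epsilon default below are hypothetical.
import tensorflow as tf

def demo_gumbel_max_sample(logits, epsilon=1e-6):
    # Uniform noise clipped into (epsilon, 1 - epsilon) keeps both logs finite.
    uniform = tf.random.uniform(shape=tf.shape(logits), minval=epsilon, maxval=1.0 - epsilon)
    gumbel = -tf.math.log(-tf.math.log(uniform))
    # One categorical sample per row of logits.
    return tf.argmax(logits + gumbel, axis=-1)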
def fn_sample():
    # Non-deterministic: sample action using gamma distribution
    alpha_sample = tf.random.gamma(
        shape=(), alpha=alpha, dtype=tf_util.get_dtype(type='float'))
    beta_sample = tf.random.gamma(
        shape=(), alpha=beta, dtype=tf_util.get_dtype(type='float'))
    return beta_sample / (alpha_sample + beta_sample)
def sample(self, *, parameters, temperature):
    alpha, beta, alpha_beta, log_norm = parameters.get(
        ('alpha', 'beta', 'alpha_beta', 'log_norm'))

    # Distribution parameter summaries
    def fn_summary():
        return tf.math.reduce_mean(input_tensor=alpha, axis=range(self.action_spec.rank + 1)), \
            tf.math.reduce_mean(input_tensor=beta, axis=range(self.action_spec.rank + 1))

    prefix = 'distributions/' + self.name
    dependencies = self.summary(
        label='distribution', name=(prefix + '-alpha', prefix + '-beta'),
        data=fn_summary, step='timesteps')

    # Entropy summary
    def fn_summary():
        one = tf_util.constant(value=1.0, dtype='float')
        digamma_alpha = tf_util.cast(
            x=tf.math.digamma(x=tf_util.float32(x=alpha)), dtype='float')
        digamma_beta = tf_util.cast(
            x=tf.math.digamma(x=tf_util.float32(x=beta)), dtype='float')
        digamma_alpha_beta = tf_util.cast(
            x=tf.math.digamma(x=tf_util.float32(x=alpha_beta)), dtype='float')
        entropy = log_norm - (beta - one) * digamma_beta - (alpha - one) * digamma_alpha + \
            (alpha_beta - one - one) * digamma_alpha_beta
        return tf.math.reduce_mean(input_tensor=entropy)

    name = 'entropies/' + self.name
    dependencies.extend(
        self.summary(label='entropy', name=name, data=fn_summary, step='timesteps'))

    epsilon = tf_util.constant(value=util.epsilon, dtype='float')

    # Deterministic: mean as action
    definite = beta / alpha_beta

    # Non-deterministic: sample action using gamma distribution
    alpha_sample = tf.random.gamma(
        shape=(), alpha=alpha, dtype=tf_util.get_dtype(type='float'))
    beta_sample = tf.random.gamma(
        shape=(), alpha=beta, dtype=tf_util.get_dtype(type='float'))
    sampled = beta_sample / tf.maximum(x=(alpha_sample + beta_sample), y=epsilon)

    action = tf.where(condition=(temperature < epsilon), x=definite, y=sampled)

    min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float')
    max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float')

    with tf.control_dependencies(control_inputs=dependencies):
        return min_value + (max_value - min_value) * action
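# Illustrative sketch (not part of the library): the two gamma draws above use the identity
# that if X ~ Gamma(a, 1) and Y ~ Gamma(b, 1), then X / (X + Y) ~ Beta(a, b) with mean
# a / (a + b). The helper name below is hypothetical.
import tensorflow as tf

def demo_beta_via_gamma(a, b, num_samples=100000):
    x = tf.random.gamma(shape=(num_samples,), alpha=a)
    y = tf.random.gamma(shape=(num_samples,), alpha=b)
    samples = x / (x + y)
    # Sample mean should be close to a / (a + b).
    return tf.reduce_mean(samples)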
def body(lengths, predecessor_indices, mask):
    previous_index = tf.math.mod(x=(predecessor_indices[:, :1] - one), y=capacity)
    predecessor_indices = tf.concat(values=(previous_index, predecessor_indices), axis=1)
    previous_terminal = tf.gather(params=self.buffers['terminal'], indices=previous_index)
    is_not_terminal = tf.math.logical_and(
        x=tf.math.logical_not(x=tf.math.greater(x=previous_terminal, y=zero)),
        y=mask[:, :1])
    mask = tf.concat(values=(is_not_terminal, mask), axis=1)
    is_not_terminal = tf.squeeze(input=is_not_terminal, axis=1)
    zeros = tf.zeros_like(input=is_not_terminal, dtype=tf_util.get_dtype(type='int'))
    ones = tf.ones_like(input=is_not_terminal, dtype=tf_util.get_dtype(type='int'))
    lengths += tf.where(condition=is_not_terminal, x=ones, y=zeros)
    return lengths, predecessor_indices, mask
def body(lengths, successor_indices, mask):
    current_index = successor_indices[:, -1:]
    current_terminal = tf.gather(params=self.buffers['terminal'], indices=current_index)
    is_not_terminal = tf.math.logical_and(
        x=tf.math.logical_not(x=tf.math.greater(x=current_terminal, y=zero)),
        y=mask[:, -1:])
    next_index = tf.math.mod(x=(current_index + one), y=capacity)
    successor_indices = tf.concat(values=(successor_indices, next_index), axis=1)
    mask = tf.concat(values=(mask, is_not_terminal), axis=1)
    is_not_terminal = tf.squeeze(input=is_not_terminal, axis=1)
    zeros = tf.zeros_like(input=is_not_terminal, dtype=tf_util.get_dtype(type='int'))
    ones = tf.ones_like(input=is_not_terminal, dtype=tf_util.get_dtype(type='int'))
    lengths += tf.where(condition=is_not_terminal, x=ones, y=zeros)
    return lengths, successor_indices, mask
def function(name, spec, a_value):
    advantage_value = a_value.apply(x=embedding)
    if spec.type == 'bool':
        shape = (-1,) + spec.shape + (2,)
    elif spec.type == 'int':
        shape = (-1,) + spec.shape + (spec.num_values,)
    advantage_value = tf.reshape(tensor=advantage_value, shape=shape)
    mean = tf.math.reduce_mean(input_tensor=advantage_value, axis=-1, keepdims=True)
    shape = (-1,) + tuple(1 for _ in range(spec.rank + 1))
    _state_value = tf.reshape(tensor=state_value, shape=shape)
    action_value = _state_value + (advantage_value - mean)
    if spec.type == 'bool':
        return tf.math.maximum(x=action_value[..., 0], y=action_value[..., 1])
    elif spec.type == 'int':
        if self.config.enable_int_action_masking:
            mask = auxiliaries[name]['mask']
            min_float = tf_util.get_dtype(type='float').min
            min_float = tf.fill(dims=tf.shape(input=action_value), value=min_float)
            action_value = tf.where(condition=mask, x=action_value, y=min_float)
        return tf.math.reduce_max(input_tensor=action_value, axis=-1)
def fn_sample():
    # Set logits to minimal value
    min_float = tf.fill(
        dims=tf.shape(input=logits), value=tf_util.get_dtype(type='float').min)
    temp_logits = logits / tf.math.maximum(x=temperature, y=epsilon)
    temp_logits = tf.where(condition=(probabilities < epsilon), x=min_float, y=temp_logits)

    # Non-deterministic: sample action using Gumbel distribution
    one = tf_util.constant(value=1.0, dtype='float')
    uniform_distribution = tf.random.uniform(
        shape=tf.shape(input=temp_logits), minval=epsilon, maxval=(one - epsilon),
        dtype=tf_util.get_dtype(type='float'))
    # Second log numerically stable since log(1 - eps) ~ -eps
    gumbel_distribution = -tf.math.log(x=-tf.math.log(x=uniform_distribution))
    action = tf.math.argmax(input=(temp_logits + gumbel_distribution), axis=-1)
    return tf_util.cast(x=action, dtype='int')
def retrieve_episodes(self, *, n):
    zero = tf_util.constant(value=0, dtype='int')
    one = tf_util.constant(value=1, dtype='int')
    capacity = tf_util.constant(value=self.capacity, dtype='int')

    # Check whether memory contains at least one episode
    assertions = list()
    if self.config.create_tf_assertions:
        assertions.append(
            tf.debugging.assert_greater_equal(x=self.episode_count, y=one))

    # Get start and limit indices for randomly sampled n episodes
    with tf.control_dependencies(control_inputs=assertions):
        n = tf.math.minimum(x=n, y=self.episode_count)
        random_indices = tf.random.uniform(
            shape=(n,), maxval=self.episode_count, dtype=tf_util.get_dtype(type='int'))

        # (Increment terminal of previous episode)
        starts = tf.gather(params=self.terminal_indices, indices=random_indices) + one
        limits = tf.gather(params=self.terminal_indices, indices=(random_indices + one)) + one

        # Correct limit index if smaller than start index
        limits = limits + tf.where(condition=(limits < starts), x=capacity, y=zero)

        # Random episode indices ranges
        indices = tf.ragged.range(starts=starts, limits=limits).values
        indices = tf.math.mod(x=indices, y=capacity)
        return indices
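# Illustrative sketch (not part of the library): tf.ragged.range concatenates per-episode
# index ranges of different lengths into one flat index vector, which is how the sampled
# episode slices above are turned into gather indices. Values below are made up.
import tensorflow as tf

starts = tf.constant([3, 10], dtype=tf.int64)
limits = tf.constant([6, 12], dtype=tf.int64)
indices = tf.ragged.range(starts=starts, limits=limits).values
# indices == [3, 4, 5, 10, 11]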
def retrieve_timesteps(self, *, n, past_horizon, future_horizon):
    one = tf_util.constant(value=1, dtype='int')
    capacity = tf_util.constant(value=self.capacity, dtype='int')

    # Check whether memory contains at least one valid timestep
    num_timesteps = tf.math.minimum(x=self.buffer_index, y=capacity)
    num_timesteps -= (past_horizon + future_horizon)
    num_timesteps = tf.math.maximum(x=num_timesteps, y=self.episode_count)

    # Check whether memory contains at least one timestep
    assertions = list()
    if self.config.create_tf_assertions:
        assertions.append(
            tf.debugging.assert_greater_equal(x=num_timesteps, y=one))

    # Randomly sampled timestep indices
    with tf.control_dependencies(control_inputs=assertions):
        n = tf.math.minimum(x=n, y=num_timesteps)
        indices = tf.random.uniform(
            shape=(n,), maxval=num_timesteps, dtype=tf_util.get_dtype(type='int'))
        indices = tf.math.mod(
            x=(self.buffer_index - one - indices - future_horizon), y=capacity)
        return indices
def function(name, spec, action_value):
    if spec.type == 'bool':

        def fn_summary():
            axis = range(spec.rank + 1)
            values = tf.math.reduce_mean(input_tensor=action_value, axis=axis)
            return [values[0], values[1]]

        if name is None:
            names = ['action-values/true', 'action-values/false']
        else:
            names = ['action-values/' + name + '-true', 'action-values/' + name + '-false']
        dependencies = self.summary(
            label='action-value', name=names, data=fn_summary, step='timesteps')

        def fn_tracking():
            return tf.math.reduce_mean(input_tensor=action_value, axis=0)

        if name is None:
            n = 'action-values'
        else:
            n = name + '-values'
        dependencies = self.track(label='action-value', name=n, data=fn_tracking)

        with tf.control_dependencies(control_inputs=dependencies):
            return (action_value[..., 0] > action_value[..., 1])

    elif spec.type == 'int':

        def fn_summary():
            axis = range(spec.rank + 1)
            values = tf.math.reduce_mean(input_tensor=action_value, axis=axis)
            return [values[n] for n in range(spec.num_values)]

        if name is None:
            prefix = 'action-values/action'
        else:
            prefix = 'action-values/' + name + '-action'
        names = [prefix + str(n) for n in range(spec.num_values)]
        dependencies = self.summary(
            label='action-value', name=names, data=fn_summary, step='timesteps')

        def fn_tracking():
            return tf.math.reduce_mean(input_tensor=action_value, axis=0)

        if name is None:
            n = 'action-values'
        else:
            n = name + '-values'
        dependencies = self.track(label='action-value', name=n, data=fn_tracking)

        with tf.control_dependencies(control_inputs=dependencies):
            if self.config.enable_int_action_masking:
                mask = auxiliaries[name]['mask']
                min_float = tf_util.get_dtype(type='float').min
                min_float = tf.fill(dims=tf.shape(input=action_value), value=min_float)
                action_value = tf.where(condition=mask, x=action_value, y=min_float)
            return tf.math.argmax(input=action_value, axis=-1, output_type=spec.tf_type())
def body(deltas, previous_perturbations):
    with tf.control_dependencies(control_inputs=deltas):
        perturbations = [
            learning_rate * tf.random.normal(
                shape=tf_util.shape(x=variable), dtype=tf_util.get_dtype(type='float'))
            for variable in variables
        ]
        perturbation_deltas = [
            pert - prev_pert
            for pert, prev_pert in zip(perturbations, previous_perturbations)
        ]
        assignments = list()
        for variable, delta in zip(variables, perturbation_deltas):
            assignments.append(variable.assign_add(delta=delta, read_value=False))

    with tf.control_dependencies(control_inputs=assignments):
        perturbed_loss = fn_loss(**arguments.to_kwargs())
        direction = tf.math.sign(x=(unperturbed_loss - perturbed_loss))
        deltas = [
            delta + direction * perturbation
            for delta, perturbation in zip(deltas, perturbations)
        ]

    return deltas, perturbations
def body(deltas, previous_perturbations):
    with tf.control_dependencies(control_inputs=deltas):
        perturbations = list()
        for variable in variables:
            perturbation = tf.random.normal(shape=variable.shape, dtype=variable.dtype)
            if variable.dtype == tf_util.get_dtype(type='float'):
                perturbations.append(learning_rate * perturbation)
            else:
                perturbations.append(
                    tf.cast(x=learning_rate, dtype=variable.dtype) * perturbation)
        perturbation_deltas = [
            pert - prev_pert
            for pert, prev_pert in zip(perturbations, previous_perturbations)
        ]
        assignments = list()
        for variable, delta in zip(variables, perturbation_deltas):
            assignments.append(variable.assign_add(delta=delta, read_value=False))

    with tf.control_dependencies(control_inputs=assignments):
        perturbed_loss = fn_loss(**arguments.to_kwargs())
        one_float = tf_util.constant(value=1.0, dtype='float')
        neg_one_float = tf_util.constant(value=-1.0, dtype='float')
        direction = tf.where(
            condition=(perturbed_loss < unperturbed_loss), x=one_float, y=neg_one_float)
        next_deltas = list()
        for variable, delta, perturbation in zip(variables, deltas, perturbations):
            if variable.dtype == tf_util.get_dtype(type='float'):
                next_deltas.append(delta + direction * perturbation)
            else:
                next_deltas.append(
                    delta + tf.cast(x=direction, dtype=variable.dtype) * perturbation)

    return next_deltas, perturbations
def fn_sample():
    # Non-deterministic: sample true if >= uniform distribution
    # Exp numerically stable since logits <= 0.0
    e_true_logit = tf.math.exp(x=(true_logit / tf.math.maximum(x=temperature, y=epsilon)))
    e_false_logit = tf.math.exp(x=(false_logit / tf.math.maximum(x=temperature, y=epsilon)))
    probability = e_true_logit / tf.math.maximum(x=(e_true_logit + e_false_logit), y=epsilon)
    uniform = tf.random.uniform(
        shape=tf.shape(input=probability), dtype=tf_util.get_dtype(type='float'))
    return tf.greater_equal(x=probability, y=uniform)
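# Illustrative sketch (not part of the library): the two-logit softmax above is equivalent
# to a sigmoid of the tempered logit difference, since
# exp(t) / (exp(t) + exp(f)) == sigmoid(t - f). The helper name below is hypothetical.
import tensorflow as tf

def demo_bernoulli_sample(true_logit, false_logit, temperature=1.0):
    probability = tf.sigmoid((true_logit - false_logit) / temperature)
    uniform = tf.random.uniform(shape=tf.shape(probability))
    return probability >= uniform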
def reset(self):
    zero = tf_util.constant(value=0, dtype='int')
    one = tf_util.constant(value=1, dtype='int')
    three = tf_util.constant(value=3, dtype='int')
    capacity = tf_util.constant(value=self.capacity, dtype='int')
    last_index = tf.math.mod(x=(self.buffer_index - one), y=capacity)

    def correct_terminal():
        # Replace last observation terminal marker with abort terminal
        dependencies = list()
        two = tf_util.constant(value=2, dtype='int')
        sparse_delta = tf.IndexedSlices(values=two, indices=last_index)
        dependencies.append(self.buffers['terminal'].scatter_update(sparse_delta=sparse_delta))
        sparse_delta = tf.IndexedSlices(values=last_index, indices=(self.episode_count + one))
        dependencies.append(self.terminal_indices.scatter_update(sparse_delta=sparse_delta))
        with tf.control_dependencies(control_inputs=dependencies):
            return self.episode_count.assign_add(delta=one, read_value=False)

    last_terminal = tf.gather(params=self.buffers['terminal'], indices=last_index)
    is_incorrect = tf.math.equal(x=last_terminal, y=three)
    corrected = tf.cond(pred=is_incorrect, true_fn=correct_terminal, false_fn=tf.no_op)

    with tf.control_dependencies(control_inputs=(corrected,)):
        assertions = [corrected]
        if self.config.create_tf_assertions:
            # general check: all terminal indices true
            assertions.append(tf.debugging.assert_equal(
                x=tf.reduce_all(input_tensor=tf.gather(
                    params=tf.math.greater(x=self.buffers['terminal'], y=zero),
                    indices=self.terminal_indices[:self.episode_count + one])),
                y=tf_util.constant(value=True, dtype='bool'),
                message="Memory consistency check."))
            # general check: only terminal indices true
            assertions.append(tf.debugging.assert_equal(
                x=tf.math.count_nonzero(
                    input=self.buffers['terminal'], dtype=tf_util.get_dtype(type='int')),
                y=(self.episode_count + one),
                message="Memory consistency check."))

    with tf.control_dependencies(control_inputs=assertions):
        return one < zero
def function(name, spec, a_value):
    action_value = a_value.apply(x=embedding)
    if spec.type == 'bool':
        shape = (-1,) + spec.shape + (2,)
    elif spec.type == 'int':
        shape = (-1,) + spec.shape + (spec.num_values,)
    action_value = tf.reshape(tensor=action_value, shape=shape)
    if spec.type == 'bool':
        return tf.math.maximum(x=action_value[..., 0], y=action_value[..., 1])
    elif spec.type == 'int':
        mask = auxiliaries[name]['mask']
        min_float = tf_util.get_dtype(type='float').min
        min_float = tf.fill(dims=tf.shape(input=action_value), value=min_float)
        action_value = tf.where(condition=mask, x=action_value, y=min_float)
        return tf.math.reduce_max(input_tensor=action_value, axis=-1)
def __init__(self, *, layer, l2_regularization=None, name=None, input_spec=None, **kwargs):
    super().__init__(l2_regularization=l2_regularization, name=name, input_spec=input_spec)

    self.keras_layer = getattr(tf.keras.layers, layer)(
        name=name, dtype=tf_util.get_dtype(type='float'), input_shape=input_spec.shape,
        **kwargs)
def sample(self, *, parameters, temperature):
    true_logit, false_logit, probability = parameters.get(
        ('true_logit', 'false_logit', 'probability'))

    # Distribution parameter summaries
    def fn_summary():
        axis = range(self.action_spec.rank + 1)
        return tf.math.reduce_mean(input_tensor=probability, axis=axis)

    name = 'distributions/' + self.name + '-probability'
    dependencies = self.summary(
        label='distribution', name=name, data=fn_summary, step='timesteps')

    # Entropy summary
    def fn_summary():
        one = tf_util.constant(value=1.0, dtype='float')
        entropy = -probability * true_logit - (one - probability) * false_logit
        return tf.math.reduce_mean(input_tensor=entropy)

    name = 'entropies/' + self.name
    dependencies.extend(
        self.summary(label='entropy', name=name, data=fn_summary, step='timesteps'))

    half = tf_util.constant(value=0.5, dtype='float')
    epsilon = tf_util.constant(value=util.epsilon, dtype='float')

    # Deterministic: true if >= 0.5
    definite = tf.greater_equal(x=probability, y=half)

    # Non-deterministic: sample true if >= uniform distribution
    e_true_logit = tf.math.exp(x=(true_logit / temperature))
    e_false_logit = tf.math.exp(x=(false_logit / temperature))
    probability = e_true_logit / (e_true_logit + e_false_logit)
    uniform = tf.random.uniform(
        shape=tf.shape(input=probability), dtype=tf_util.get_dtype(type='float'))
    sampled = tf.greater_equal(x=probability, y=uniform)

    with tf.control_dependencies(control_inputs=dependencies):
        return tf.where(condition=(temperature < epsilon), x=definite, y=sampled)
def negate_deltas():
    neg_two_float = tf_util.constant(value=-2.0, dtype='float')
    assignments = list()
    for variable, delta in zip(variables, deltas):
        if variable.dtype == tf_util.get_dtype(type='float'):
            assignments.append(
                variable.assign_add(delta=(neg_two_float * delta), read_value=False))
        else:
            _neg_two_float = tf.constant(value=-2.0, dtype=variable.dtype)
            assignments.append(
                variable.assign_add(delta=(_neg_two_float * delta), read_value=False))
    with tf.control_dependencies(control_inputs=assignments):
        return [tf.math.negative(x=delta) for delta in deltas]
def parametrize(self, *, x, conditions):
    epsilon = tf_util.constant(value=util.epsilon, dtype='float')
    shape = (-1,) + self.action_spec.shape + (self.action_spec.num_values,)

    # Action values
    action_values = self.action_values.apply(x=x)
    action_values = tf.reshape(tensor=action_values, shape=shape)

    # States value
    if self.state_value is None:
        # Implicit states value (TODO: experimental)
        states_value = tf.reduce_logsumexp(input_tensor=action_values, axis=-1)
    else:
        # Explicit states value and advantage-based action values
        states_value = self.state_value.apply(x=x)
        states_value = tf.reshape(tensor=states_value, shape=shape[:-1])
        action_values = tf.expand_dims(input=states_value, axis=-1) + action_values
        action_values -= tf.math.reduce_mean(input_tensor=action_values, axis=-1, keepdims=True)

    # Masking (TODO: before or after above?)
    if self.config.enable_int_action_masking:
        min_float = tf.fill(
            dims=tf.shape(input=action_values), value=tf_util.get_dtype(type='float').min)
        action_values = tf.where(condition=conditions['mask'], x=action_values, y=min_float)

    # Softmax for corresponding probabilities
    probabilities = tf.nn.softmax(logits=action_values, axis=-1)

    # "Normalized" logits
    logits = tf.math.log(x=tf.maximum(x=probabilities, y=epsilon))

    return TensorDict(
        logits=logits, probabilities=probabilities, states_value=states_value,
        action_values=action_values)
def function(name, spec, a_value):
    if name is None:
        x = embedding.get('action-embedding', embedding['embedding'])
    else:
        x = embedding.get(name + '-embedding', embedding['embedding'])
    action_value = a_value.apply(x=x)
    if spec.type == 'bool':
        shape = (-1,) + spec.shape + (2,)
    elif spec.type == 'int':
        shape = (-1,) + spec.shape + (spec.num_values,)
    action_value = tf.reshape(tensor=action_value, shape=shape)
    if spec.type == 'bool':
        return tf.math.maximum(x=action_value[..., 0], y=action_value[..., 1])
    elif spec.type == 'int':
        if self.config.enable_int_action_masking:
            mask = auxiliaries[name]['mask']
            min_float = tf_util.get_dtype(type='float').min
            min_float = tf.fill(dims=tf.shape(input=action_value), value=min_float)
            action_value = tf.where(condition=mask, x=action_value, y=min_float)
        return tf.math.reduce_max(input_tensor=action_value, axis=-1)
def subsampled_step():
    subsampled_arguments = TensorDict()
    indices = tf.random.uniform(
        shape=(fraction,), maxval=batch_size, dtype=tf_util.get_dtype(type='int'))

    if 'states' in arguments and 'horizons' in arguments:
        horizons = tf.gather(params=arguments['horizons'], indices=indices)
        starts = horizons[:, 0]
        lengths = horizons[:, 1]
        states_indices = tf.ragged.range(starts=starts, limits=(starts + lengths)).values
        function = (lambda x: tf.gather(params=x, indices=states_indices))
        subsampled_arguments['states'] = arguments['states'].fmap(function=function)
        starts = tf.math.cumsum(x=lengths, exclusive=True)
        subsampled_arguments['horizons'] = tf.stack(values=(starts, lengths), axis=1)

    for name, argument in arguments.items():
        if name not in subsampled_arguments:
            subsampled_arguments[name] = tf.gather(params=argument, indices=indices)

    return self.optimizer.step(arguments=subsampled_arguments, **kwargs)
def __init__(self, *, layer, l2_regularization=None, name=None, input_spec=None, **kwargs):
    super().__init__(l2_regularization=l2_regularization, name=name, input_spec=input_spec)

    self.keras_layer = getattr(tf.keras.layers, layer)(
        name=name, dtype=tf_util.get_dtype(type='float'), input_shape=input_spec.shape,
        **kwargs)

    self.architecture_kwargs['layer'] = str(layer)
    if l2_regularization is not None:
        self.architecture_kwargs['l2_regularization'] = str(l2_regularization)
def sample(self, *, parameters, temperature):
    mean, stddev, log_stddev = parameters.get(('mean', 'stddev', 'log_stddev'))

    # Distribution parameter and entropy summaries
    def fn_summary():
        return tf.math.reduce_mean(input_tensor=mean, axis=range(self.action_spec.rank + 1)), \
            tf.math.reduce_mean(input_tensor=stddev, axis=range(self.action_spec.rank + 1))

    prefix = 'distributions/' + self.name
    dependencies = self.summary(
        label='distribution', name=(prefix + '-mean', prefix + '-stddev'),
        data=fn_summary, step='timesteps')

    # Entropy summary
    def fn_summary():
        half_log_two_pi_e = tf_util.constant(
            value=(0.5 * np.log(2.0 * np.pi * np.e)), dtype='float')
        entropy = log_stddev + half_log_two_pi_e
        return tf.math.reduce_mean(input_tensor=entropy)

    name = 'entropies/' + self.name
    dependencies.extend(
        self.summary(label='entropy', name=name, data=fn_summary, step='timesteps'))

    normal_distribution = tf.random.normal(
        shape=tf.shape(input=mean), dtype=tf_util.get_dtype(type='float'))

    with tf.control_dependencies(control_inputs=dependencies):
        action = mean + stddev * temperature * normal_distribution

        # Bounded transformation
        if self.bounded_transform is not None:
            if self.bounded_transform == 'tanh':
                action = tf.math.tanh(x=action)
            if self.action_spec.min_value is not None and \
                    self.action_spec.max_value is not None:
                one = tf_util.constant(value=1.0, dtype='float')
                half = tf_util.constant(value=0.5, dtype='float')
                min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float')
                max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float')
                action = min_value + (max_value - min_value) * half * (action + one)
            elif self.action_spec.min_value is not None:
                min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float')
                action = tf.maximum(x=min_value, y=action)
            else:
                assert self.action_spec.max_value is not None
                max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float')
                action = tf.minimum(x=max_value, y=action)

        return action
def parametrize(self, *, x, conditions):
    epsilon = tf_util.constant(value=util.epsilon, dtype='float')
    log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float')
    log_two = tf_util.constant(value=np.log(2.0), dtype='float')

    # Action values
    action_values = self.action_values.apply(x=x)
    shape = (-1,) + self.action_spec.shape + (self.action_spec.num_values,)
    action_values = tf.reshape(tensor=action_values, shape=shape)

    # Softplus temperature
    if self.temperature_mode == 'global':
        multiples = (tf.shape(input=x)[0],) + tuple(1 for _ in range(self.action_spec.rank + 1))
        softplus_temperature = tf.tile(input=self.softplus_temperature, multiples=multiples)
    elif self.temperature_mode == 'predicted':
        softplus_temperature = self.softplus_temperature.apply(x=x)
        shape = (-1,) + self.action_spec.shape + (1,)
        softplus_temperature = tf.reshape(tensor=softplus_temperature, shape=shape)

    if self.temperature_mode is None:
        # Logits
        logits = action_values

        # Implicit states value
        state_value = tf.reduce_logsumexp(input_tensor=logits, axis=-1)

    else:
        # Clip softplus_temperature for numerical stability (epsilon < 1.0, hence negative)
        softplus_temperature = tf.clip_by_value(
            t=softplus_temperature, clip_value_min=log_epsilon, clip_value_max=-log_epsilon)

        # Softplus transformation (based on https://arxiv.org/abs/2007.06059)
        softplus_shift = tf_util.constant(value=0.2, dtype='float')
        temperature = (tf.nn.softplus(features=softplus_temperature) + softplus_shift) / \
            (log_two + softplus_shift)

        # Logits
        logits = action_values / temperature

        # Implicit states value
        temperature = tf.squeeze(input=temperature, axis=-1)
        state_value = temperature * tf.reduce_logsumexp(input_tensor=logits, axis=-1)

    # # Explicit states value and advantage-based action values
    # state_value = self.state_value.apply(x=x)
    # state_value = tf.reshape(tensor=state_value, shape=shape[:-1])
    # action_values = tf.expand_dims(input=state_value, axis=-1) + action_values
    # action_values -= tf.math.reduce_mean(input_tensor=action_values, axis=-1, keepdims=True)

    # Action masking, affects action_values/probabilities/logits but not state_value
    if self.config.enable_int_action_masking:
        min_float = tf.fill(
            dims=tf.shape(input=action_values), value=tf_util.get_dtype(type='float').min)
        action_values = tf.where(condition=conditions['mask'], x=action_values, y=min_float)
        logits = tf.where(condition=conditions['mask'], x=logits, y=min_float)

    # Softmax for corresponding probabilities
    probabilities = tf.nn.softmax(logits=logits, axis=-1)

    # "Normalized" logits
    logits = tf.math.log(x=(probabilities + epsilon))
    # Unstable
    # logits = tf.nn.log_softmax(logits=logits, axis=-1)
    # Doesn't take masking into account
    # logits = action_values - tf.expand_dims(input=state_value, axis=-1) ... / temperature

    if self.temperature_mode is None:
        return TensorDict(
            probabilities=probabilities, logits=logits, action_values=action_values,
            state_value=state_value)
    else:
        return TensorDict(
            probabilities=probabilities, temperature=temperature, logits=logits,
            action_values=action_values, state_value=state_value)
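# Illustrative sketch (not part of the library): the softplus transformation above maps a raw
# parameter of 0 to a temperature of 1, since softplus(0) = log(2), so
# (softplus(0) + shift) / (log(2) + shift) == 1. Plain NumPy check under that assumption.
import numpy as np

softplus_shift = 0.2
raw = 0.0
temperature = (np.log1p(np.exp(raw)) + softplus_shift) / (np.log(2.0) + softplus_shift)
assert np.isclose(temperature, 1.0)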
def observe(self, *, terminal, reward, parallel):
    zero = tf_util.constant(value=0, dtype='int')
    one = tf_util.constant(value=1, dtype='int')
    batch_size = tf_util.cast(x=tf.shape(input=terminal)[0], dtype='int')
    expanded_parallel = tf.expand_dims(input=tf.expand_dims(input=parallel, axis=0), axis=1)
    is_terminal = tf.math.greater(x=terminal[-1], y=zero)

    # Input assertions
    assertions = list()
    if self.config.create_tf_assertions:
        assertions.extend(self.terminal_spec.tf_assert(
            x=terminal, batch_size=batch_size,
            message='Agent.observe: invalid {issue} for terminal input.'))
        assertions.extend(self.reward_spec.tf_assert(
            x=reward, batch_size=batch_size,
            message='Agent.observe: invalid {issue} for reward input.'))
        assertions.extend(self.parallel_spec.tf_assert(
            x=parallel, message='Agent.observe: invalid {issue} for parallel input.'))
        # Assertion: at most one terminal
        num_terms = tf.math.count_nonzero(input=terminal, dtype=tf_util.get_dtype(type='int'))
        assertions.append(tf.debugging.assert_less_equal(
            x=num_terms, y=one,
            message="Agent.observe: input contains more than one terminal."))
        # Assertion: if terminal, last timestep in batch
        assertions.append(tf.debugging.assert_equal(
            x=tf.math.greater(x=num_terms, y=zero), y=is_terminal,
            message="Agent.observe: terminal is not the last input timestep."))

    with tf.control_dependencies(control_inputs=assertions):
        dependencies = list()

        # Reward summary
        if self.summaries == 'all' or 'reward' in self.summaries:
            with self.summarizer.as_default():
                x = tf.math.reduce_mean(input_tensor=reward)
                dependencies.append(
                    tf.summary.scalar(name='reward', data=x, step=self.timesteps))

        # Update episode length/reward
        updates = tf.expand_dims(input=batch_size, axis=0)
        value = tf.tensor_scatter_nd_add(
            tensor=self.episode_length, indices=expanded_parallel, updates=updates)
        dependencies.append(self.episode_length.assign(value=value))
        # sparse_delta = tf.IndexedSlices(values=batch_size, indices=parallel)
        # dependencies.append(self.episode_length.scatter_add(sparse_delta=sparse_delta))
        sum_reward = tf.math.reduce_sum(input_tensor=reward, keepdims=True)
        value = tf.tensor_scatter_nd_add(
            tensor=self.episode_reward, indices=expanded_parallel, updates=sum_reward)
        dependencies.append(self.episode_reward.assign(value=value))
        # sum_reward = tf.math.reduce_sum(input_tensor=reward)
        # sparse_delta = tf.IndexedSlices(values=sum_reward, indices=parallel)
        # dependencies.append(self.episode_reward.scatter_add(sparse_delta=sparse_delta))

        # Core observe (before terminal handling)
        updated = self.core_observe(terminal=terminal, reward=reward, parallel=parallel)
        dependencies.append(updated)

        # Handle terminal (after core observe and episode reward)
        with tf.control_dependencies(control_inputs=dependencies):

            def fn_terminal():
                operations = list()

                # Reset internals
                def function(spec, initial):
                    return tf_util.constant(value=initial, dtype=spec.type)

                initials = self.internals_spec.fmap(
                    function=function, cls=TensorDict, zip_values=self.initial_internals)
                for name, previous, initial in self.previous_internals.zip_items(initials):
                    updates = tf.expand_dims(input=initial, axis=0)
                    value = tf.tensor_scatter_nd_update(
                        tensor=previous, indices=expanded_parallel, updates=updates)
                    operations.append(previous.assign(value=value))
                    # sparse_delta = tf.IndexedSlices(values=initial, indices=parallel)
                    # operations.append(previous.scatter_update(sparse_delta=sparse_delta))

                # Episode length/reward summaries (before episode reward reset / episodes increment)
                dependencies = list()
                if self.summaries == 'all' or 'reward' in self.summaries:
                    with self.summarizer.as_default():
                        x = tf.gather(params=self.episode_length, indices=parallel)
                        dependencies.append(
                            tf.summary.scalar(name='episode-length', data=x, step=self.episodes))
                        x = tf.gather(params=self.episode_reward, indices=parallel)
                        dependencies.append(
                            tf.summary.scalar(name='episode-reward', data=x, step=self.episodes))

                # Reset episode length/reward
                with tf.control_dependencies(control_inputs=dependencies):
                    zeros = tf_util.zeros(shape=(1,), dtype='int')
                    value = tf.tensor_scatter_nd_update(
                        tensor=self.episode_length, indices=expanded_parallel, updates=zeros)
                    operations.append(self.episode_length.assign(value=value))
                    # sparse_delta = tf.IndexedSlices(values=zero, indices=parallel)
                    # operations.append(self.episode_length.scatter_update(sparse_delta=sparse_delta))
                    zeros = tf_util.zeros(shape=(1,), dtype='float')
                    value = tf.tensor_scatter_nd_update(
                        tensor=self.episode_reward, indices=expanded_parallel, updates=zeros)
                    operations.append(self.episode_reward.assign(value=value))
                    # zero_float = tf_util.constant(value=0.0, dtype='float')
                    # sparse_delta = tf.IndexedSlices(values=zero_float, indices=parallel)
                    # operations.append(self.episode_reward.scatter_update(sparse_delta=sparse_delta))

                    # Increment episodes counter
                    operations.append(self.episodes.assign_add(delta=one, read_value=False))

                return tf.group(*operations)

            handle_terminal = tf.cond(pred=is_terminal, true_fn=fn_terminal, false_fn=tf.no_op)

            with tf.control_dependencies(control_inputs=(handle_terminal,)):
                episodes = tf_util.identity(input=self.episodes)
                updates = tf_util.identity(input=self.updates)
                return updated, episodes, updates
def enqueue(self, *, states, internals, auxiliaries, actions, terminal, reward):
    zero = tf_util.constant(value=0, dtype='int')
    one = tf_util.constant(value=1, dtype='int')
    three = tf_util.constant(value=3, dtype='int')
    capacity = tf_util.constant(value=self.capacity, dtype='int')
    num_timesteps = tf_util.cast(x=tf.shape(input=terminal)[0], dtype='int')
    last_index = tf.math.mod(x=(self.buffer_index - one), y=capacity)

    def correct_terminal():
        # Remove last observation terminal marker
        sparse_delta = tf.IndexedSlices(values=zero, indices=last_index)
        assignment = self.buffers['terminal'].scatter_update(sparse_delta=sparse_delta)
        with tf.control_dependencies(control_inputs=(assignment,)):
            return last_index < zero

    last_terminal = tf.gather(params=self.buffers['terminal'], indices=last_index)
    is_incorrect = tf.math.equal(x=last_terminal, y=three)
    corrected = tf.cond(pred=is_incorrect, true_fn=correct_terminal, false_fn=tf.no_op)

    # Assertions
    last_terminal = tf.concat(values=([zero], terminal), axis=0)[-1]
    assertions = [corrected]
    if self.config.create_tf_assertions:
        with tf.control_dependencies(control_inputs=(corrected,)):
            # check: number of timesteps fit into effectively available buffer
            assertions.append(tf.debugging.assert_less_equal(
                x=num_timesteps, y=capacity,
                message="Memory does not have enough capacity."))
            # at most one terminal
            assertions.append(tf.debugging.assert_less_equal(
                x=tf.math.count_nonzero(input=terminal, dtype=tf_util.get_dtype(type='int')),
                y=one, message="Timesteps contain more than one terminal."))
            # if terminal, last timestep in batch
            assertions.append(tf.debugging.assert_equal(
                x=tf.math.reduce_any(input_tensor=tf.math.greater(x=terminal, y=zero)),
                y=tf.math.greater(x=last_terminal, y=zero),
                message="Terminal is not the last timestep."))
            # general check: all terminal indices true
            assertions.append(tf.debugging.assert_equal(
                x=tf.reduce_all(input_tensor=tf.gather(
                    params=tf.math.greater(x=self.buffers['terminal'], y=zero),
                    indices=self.terminal_indices[:self.episode_count + one])),
                y=tf_util.constant(value=True, dtype='bool'),
                message="Memory consistency check."))
            # general check: only terminal indices true
            assertions.append(tf.debugging.assert_equal(
                x=tf.math.count_nonzero(
                    input=self.buffers['terminal'], dtype=tf_util.get_dtype(type='int')),
                y=(self.episode_count + one),
                message="Memory consistency check."))

    # Buffer indices to overwrite
    with tf.control_dependencies(control_inputs=assertions):
        overwritten_indices = tf.range(
            start=self.buffer_index, limit=(self.buffer_index + num_timesteps))
        overwritten_indices = tf.math.mod(x=overwritten_indices, y=capacity)

        # Count number of overwritten episodes
        num_episodes = tf.math.count_nonzero(
            input=tf.gather(params=self.buffers['terminal'], indices=overwritten_indices),
            axis=0, dtype=tf_util.get_dtype(type='int'))

        # Shift remaining terminal indices accordingly
        index = self.episode_count + one
        assertions = list()
        if self.config.create_tf_assertions:
            assertions.append(tf.debugging.assert_greater_equal(
                x=index, y=num_episodes, message="Memory episode overwriting check."))

    with tf.control_dependencies(control_inputs=assertions):
        sparse_delta = tf.IndexedSlices(
            values=self.terminal_indices[num_episodes:index],
            indices=tf.range(index - num_episodes))
        assignment = self.terminal_indices.scatter_update(sparse_delta=sparse_delta)

    # Decrement episode count accordingly
    with tf.control_dependencies(control_inputs=(assignment,)):
        assignment = self.episode_count.assign_sub(delta=num_episodes, read_value=False)

    # Write new observations
    with tf.control_dependencies(control_inputs=(assignment,)):
        # Add last observation terminal marker
        corrected_terminal = tf.where(
            condition=tf.math.equal(x=terminal[-1:], y=zero), x=three, y=terminal[-1:])
        corrected_terminal = tf.concat(values=(terminal[:-1], corrected_terminal), axis=0)
        values = TensorDict(
            states=states, internals=internals, auxiliaries=auxiliaries, actions=actions,
            terminal=corrected_terminal, reward=reward)
        indices = tf.range(start=self.buffer_index, limit=(self.buffer_index + num_timesteps))
        indices = tf.math.mod(x=indices, y=capacity)

        def function(buffer, value):
            sparse_delta = tf.IndexedSlices(values=value, indices=indices)
            return buffer.scatter_update(sparse_delta=sparse_delta)

        assignments = self.buffers.fmap(function=function, cls=list, zip_values=values)

    # Increment buffer index
    with tf.control_dependencies(control_inputs=assignments):
        assignment = self.buffer_index.assign_add(delta=num_timesteps, read_value=False)

    # Count number of new episodes
    with tf.control_dependencies(control_inputs=(assignment,)):
        num_new_episodes = tf.math.count_nonzero(
            input=terminal, dtype=tf_util.get_dtype(type='int'))

        # Write new terminal indices
        new_terminal_indices = tf.boolean_mask(
            tensor=overwritten_indices, mask=tf.math.greater(x=terminal, y=zero))
        start = self.episode_count + one
        sparse_delta = tf.IndexedSlices(
            values=new_terminal_indices,
            indices=tf.range(start=start, limit=(start + num_new_episodes)))
        assignment = self.terminal_indices.scatter_update(sparse_delta=sparse_delta)

    # Increment episode count accordingly
    with tf.control_dependencies(control_inputs=(assignment,)):
        assignment = self.episode_count.assign_add(delta=num_new_episodes)
        return assignment < zero
def successors(self, *, indices, horizon, sequence_values, final_values):
    assert isinstance(sequence_values, tuple)
    assert isinstance(final_values, tuple)

    zero = tf_util.constant(value=0, dtype='int')
    one = tf_util.constant(value=1, dtype='int')
    capacity = tf_util.constant(value=self.capacity, dtype='int')

    def body(lengths, successor_indices, mask):
        current_index = successor_indices[:, -1:]
        current_terminal = tf.gather(params=self.buffers['terminal'], indices=current_index)
        is_not_terminal = tf.math.logical_and(
            x=tf.math.logical_not(x=tf.math.greater(x=current_terminal, y=zero)),
            y=mask[:, -1:])
        next_index = tf.math.mod(x=(current_index + one), y=capacity)
        successor_indices = tf.concat(values=(successor_indices, next_index), axis=1)
        mask = tf.concat(values=(mask, is_not_terminal), axis=1)
        is_not_terminal = tf.squeeze(input=is_not_terminal, axis=1)
        zeros = tf.zeros_like(input=is_not_terminal, dtype=tf_util.get_dtype(type='int'))
        ones = tf.ones_like(input=is_not_terminal, dtype=tf_util.get_dtype(type='int'))
        lengths += tf.where(condition=is_not_terminal, x=ones, y=zeros)
        return lengths, successor_indices, mask

    lengths = tf.ones_like(input=indices, dtype=tf_util.get_dtype(type='int'))
    successor_indices = tf.expand_dims(input=indices, axis=1)
    mask = tf.ones_like(input=successor_indices, dtype=tf_util.get_dtype(type='bool'))
    shape = tf.TensorShape(dims=(None, None))

    lengths, successor_indices, mask = tf.while_loop(
        cond=tf_util.always_true, body=body, loop_vars=(lengths, successor_indices, mask),
        shape_invariants=(lengths.get_shape(), shape, shape),
        maximum_iterations=tf_util.int32(x=horizon))

    successor_indices = tf.reshape(tensor=successor_indices, shape=(-1,))
    mask = tf.reshape(tensor=mask, shape=(-1,))
    successor_indices = tf.boolean_mask(tensor=successor_indices, mask=mask, axis=0)

    assertions = list()
    if self.config.create_tf_assertions:
        assertions.append(tf.debugging.assert_greater_equal(
            x=tf.math.mod(x=(self.buffer_index - one - successor_indices), y=capacity),
            y=zero, message="Successor check."))

    with tf.control_dependencies(control_inputs=assertions):
        function = (lambda buffer: tf.gather(params=buffer, indices=successor_indices))
        values = self.buffers[sequence_values].fmap(function=function, cls=TensorDict)
        sequence_values = tuple(values[name] for name in sequence_values)

        starts = tf.math.cumsum(x=lengths, exclusive=True)
        ends = tf.math.cumsum(x=lengths) - one
        final_indices = tf.gather(params=successor_indices, indices=ends)
        function = (lambda buffer: tf.gather(params=buffer, indices=final_indices))
        values = self.buffers[final_values].fmap(function=function, cls=TensorDict)
        final_values = tuple(values[name] for name in final_values)

        if len(sequence_values) == 0:
            if len(final_values) == 0:
                return lengths
            else:
                return lengths, final_values
        elif len(final_values) == 0:
            return tf.stack(values=(starts, lengths), axis=1), sequence_values
        else:
            return tf.stack(values=(starts, lengths), axis=1), sequence_values, final_values
def fn_sample():
    normal_distribution = tf.random.normal(
        shape=tf.shape(input=mean), dtype=tf_util.get_dtype(type='float'))
    return mean + stddev * temperature * normal_distribution
def core_act(self, *, states, internals, auxiliaries, parallel, deterministic, independent):
    assert len(internals) == 0

    actions = TensorDict()
    for name, spec in self.actions_spec.items():
        shape = tf.concat(values=(
            tf_util.cast(x=tf.shape(input=states.value())[:1], dtype='int'),
            tf_util.constant(value=spec.shape, dtype='int')), axis=0)

        if spec.type == 'bool':
            # Random bool action: uniform[True, False]
            half = tf_util.constant(value=0.5, dtype='float')
            uniform = tf.random.uniform(shape=shape, dtype=tf_util.get_dtype(type='float'))
            actions[name] = (uniform < half)

        elif self.config.enable_int_action_masking and spec.type == 'int' and \
                spec.num_values is not None:
            # Random masked action: uniform[unmasked]
            # (Similar code as for Model.apply_exploration)
            mask = auxiliaries[name]['mask']
            choices = tf_util.constant(
                value=list(range(spec.num_values)), dtype=spec.type,
                shape=(tuple(1 for _ in spec.shape) + (1, spec.num_values)))
            one = tf_util.constant(value=1, dtype='int', shape=(1,))
            multiples = tf.concat(values=(shape, one), axis=0)
            choices = tf.tile(input=choices, multiples=multiples)
            choices = tf.boolean_mask(tensor=choices, mask=mask)
            mask = tf_util.cast(x=mask, dtype='int')
            num_valid = tf.math.reduce_sum(input_tensor=mask, axis=(spec.rank + 1))
            num_valid = tf.reshape(tensor=num_valid, shape=(-1,))
            masked_offset = tf.math.cumsum(x=num_valid, axis=0, exclusive=True)
            uniform = tf.random.uniform(shape=shape, dtype=tf_util.get_dtype(type='float'))
            uniform = tf.reshape(tensor=uniform, shape=(-1,))
            num_valid = tf_util.cast(x=num_valid, dtype='float')
            random_offset = tf.dtypes.cast(x=(uniform * num_valid), dtype=tf.dtypes.int64)
            action = tf.gather(params=choices, indices=(masked_offset + random_offset))
            actions[name] = tf.reshape(tensor=action, shape=shape)

        elif spec.type != 'bool' and spec.min_value is not None:
            if spec.max_value is not None:
                # Random bounded action: uniform[min_value, max_value]
                actions[name] = tf.random.uniform(
                    shape=shape, minval=spec.min_value, maxval=spec.max_value,
                    dtype=spec.tf_type())
            else:
                # Random left-bounded action: not implemented
                raise NotImplementedError

        elif spec.type != 'bool' and spec.max_value is not None:
            # Random right-bounded action: not implemented
            raise NotImplementedError

        else:
            # Random unbounded int/float action
            actions[name] = tf.random.normal(shape=shape, dtype=spec.tf_type())

    return actions, TensorDict()
def update(self, *, arguments, variables, **kwargs):
    assert self.is_initialized_given_variables
    assert all(variable.dtype.is_floating for variable in variables)

    deltas = self.step(arguments=arguments, variables=variables, **kwargs)

    assertions = list(deltas)
    if self.config.create_debug_assertions:
        from tensorforce.core.optimizers import DoublecheckStep, NaturalGradient, \
            Synchronization, UpdateModifier
        optimizer = self
        while isinstance(optimizer, UpdateModifier):
            if isinstance(optimizer, DoublecheckStep):
                break
            optimizer = optimizer.optimizer
        if not isinstance(optimizer, DoublecheckStep) and (
            not isinstance(optimizer, NaturalGradient) or not optimizer.only_positive_updates
        ) and (not isinstance(self, Synchronization) or self.sync_frequency is None):
            for delta, variable in zip(deltas, variables):
                if '_distribution/mean/linear/' in variable.name:
                    # Gaussian.state_value does not use mean
                    continue
                # if variable.name.endswith('/bias:0') and isinstance(self, Synchronization) \
                #         and self.root.updates.numpy() == 0:
                #     # Initialization values are equivalent for bias
                #     continue
                assertions.append(tf.debugging.assert_equal(
                    x=tf.math.logical_or(
                        x=tf.math.reduce_all(input_tensor=tf.math.greater(
                            x=tf.math.count_nonzero(
                                input=delta, dtype=tf_util.get_dtype(type='int')),
                            y=tf_util.constant(value=0, dtype='int'))),
                        y=tf.reduce_all(input_tensor=tf.math.equal(
                            x=arguments['reward'],
                            y=tf_util.constant(value=0.0, dtype='float')))),
                    y=tf_util.constant(value=True, dtype='bool'),
                    message=variable.name))

    with tf.control_dependencies(control_inputs=assertions):
        dependencies = list()

        if self.root.summaries == 'all' or 'update-norm' in self.root.summaries:
            with self.root.summarizer.as_default():
                x = tf.linalg.global_norm(t_list=[
                    tf_util.cast(x=delta, dtype='float') for delta in deltas])
                dependencies.append(
                    tf.summary.scalar(name='update-norm', data=x, step=self.root.updates))

        if self.root.summaries == 'all' or 'updates' in self.root.summaries:
            with self.root.summarizer.as_default():
                for var in variables:
                    assert var.name.startswith(self.root.name + '/') and var.name[-2:] == ':0'
                    mean_name = var.name[len(self.root.name) + 1:-2] + '-mean'
                    var_name = var.name[len(self.root.name) + 1:-2] + '-variance'
                    mean, variance = tf.nn.moments(x=var, axes=list(range(tf_util.rank(x=var))))
                    dependencies.append(
                        tf.summary.scalar(name=mean_name, data=mean, step=self.root.updates))
                    dependencies.append(
                        tf.summary.scalar(name=var_name, data=variance, step=self.root.updates))

    with tf.control_dependencies(control_inputs=dependencies):
        return tf_util.identity(input=tf_util.constant(value=True, dtype='bool'))