def get_output_spec(self, input_spec):
    """
    Derives the output specification of the reshape from its input specification.

    Args:
        input_spec: Specification dict with a 'shape' entry; it is updated in place.

    Returns:
        The same ``input_spec`` object, with 'shape' replaced by ``self.shape``.

    Raises:
        TensorforceError: If ``util.product`` of the input shape differs from
            ``util.product`` of ``self.shape``.
    """
    source_size = util.product(xs=input_spec['shape'])
    target_size = util.product(xs=self.shape)
    if source_size == target_size:
        # The caller's dict is mutated rather than copied.
        input_spec['shape'] = self.shape
        return input_spec
    raise TensorforceError.value(name='Reshape', argument='shape', value=self.shape)
def tf_kl_divergence(self, states, internals, auxiliaries, other=None, reduced=True,
                     include_per_action=False):
    """
    Combines the per-action KL divergences from ``self.kl_divergences`` into one tensor.

    Args:
        states: Forwarded to ``self.kl_divergences``.
        internals: Forwarded to ``self.kl_divergences``.
        auxiliaries: Forwarded to ``self.kl_divergences``.
        other: Forwarded to ``self.kl_divergences``.
        reduced: If true, the concatenated tensor is summed over axis 1.
        include_per_action: If true, return the per-action dict with the combined
            tensor added under the key '*'; otherwise return only the combined tensor.

    Returns:
        The combined tensor, or the per-action dict including the '*' entry.
    """
    per_action = self.kl_divergences(
        states=states, internals=internals, auxiliaries=auxiliaries, other=other
    )

    # Flatten every action's tensor to (-1, <product of the action shape>).
    for action_name, action_spec, value in util.zip_items(self.actions_spec, per_action):
        flat_shape = (-1, util.product(xs=action_spec['shape']))
        per_action[action_name] = tf.reshape(tensor=value, shape=flat_shape)

    combined = tf.concat(values=tuple(per_action.values()), axis=1)
    if reduced:
        combined = tf.math.reduce_sum(input_tensor=combined, axis=1)

    if not include_per_action:
        return combined

    # Per-action entries stay in their flattened form; only '*' is reduced.
    per_action['*'] = combined
    return per_action
def tf_sample_actions(self, states, internals, auxiliaries, deterministic, return_internals):
    """
    Samples one action per entry of ``self.actions_spec`` from the network embedding.

    Args:
        states: Input passed to ``self.network.apply`` as ``x``.
        internals: Internal states passed to ``self.network.apply``.
        auxiliaries: Mapping read at ``<action name> + '_mask'`` for 'int' actions.
        deterministic: Forwarded to ``distribution.sample``.
        return_internals: If true, the network also returns internals, which are then
            returned alongside the actions.

    Returns:
        An OrderedDict of actions, or a tuple ``(actions, internals)`` when
        ``return_internals`` is true.
    """
    network_output = self.network.apply(
        x=states, internals=internals, return_internals=return_internals
    )
    if return_internals:
        embedding, internals = network_output
    else:
        embedding = network_output

    Module.update_tensor(name=self.name, tensor=embedding)

    sampled = OrderedDict()
    for action_name, action_spec, distribution in util.zip_items(
        self.actions_spec, self.distributions
    ):
        # Only 'int' actions are parametrized with a mask.
        parametrize_kwargs = dict(x=embedding)
        if action_spec['type'] == 'int':
            parametrize_kwargs['mask'] = auxiliaries[action_name + '_mask']
        parameters = distribution.parametrize(**parametrize_kwargs)

        action = distribution.sample(parameters=parameters, deterministic=deterministic)

        # Mean entropy over the flattened action dimensions, recorded as a summary;
        # the action tensor is handed to add_summary as pass_tensors.
        flat_entropy = tf.reshape(
            tensor=distribution.entropy(parameters=parameters),
            shape=(-1, util.product(xs=action_spec['shape']))
        )
        mean_entropy = tf.reduce_mean(input_tensor=flat_entropy, axis=1)
        sampled[action_name] = self.add_summary(
            label='entropy', name=(action_name + '-entropy'), tensor=mean_entropy,
            pass_tensors=action
        )

    if return_internals:
        return sampled, internals
    return sampled
def tf_actions_value(self, states, internals, auxiliaries, actions, reduced=True,
                     include_per_action=False):
    """
    Combines the per-action values from ``self.actions_values`` into one tensor.

    Args:
        states: Forwarded to ``self.actions_values``.
        internals: Forwarded to ``self.actions_values``.
        auxiliaries: Forwarded to ``self.actions_values``.
        actions: Forwarded to ``self.actions_values``.
        reduced: If true, average over axis 1 (the combined tensor and, when
            ``include_per_action`` is set, every per-action tensor too).
        include_per_action: If true, return the per-action dict with the combined
            tensor added under the key '*'; otherwise return only the combined tensor.

    Returns:
        The combined tensor, or the per-action dict including the '*' entry.
    """
    per_action = self.actions_values(
        states=states, internals=internals, auxiliaries=auxiliaries, actions=actions
    )

    # Flatten every action's tensor to (-1, <product of the action shape>).
    for action_name, action_spec, value in util.zip_items(self.actions_spec, per_action):
        flat_shape = (-1, util.product(xs=action_spec['shape']))
        per_action[action_name] = tf.reshape(tensor=value, shape=flat_shape)

    combined = tf.concat(values=tuple(per_action.values()), axis=1)

    if reduced:
        combined = tf.math.reduce_mean(input_tensor=combined, axis=1)
        if include_per_action:
            for action_name in self.actions_spec:
                per_action[action_name] = tf.math.reduce_mean(
                    input_tensor=per_action[action_name], axis=1
                )

    if not include_per_action:
        return combined

    per_action['*'] = combined
    return per_action
def __init__(self, name, action_spec, embedding_shape, summary_labels=None):
    """
    Creates the 'alpha' and 'beta' linear modules and registers their tensors.

    Args:
        name: Module name, forwarded to the parent constructor.
        action_spec: Action specification, forwarded to the parent constructor.
        embedding_shape: Embedding shape, forwarded to the parent constructor.
        summary_labels: Forwarded to the parent constructor.

    Raises:
        TensorforceError: If a non-flat embedding has a rank above 3, or its leading
            dimensions match neither the action shape nor the action shape without
            its last dimension.
    """
    super().__init__(
        name=name, action_spec=action_spec, embedding_shape=embedding_shape,
        summary_labels=summary_labels
    )

    input_spec = dict(type='float', shape=self.embedding_shape)
    rank = len(self.embedding_shape)

    if rank == 1:
        # Flat embedding: one output per action element.
        size = util.product(xs=self.action_spec['shape'], empty=0)
    else:
        if not 1 <= rank <= 3:
            raise TensorforceError.value(
                name=name, argument='embedding_shape', value=self.embedding_shape,
                hint='invalid rank'
            )
        leading_dims = self.embedding_shape[:-1]
        if leading_dims == self.action_spec['shape'][:-1]:
            size = self.action_spec['shape'][-1]
        elif leading_dims == self.action_spec['shape']:
            size = 0
        else:
            raise TensorforceError.value(
                name=name, argument='embedding_shape', value=self.embedding_shape,
                hint='not flattened and incompatible with action shape'
            )

    linear_kwargs = dict(
        module='linear', modules=layer_modules, size=size, input_spec=input_spec
    )
    self.alpha = self.add_module(name='alpha', **linear_kwargs)
    self.beta = self.add_module(name='beta', **linear_kwargs)

    for suffix in ('-alpha', '-beta'):
        Module.register_tensor(
            name=(self.name + suffix),
            spec=dict(type='float', shape=self.action_spec['shape']), batched=True
        )
def tf_entropy(self, states, internals, auxiliaries, reduced=True, include_per_action=False):
    """
    Combines the per-action entropies from ``self.entropies`` into one tensor.

    Args:
        states: Forwarded to ``self.entropies``.
        internals: Forwarded to ``self.entropies``.
        auxiliaries: Forwarded to ``self.entropies``.
        reduced: If true, average over axis 1 (the combined tensor and, when
            ``include_per_action`` is set, every per-action tensor too).
        include_per_action: If true, return the per-action dict with the combined
            tensor added under the key '*'; otherwise return only the combined tensor.

    Returns:
        The combined tensor, or the per-action dict including the '*' entry.
    """
    per_action = self.entropies(states=states, internals=internals, auxiliaries=auxiliaries)

    # Flatten every action's tensor to (-1, <product of the action shape>).
    for action_name, action_spec, value in util.zip_items(self.actions_spec, per_action):
        flat_shape = (-1, util.product(xs=action_spec['shape']))
        per_action[action_name] = tf.reshape(tensor=value, shape=flat_shape)

    combined = tf.concat(values=tuple(per_action.values()), axis=1)

    if reduced:
        combined = tf.math.reduce_mean(input_tensor=combined, axis=1)
        if include_per_action:
            for action_name in self.actions_spec:
                per_action[action_name] = tf.math.reduce_mean(
                    input_tensor=per_action[action_name], axis=1
                )

    if not include_per_action:
        return combined

    per_action['*'] = combined
    return per_action
def tf_regularize(self, states, internals):
    """
    Extends the parent regularization loss with an entropy term.

    The entropy of every distribution is flattened per instance, averaged over all
    action dimensions and then over axis 0, scaled by ``self.entropy_regularization``
    and subtracted from the parent's loss.

    Args:
        states: Input passed to the network and the parent implementation.
        internals: Internal states passed to the network and the parent implementation.

    Returns:
        The parent regularization loss minus the scaled mean entropy.
    """
    base_loss = super().tf_regularize(states=states, internals=internals)

    embedding = self.network.apply(x=states, internals=internals)
    flat_entropies = list()
    for distribution in self.distributions.values():
        parameters = distribution.parametrize(x=embedding)
        entropy = distribution.entropy(parameters=parameters)
        width = util.product(xs=util.shape(entropy)[1:])
        flat_entropies.append(tf.reshape(tensor=entropy, shape=(-1, width)))

    all_entropies = tf.concat(values=flat_entropies, axis=1)
    per_instance = tf.reduce_mean(input_tensor=all_entropies, axis=1)
    mean_entropy = tf.reduce_mean(input_tensor=per_instance, axis=0)

    # NOTE: the term is applied unconditionally, i.e. also when the coefficient is zero.
    coefficient = self.entropy_regularization.value()
    return base_loss - coefficient * mean_entropy
def tf_loss_per_instance(
    self, states, internals, actions, terminal, reward, next_states, next_internals,
    reference=None
):
    """
    Computes the per-instance loss from clipped probability ratios.

    Args:
        states: Input passed to the network.
        internals: Internal states passed to the network.
        actions: Actions, zipped with ``self.distributions`` via ``util.zip_items``.
        terminal: Unused in this method.
        reward: Multiplied with the (clipped) probability ratio.
        next_states: Unused in this method.
        next_internals: Unused in this method.
        reference: Optional reference log probabilities; if None, a gradient-stopped
            copy of the current log probabilities is used.

    Returns:
        Negated element-wise minimum of ratio * reward and clipped ratio * reward.
    """
    embedding = self.network.apply(x=states, internals=internals)

    flat_log_probs = list()
    for _, distribution, action in util.zip_items(self.distributions, actions):
        parameters = distribution.parametrize(x=embedding)
        log_prob = distribution.log_probability(parameters=parameters, action=action)
        width = util.product(xs=util.shape(log_prob)[1:])
        flat_log_probs.append(tf.reshape(tensor=log_prob, shape=(-1, width)))
    log_probs = tf.concat(values=flat_log_probs, axis=1)

    # Without a reference the difference below is zero in value (ratio of one),
    # while the gradient still flows through the non-stopped term.
    if reference is None:
        old_log_probs = tf.stop_gradient(input=log_probs)
    else:
        old_log_probs = reference

    ratios = tf.exp(x=(log_probs - old_log_probs))
    ratio_per_instance = tf.reduce_mean(input_tensor=ratios, axis=1)

    clipping = self.likelihood_ratio_clipping.value()
    clipped_ratio_per_instance = tf.clip_by_value(
        t=ratio_per_instance,
        clip_value_min=(1.0 / (1.0 + clipping)),
        clip_value_max=(1.0 + clipping)
    )

    unclipped_objective = ratio_per_instance * reward
    clipped_objective = clipped_ratio_per_instance * reward
    return -tf.minimum(x=unclipped_objective, y=clipped_objective)
def __init__(self, name, action_spec, embedding_size, summary_labels=None):
    """
    Creates the 'mean' and 'log-stddev' linear modules and registers their tensors.

    Args:
        name: Module name, forwarded to the parent constructor.
        action_spec: Action specification, forwarded to the parent constructor.
        embedding_size: Embedding size, forwarded to the parent constructor.
        summary_labels: Forwarded to the parent constructor.
    """
    super().__init__(
        name=name, action_spec=action_spec, embedding_size=embedding_size,
        summary_labels=summary_labels
    )

    action_size = util.product(xs=self.action_spec['shape'], empty=0)
    input_spec = dict(type='float', shape=(self.embedding_size,))

    linear_kwargs = dict(
        module='linear', modules=layer_modules, size=action_size, input_spec=input_spec
    )
    self.mean = self.add_module(name='mean', **linear_kwargs)
    self.log_stddev = self.add_module(name='log-stddev', **linear_kwargs)

    for suffix in ('-mean', '-stddev'):
        Module.register_tensor(
            name=(self.name + suffix),
            spec=dict(type='float', shape=self.action_spec['shape']), batched=True
        )
def __init__(self, name, action_spec, embedding_size, infer_states_value=True,
             summary_labels=None):
    """
    Creates the 'deviations' module, an optional 'value' module, and registers the
    probabilities tensor.

    Args:
        name: Module name, forwarded to the parent constructor.
        action_spec: Action specification, forwarded to the parent constructor; its
            'shape' and 'num_values' entries are read here.
        embedding_size: Embedding size, forwarded to the parent constructor.
        infer_states_value: If true, no separate 'value' module is created and
            ``self.value`` is None.
        summary_labels: Forwarded to the parent constructor.
    """
    super().__init__(
        name=name, action_spec=action_spec, embedding_size=embedding_size,
        summary_labels=summary_labels
    )

    shape = self.action_spec['shape']
    num_values = self.action_spec['num_values']
    action_size = util.product(xs=shape)
    input_spec = dict(type='float', shape=(self.embedding_size,))

    self.deviations = self.add_module(
        name='deviations', module='linear', modules=layer_modules,
        size=(action_size * num_values), input_spec=input_spec
    )

    self.value = None
    if not infer_states_value:
        self.value = self.add_module(
            name='value', module='linear', modules=layer_modules, size=action_size,
            input_spec=input_spec
        )

    Module.register_tensor(
        name=(self.name + '-probabilities'),
        spec=dict(type='float', shape=(shape + (num_values,))), batched=True
    )
def tf_log_probability(self, states, internals, auxiliaries, actions, reduced=True,
                       include_per_action=False):
    """
    Combines the per-action log probabilities from ``self.log_probabilities``.

    Args:
        states: Forwarded to ``self.log_probabilities``.
        internals: Forwarded to ``self.log_probabilities``.
        auxiliaries: Forwarded to ``self.log_probabilities``.
        actions: Forwarded to ``self.log_probabilities``.
        reduced: If true, the concatenated tensor is summed over axis 1.
        include_per_action: If true, return the per-action dict with the combined
            tensor added under the key '*'; otherwise return only the combined tensor.

    Returns:
        The combined tensor, or the per-action dict including the '*' entry.
    """
    per_action = self.log_probabilities(
        states=states, internals=internals, auxiliaries=auxiliaries, actions=actions
    )

    # Flatten every action's tensor to (-1, <product of the action shape>).
    for action_name, action_spec, value in util.zip_items(self.actions_spec, per_action):
        flat_shape = (-1, util.product(xs=action_spec['shape']))
        per_action[action_name] = tf.reshape(tensor=value, shape=flat_shape)

    combined = tf.concat(values=tuple(per_action.values()), axis=1)
    if reduced:
        combined = tf.math.reduce_sum(input_tensor=combined, axis=1)

    if not include_per_action:
        return combined

    # Per-action entries stay in their flattened form; only '*' is reduced.
    per_action['*'] = combined
    return per_action
def tf_loss_per_instance(
    self, states, internals, actions, terminal, reward, next_states, next_internals,
    reference=None
):
    """
    Computes the per-instance loss from Q-value deltas, optionally as a Huber loss.

    This variant bootstraps from the target distribution's state value of the next
    state rather than from a next Q-value.

    Args:
        states: Input passed to ``self.network``.
        internals: Internal states passed to ``self.network``.
        actions: Mapping from distribution name to action tensor.
        terminal: Forwarded to ``self.tf_q_delta``.
        reward: Forwarded to ``self.tf_q_delta``.
        next_states: Input passed to ``self.target_network``.
        next_internals: Mapping of internal states; each is re-keyed with a 'target-'
            prefix before being passed to ``self.target_network``.
        reference: Unused in this method.

    Returns:
        Result of ``self.cond``: the squared mean delta when ``self.huber_loss`` equals
        zero, otherwise the Huber loss of the mean delta.
    """
    embedding = self.network.apply(x=states, internals=internals)

    # NOTE(review): both networks are fed the same internals, only re-keyed with a
    # 'target-' prefix; the original author questioned whether that is a problem.
    target_internals = OrderedDict(
        ('target-' + key, internal) for key, internal in next_internals.items()
    )
    Module.update_tensors(**target_internals)
    target_embedding = self.target_network.apply(x=next_states, internals=target_internals)

    flat_deltas = list()
    for dist_name in sorted(self.distributions):
        distribution = self.distributions[dist_name]
        target_distribution = self.target_distributions[dist_name]

        parameters = distribution.parametrize(x=embedding)
        target_parameters = target_distribution.parametrize(x=target_embedding)

        q_value = self.tf_q_value(
            embedding=embedding, parameters=parameters, action=actions[dist_name],
            name=dist_name
        )

        # State value V' of the next state is passed in the next_q_value slot.
        next_state_value = target_distribution.states_value(parameters=target_parameters)

        delta = self.tf_q_delta(
            q_value=q_value, next_q_value=next_state_value, terminal=terminal,
            reward=reward
        )

        width = util.product(xs=util.shape(delta)[1:])
        flat_deltas.append(tf.reshape(tensor=delta, shape=(-1, width)))

    # Mean delta over all action dimensions of an instance.
    loss_per_instance = tf.reduce_mean(
        input_tensor=tf.concat(values=flat_deltas, axis=1), axis=1
    )

    # Optional Huber loss, skipped when the threshold equals zero.
    huber_loss = self.huber_loss.value()

    def no_huber_loss():
        return tf.square(x=loss_per_instance)

    def apply_huber_loss():
        magnitude_within_threshold = (tf.abs(x=loss_per_instance) <= huber_loss)
        return tf.where(
            condition=magnitude_within_threshold,
            x=(0.5 * tf.square(x=loss_per_instance)),
            y=(huber_loss * (tf.abs(x=loss_per_instance) - 0.5 * huber_loss))
        )

    zero = tf.constant(value=0.0, dtype=util.tf_dtype(dtype='float'))
    skip_huber_loss = tf.math.equal(x=huber_loss, y=zero)
    return self.cond(
        pred=skip_huber_loss, true_fn=no_huber_loss, false_fn=apply_huber_loss
    )
def output_spec(self):
    """
    Returns the parent's output specification with its shape set to ``self.shape``.

    Raises:
        TensorforceError: If the parent specification's ``size`` differs from
            ``util.product`` of ``self.shape``.
    """
    spec = super().output_spec()
    target_size = util.product(xs=self.shape)
    if spec.size == target_size:
        spec.shape = self.shape
        return spec
    raise TensorforceError.value(name='Reshape', argument='shape', value=self.shape)
def get_output_spec(self, input_spec):
    """
    Derives the output specification for the configured reduction.

    Args:
        input_spec: Specification dict with a 'shape' entry; it is updated in place.

    Returns:
        The same ``input_spec`` object. For 'concat' the shape becomes a 1-tuple holding
        ``util.product`` of the input shape; for 'max', 'mean', 'product' and 'sum' it
        becomes a 1-tuple holding the last input dimension; any other reduction leaves
        the shape untouched. 'min_value' and 'max_value' are always removed.
    """
    mode = self.reduction
    if mode == 'concat':
        input_spec['shape'] = (util.product(xs=input_spec['shape']),)
    elif mode in ('max', 'mean', 'product', 'sum'):
        input_spec['shape'] = (input_spec['shape'][-1],)

    # Value bounds of the input are dropped from the output specification.
    for bound in ('min_value', 'max_value'):
        input_spec.pop(bound, None)
    return input_spec
def tf_optimization(
    self, states, internals, actions, terminal, reward, next_states=None,
    next_internals=None
):
    """
    Creates the TensorFlow operations for performing an optimization update step based
    on the given input states and actions batch, and records KL-divergence and entropy
    summaries comparing the distributions before and after the update.

    Args:
        states: Dict of state tensors.
        internals: List of prior internal state tensors.
        actions: Dict of action tensors.
        terminal: Terminal boolean tensor.
        reward: Reward tensor.
        next_states: Dict of successor state tensors.
        next_internals: List of posterior internal state tensors.

    Returns:
        The optimization operation.
    """
    # Distribution parameters prior to the update, keyed by distribution name.
    embedding = self.network.apply(x=states, internals=internals)
    parameters_before = OrderedDict(
        (dist_name, distribution.parametrize(x=embedding))
        for dist_name, distribution in self.distributions.items()
    )

    # The update is created under a control dependency on the pre-update parameters.
    with tf.control_dependencies(control_inputs=util.flatten(xs=parameters_before)):
        optimized = super().tf_optimization(
            states=states, internals=internals, actions=actions, terminal=terminal,
            reward=reward, next_states=next_states, next_internals=next_internals
        )

    with tf.control_dependencies(control_inputs=(optimized,)):
        summaries = list()
        embedding = self.network.apply(x=states, internals=internals)
        for dist_name, distribution in self.distributions.items():
            parameters_after = distribution.parametrize(x=embedding)

            kl_divergence = distribution.kl_divergence(
                parameters1=parameters_before[dist_name], parameters2=parameters_after
            )
            width = util.product(xs=util.shape(kl_divergence)[1:])
            kl_divergence = tf.reshape(tensor=kl_divergence, shape=(-1, width))
            kl_divergence = tf.reduce_mean(input_tensor=kl_divergence, axis=1)
            summaries.append(self.add_summary(
                label='kl-divergence', name=(dist_name + '-kldiv'), tensor=kl_divergence
            ))

            # The entropy is flattened with the width derived from the KL divergence.
            entropy = distribution.entropy(parameters=parameters_after)
            entropy = tf.reshape(tensor=entropy, shape=(-1, width))
            entropy = tf.reduce_mean(input_tensor=entropy, axis=1)
            summaries.append(self.add_summary(
                label='entropy', name=(dist_name + '-entropy'), tensor=entropy
            ))

    with tf.control_dependencies(control_inputs=summaries):
        return util.no_operation()
def embedding(input, indices, size, name='embs'):
    """
    Looks up rows of a newly created trainable variable and applies tanh.

    Args:
        input: Ids passed to the embedding lookup.
        indices: First dimension of the variable 'W'.
        size: Second dimension of the variable 'W'.
        name: Name of the variable scope in which 'W' is created.

    Returns:
        ``tanh`` of the rows of 'W' selected by ``input``.
    """
    with tf.compat.v1.variable_scope(name):
        table_shape = (indices, size)
        # Initial standard deviation is capped at 0.1.
        fan_total = util.product(xs=table_shape[:-1]) + table_shape[-1]
        stddev = min(0.1, sqrt(2.0 / fan_total))
        initial_value = tf.random.normal(shape=table_shape, stddev=stddev, dtype=tf.float32)
        W = tf.Variable(
            initial_value=initial_value, trainable=True, validate_shape=True, name='W',
            dtype=tf.float32, shape=table_shape
        )
        looked_up = tf.compat.v1.nn.embedding_lookup(params=W, ids=input, max_norm=None)
        return tf.nn.tanh(looked_up)
def __init__(self, name, action_spec, embedding_shape, summary_labels=None):
    """
    Creates the 'mean' and 'log-stddev' linear modules and registers their tensors.

    Args:
        name: Module name, forwarded to the parent constructor.
        action_spec: Action specification, forwarded to the parent constructor.
        embedding_shape: Embedding shape, forwarded to the parent constructor.
        summary_labels: Forwarded to the parent constructor.

    Raises:
        TensorforceError: If a non-flat embedding has a rank above 3, or its leading
            dimensions match neither the action shape nor the action shape without
            its last dimension.
    """
    super().__init__(
        name=name, action_spec=action_spec, embedding_shape=embedding_shape,
        summary_labels=summary_labels
    )

    input_spec = dict(type='float', shape=self.embedding_shape)

    if len(self.embedding_shape) == 1:
        # Flat embedding: one output per action element.
        size = util.product(xs=self.action_spec['shape'], empty=0)
    else:
        # Errors name the offending argument and reason, in the same style as the
        # alpha/beta distribution constructor, instead of a bare "unexpected" error.
        if len(self.embedding_shape) < 1 or len(self.embedding_shape) > 3:
            raise TensorforceError.value(
                name=name, argument='embedding_shape', value=self.embedding_shape,
                hint='invalid rank'
            )
        if self.embedding_shape[:-1] == self.action_spec['shape'][:-1]:
            size = self.action_spec['shape'][-1]
        elif self.embedding_shape[:-1] == self.action_spec['shape']:
            size = 0
        else:
            raise TensorforceError.value(
                name=name, argument='embedding_shape', value=self.embedding_shape,
                hint='not flattened and incompatible with action shape'
            )

    self.mean = self.add_module(
        name='mean', module='linear', modules=layer_modules, size=size,
        input_spec=input_spec
    )
    self.log_stddev = self.add_module(
        name='log-stddev', module='linear', modules=layer_modules, size=size,
        input_spec=input_spec
    )

    Module.register_tensor(
        name=(self.name + '-mean'),
        spec=dict(type='float', shape=self.action_spec['shape']), batched=True
    )
    Module.register_tensor(
        name=(self.name + '-stddev'),
        spec=dict(type='float', shape=self.action_spec['shape']), batched=True
    )
def __init__(self, *, name=None, action_spec=None, input_spec=None):
    """
    Creates the 'alpha' and 'beta' linear submodules for a bounded float action.

    Args:
        name: Module name, forwarded to the parent constructor.
        action_spec: Action specification; asserted to be of type 'float' with both
            ``min_value`` and ``max_value`` set.
        input_spec: Input specification, forwarded to the parent constructor.

    Raises:
        TensorforceError: If a non-flat input has a rank above 3, or its leading
            dimensions match neither the action shape nor the action shape without
            its last dimension.
    """
    assert action_spec.type == 'float' and action_spec.min_value is not None and \
        action_spec.max_value is not None

    # All parameter tensors are floats with the shape of the action.
    parameter_names = ('alpha', 'beta', 'alpha_beta', 'log_norm')
    parameters_spec = TensorsSpec(**{
        parameter: TensorSpec(type='float', shape=action_spec.shape)
        for parameter in parameter_names
    })
    conditions_spec = TensorsSpec()

    super().__init__(
        name=name, action_spec=action_spec, input_spec=input_spec,
        parameters_spec=parameters_spec, conditions_spec=conditions_spec
    )

    rank = len(self.input_spec.shape)
    if rank == 1:
        # Single embedding
        size = util.product(xs=self.action_spec.shape, empty=0)
    else:
        # Embedding per action
        if not 1 <= rank <= 3:
            raise TensorforceError.value(
                name=name, argument='input_spec.shape', value=self.input_spec.shape,
                hint='invalid rank'
            )
        leading_dims = self.input_spec.shape[:-1]
        if leading_dims == self.action_spec.shape[:-1]:
            size = self.action_spec.shape[-1]
        elif leading_dims == self.action_spec.shape:
            size = 0
        else:
            raise TensorforceError.value(
                name=name, argument='input_spec.shape', value=self.input_spec.shape,
                hint='not flattened and incompatible with action shape'
            )

    self.alpha = self.submodule(
        name='alpha', module='linear', modules=layer_modules, size=size,
        initialization_scale=0.01, input_spec=self.input_spec
    )
    self.beta = self.submodule(
        name='beta', module='linear', modules=layer_modules, size=size,
        initialization_scale=0.01, input_spec=self.input_spec
    )
def __init__(self, name, action_spec, embedding_size, summary_labels=None):
    """
    Creates the 'logit' linear module.

    Args:
        name: Module name, forwarded to the parent constructor.
        action_spec: Action specification, forwarded to the parent constructor.
        embedding_size: Embedding size, forwarded to the parent constructor.
        summary_labels: Forwarded to the parent constructor.
    """
    super().__init__(
        name=name, action_spec=action_spec, embedding_size=embedding_size,
        summary_labels=summary_labels
    )

    # One logit per action element.
    num_logits = util.product(xs=self.action_spec['shape'], empty=0)
    embedding_spec = dict(type='float', shape=(self.embedding_size,))
    self.logit = self.add_module(
        name='logit', module='linear', modules=layer_modules, size=num_logits,
        input_spec=embedding_spec
    )
def __init__(
    self, name, action_spec, embedding_shape, infer_states_value=True, summary_labels=None
):
    """
    Creates the 'deviations' module, an optional 'value' module, and registers the
    probabilities tensor.

    Args:
        name: Module name, forwarded to the parent constructor.
        action_spec: Action specification, forwarded to the parent constructor; its
            'shape' and 'num_values' entries are read here.
        embedding_shape: Embedding shape, forwarded to the parent constructor.
        infer_states_value: If true, no separate 'value' module is created and
            ``self.value`` is None.
        summary_labels: Forwarded to the parent constructor.

    Raises:
        TensorforceError: If a non-flat embedding has a rank above 3, or its leading
            dimensions match neither the action shape nor the action shape without
            its last dimension.
    """
    super().__init__(
        name=name, action_spec=action_spec, embedding_shape=embedding_shape,
        summary_labels=summary_labels
    )

    input_spec = dict(type='float', shape=self.embedding_shape)
    num_values = self.action_spec['num_values']

    if len(self.embedding_shape) == 1:
        # Flat embedding: one output group per action element.
        size = util.product(xs=self.action_spec['shape'])
    else:
        # Errors name the offending argument and reason, in the same style as the
        # sibling constructor without a value module, instead of a bare
        # "unexpected" error.
        if len(self.embedding_shape) < 1 or len(self.embedding_shape) > 3:
            raise TensorforceError.value(
                name=name, argument='embedding_shape', value=self.embedding_shape,
                hint='invalid rank'
            )
        if self.embedding_shape[:-1] == self.action_spec['shape'][:-1]:
            size = self.action_spec['shape'][-1]
        elif self.embedding_shape[:-1] == self.action_spec['shape']:
            size = 1
        else:
            raise TensorforceError.value(
                name=name, argument='embedding_shape', value=self.embedding_shape,
                hint='not flattened and incompatible with action shape'
            )

    self.deviations = self.add_module(
        name='deviations', module='linear', modules=layer_modules,
        size=(size * num_values), input_spec=input_spec
    )
    if infer_states_value:
        self.value = None
    else:
        self.value = self.add_module(
            name='value', module='linear', modules=layer_modules, size=size,
            input_spec=input_spec
        )

    Module.register_tensor(
        name=(self.name + '-probabilities'),
        spec=dict(type='float', shape=(self.action_spec['shape'] + (num_values,))),
        batched=True
    )
def __init__(self, *, name=None, action_spec=None, input_spec=None):
    """
    Creates the 'logit' linear submodule for a boolean action.

    Args:
        name: Module name, forwarded to the parent constructor.
        action_spec: Action specification; asserted to be of type 'bool'.
        input_spec: Input specification, forwarded to the parent constructor.

    Raises:
        TensorforceError: If a non-flat input has a rank above 3, or its leading
            dimensions match neither the action shape nor the action shape without
            its last dimension.
    """
    assert action_spec.type == 'bool'

    # All parameter tensors are floats with the shape of the action.
    parameter_names = ('true_logit', 'false_logit', 'probability', 'state_value')
    parameters_spec = TensorsSpec(**{
        parameter: TensorSpec(type='float', shape=action_spec.shape)
        for parameter in parameter_names
    })
    conditions_spec = TensorsSpec()

    super().__init__(
        name=name, action_spec=action_spec, input_spec=input_spec,
        parameters_spec=parameters_spec, conditions_spec=conditions_spec
    )

    rank = len(self.input_spec.shape)
    if rank == 1:
        # Single embedding
        size = util.product(xs=self.action_spec.shape, empty=0)
    else:
        # Embedding per action
        if not 1 <= rank <= 3:
            raise TensorforceError.value(
                name=name, argument='input_spec.shape', value=self.input_spec.shape,
                hint='invalid rank'
            )
        leading_dims = self.input_spec.shape[:-1]
        if leading_dims == self.action_spec.shape[:-1]:
            size = self.action_spec.shape[-1]
        elif leading_dims == self.action_spec.shape:
            size = 0
        else:
            raise TensorforceError.value(
                name=name, argument='input_spec.shape', value=self.input_spec.shape,
                hint='not flattened and incompatible with action shape'
            )

    self.logit = self.submodule(
        name='logit', module='linear', modules=layer_modules, size=size,
        initialization_scale=0.01, input_spec=self.input_spec
    )
def apply(self, *, x):
    """
    Applies scaled dot-product attention over ``x`` and passes the result to the
    parent's ``apply``.

    Queries, keys and values come from ``self.query``, ``self.key`` and ``self.value``.
    For inputs whose spec rank exceeds 2, all but the last spec dimension are flattened
    into one axis before the attention and restored to ``self.output_spec().shape``
    afterwards.

    Args:
        x: Input tensor.

    Returns:
        Result of ``super().apply`` on the attended tensor.
    """
    queries = self.query.apply(x=x)
    keys = self.key.apply(x=x)
    values = self.value.apply(x=x)

    needs_flattening = self.input_spec.rank > 2

    if needs_flattening:
        # Leading (batch) dimension as a length-one shape tensor.
        batch_size = tf_util.cast(x=tf.shape(input=x)[:1], dtype='int')

        attention_dims = tf_util.constant(
            value=(util.product(xs=self.input_spec.shape[:-1]), self.attention_size),
            dtype='int'
        )
        attention_shape = tf.concat(values=(batch_size, attention_dims), axis=0)
        queries = tf.reshape(tensor=queries, shape=attention_shape)
        keys = tf.reshape(tensor=keys, shape=attention_shape)

        value_dims = tf_util.constant(
            value=(util.product(xs=self.input_spec.shape[:-1]), self.size), dtype='int'
        )
        value_shape = tf.concat(values=(batch_size, value_dims), axis=0)
        values = tf.reshape(tensor=values, shape=value_shape)

    # softmax(Q K^T / sqrt(attention_size)) V
    scores = tf.linalg.matmul(a=queries, b=keys, transpose_b=True)
    scores = scores / tf_util.constant(value=np.sqrt(self.attention_size), dtype='float')
    weights = tf.nn.softmax(logits=scores, axis=-1)
    x = tf.linalg.matmul(a=weights, b=values)

    if needs_flattening:
        output_dims = tf_util.constant(value=self.output_spec().shape, dtype='int')
        output_shape = tf.concat(values=(batch_size, output_dims), axis=0)
        x = tf.reshape(tensor=x, shape=output_shape)

    return super().apply(x=x)
def tf_reference(
    self, states, internals, actions, terminal, reward, next_states, next_internals
):
    """
    Computes gradient-stopped log probabilities of the given actions.

    Args:
        states: Input passed to the network.
        internals: Internal states passed to the network.
        actions: Actions, zipped with ``self.distributions`` via ``util.zip_items``.
        terminal: Unused in this method.
        reward: Unused in this method.
        next_states: Unused in this method.
        next_internals: Unused in this method.

    Returns:
        The log probabilities of all distributions, each flattened per instance and
        concatenated along axis 1, wrapped in ``tf.stop_gradient``.
    """
    embedding = self.network.apply(x=states, internals=internals)

    flat_log_probs = list()
    for _, distribution, action in util.zip_items(self.distributions, actions):
        parameters = distribution.parametrize(x=embedding)
        log_prob = distribution.log_probability(parameters=parameters, action=action)
        width = util.product(xs=util.shape(log_prob)[1:])
        flat_log_probs.append(tf.reshape(tensor=log_prob, shape=(-1, width)))

    concatenated = tf.concat(values=flat_log_probs, axis=1)
    return tf.stop_gradient(input=concatenated)
def tf_entropy(self, states, internals, auxiliaries, mean=True):
    """
    Combines the per-action entropies from ``self.entropies`` into one tensor.

    Args:
        states: Forwarded to ``self.entropies``.
        internals: Forwarded to ``self.entropies``.
        auxiliaries: Forwarded to ``self.entropies``.
        mean: If true, the concatenated tensor is averaged over axis 1.

    Returns:
        The concatenated (and optionally averaged) entropy tensor.
    """
    per_action = self.entropies(states=states, internals=internals, auxiliaries=auxiliaries)

    # Flatten every action's tensor to (-1, <product of the action shape>).
    for action_name, action_spec, value in util.zip_items(self.actions_spec, per_action):
        flat_shape = (-1, util.product(xs=action_spec['shape']))
        per_action[action_name] = tf.reshape(tensor=value, shape=flat_shape)

    combined = tf.concat(values=tuple(per_action.values()), axis=1)
    if not mean:
        return combined
    return tf.math.reduce_mean(input_tensor=combined, axis=1)
def __init__(self, name, action_spec, embedding_size, summary_labels=None):
    """
    Categorical distribution.

    Creates the 'logits' linear module with one output per action element and value.

    Args:
        name: Module name, forwarded to the parent constructor.
        action_spec: Action specification, forwarded to the parent constructor; its
            'shape' and 'num_values' entries are read here.
        embedding_size: Embedding size, forwarded to the parent constructor.
        summary_labels: Forwarded to the parent constructor.
    """
    super().__init__(
        name=name, action_spec=action_spec, embedding_size=embedding_size,
        summary_labels=summary_labels
    )

    num_elements = util.product(xs=self.action_spec['shape'])
    num_logits = num_elements * self.action_spec['num_values']
    embedding_spec = dict(type='float', shape=(self.embedding_size,))
    self.logits = self.add_module(
        name='logits', module='linear', modules=layer_modules, size=num_logits,
        input_spec=embedding_spec
    )
def tf_states_value(self, states, internals, auxiliaries, mean=True):
    """
    Combines the per-action state values from ``self.states_values`` into one tensor.

    Args:
        states: Forwarded to ``self.states_values``.
        internals: Forwarded to ``self.states_values``.
        auxiliaries: Forwarded to ``self.states_values``.
        mean: If true, the concatenated tensor is averaged over axis 1.

    Returns:
        The concatenated (and optionally averaged) state-value tensor.
    """
    per_action = self.states_values(
        states=states, internals=internals, auxiliaries=auxiliaries
    )

    # Flatten every action's tensor to (-1, <product of the action shape>).
    for action_name, action_spec, value in util.zip_items(self.actions_spec, per_action):
        flat_shape = (-1, util.product(xs=action_spec['shape']))
        per_action[action_name] = tf.reshape(tensor=value, shape=flat_shape)

    combined = tf.concat(values=tuple(per_action.values()), axis=1)
    if not mean:
        return combined
    return tf.math.reduce_mean(input_tensor=combined, axis=1)
def __init__(
    self,
    # Model
    states, actions, scope, device, saver, summarizer, execution, parallel_interactions,
    buffer_observe, exploration, variable_noise, states_preprocessing, reward_preprocessing,
    # MemoryModel
    update_mode, memory, optimizer, discount,
    # DistributionModel
    network, distributions, entropy_regularization,
    # QModel
    target_sync_frequency, target_update_weight, double_q_model, huber_loss
):
    """
    Validates the action specifications, initializes the parent model and creates one
    state-value module and one L-entries module per action.

    All arguments are forwarded unchanged to the parent constructor; ``actions`` is
    additionally checked here.

    Raises:
        TensorforceError: If any action is not of type 'float' or declares a
            'min_value' or 'max_value'.
    """
    def is_unsupported(spec):
        # Only float actions without declared bounds are accepted.
        return spec['type'] != 'float' or 'min_value' in spec or 'max_value' in spec

    if any(is_unsupported(spec) for _, spec in actions.items()):
        raise TensorforceError("Only unconstrained float actions valid for NAFModel.")

    super().__init__(
        # Model
        states=states, actions=actions, scope=scope, device=device, saver=saver,
        summarizer=summarizer, execution=execution,
        parallel_interactions=parallel_interactions, buffer_observe=buffer_observe,
        exploration=exploration, variable_noise=variable_noise,
        states_preprocessing=states_preprocessing,
        reward_preprocessing=reward_preprocessing,
        # MemoryModel
        update_mode=update_mode, memory=memory, optimizer=optimizer, discount=discount,
        # DistributionModel
        network=network, distributions=distributions,
        entropy_regularization=entropy_regularization,
        # QModel
        target_sync_frequency=target_sync_frequency,
        target_update_weight=target_update_weight, double_q_model=double_q_model,
        huber_loss=huber_loss
    )

    self.state_values = OrderedDict()
    self.l_entries = OrderedDict()

    # Both per-action modules consume the network output as a flat embedding.
    embedding_size = self.network.get_output_spec()['shape'][0]
    input_spec = dict(type='float', shape=(embedding_size,))

    for action_name, action_spec in self.actions_spec.items():
        action_size = util.product(xs=action_spec['shape'])
        self.state_values[action_name] = self.add_module(
            name=(action_name + '-state-value'), module='linear', modules=layer_modules,
            size=action_size, input_spec=input_spec
        )
        self.l_entries[action_name] = self.add_module(
            name=(action_name + '-l-entries'), module='linear', modules=layer_modules,
            size=action_size, input_spec=input_spec
        )
def __init__(self, name, action_spec, embedding_size, summary_labels=None):
    """
    Creates the 'mean' and 'log-stddev' linear modules of the distribution.

    Args:
        name: Module name, forwarded to the parent constructor.
        action_spec: Action specification, forwarded to the parent constructor.
        embedding_size: Embedding size, forwarded to the parent constructor.
        summary_labels: Forwarded to the parent constructor.
    """
    super().__init__(
        name=name, action_spec=action_spec, embedding_size=embedding_size,
        summary_labels=summary_labels
    )

    # One mean and one log standard deviation per action element.
    num_outputs = util.product(xs=self.action_spec['shape'], empty=0)
    embedding_spec = dict(type='float', shape=(self.embedding_size,))
    linear_kwargs = dict(
        module='linear', modules=layer_modules, size=num_outputs,
        input_spec=embedding_spec
    )
    self.mean = self.add_module(name='mean', **linear_kwargs)
    self.log_stddev = self.add_module(name='log-stddev', **linear_kwargs)
def __init__(self, name, action_spec, embedding_shape, summary_labels=None):
    """
    Creates the 'deviations' linear module and registers the probabilities tensor.

    Args:
        name: Module name, forwarded to the parent constructor.
        action_spec: Action specification, forwarded to the parent constructor; its
            'shape' and 'num_values' entries are read here.
        embedding_shape: Embedding shape, forwarded to the parent constructor.
        summary_labels: Forwarded to the parent constructor.

    Raises:
        TensorforceError: If a non-flat embedding has a rank above 3, or its leading
            dimensions match neither the action shape nor the action shape without
            its last dimension.
    """
    super().__init__(
        name=name, action_spec=action_spec, embedding_shape=embedding_shape,
        summary_labels=summary_labels
    )

    input_spec = dict(type='float', shape=self.embedding_shape)
    num_values = self.action_spec['num_values']
    rank = len(self.embedding_shape)

    if rank == 1:
        # Flat embedding: one output group per action element.
        size = util.product(xs=self.action_spec['shape'])
    else:
        if not 1 <= rank <= 3:
            raise TensorforceError.value(
                name=name, argument='embedding_shape', value=self.embedding_shape,
                hint='invalid rank'
            )
        leading_dims = self.embedding_shape[:-1]
        if leading_dims == self.action_spec['shape'][:-1]:
            size = self.action_spec['shape'][-1]
        elif leading_dims == self.action_spec['shape']:
            size = 1
        else:
            raise TensorforceError.value(
                name=name, argument='embedding_shape', value=self.embedding_shape,
                hint='not flattened and incompatible with action shape'
            )

    self.deviations = self.add_module(
        name='deviations', module='linear', modules=layer_modules,
        size=(size * num_values), input_spec=input_spec
    )

    probabilities_shape = self.action_spec['shape'] + (num_values,)
    Module.register_tensor(
        name=(self.name + '-probabilities'),
        spec=dict(type='float', shape=probabilities_shape), batched=True
    )
def tf_q_value(self, embedding, parameters, action, name):
    """
    Computes the Q-value of an action as state value plus a quadratic advantage term.

    Args:
        embedding: Network embedding, input to the L-entries and state-value modules.
        parameters: Triple whose first two entries are the mean and stddev tensors; the
            third entry is ignored.
        action: Action tensor.
        name: Action name, used to index ``self.actions_spec``, ``self.l_entries`` and
            ``self.state_values``.

    Returns:
        The Q-value reshaped to ``(-1,) + <action shape>``.
    """
    num_action = util.product(xs=self.actions_spec[name]['shape'])

    mean, stddev, _ = parameters
    flat_mean = tf.reshape(tensor=mean, shape=(-1, num_action))
    flat_stddev = tf.reshape(tensor=stddev, shape=(-1, num_action))

    # Advantage computation: the network outputs entries of a lower triangular matrix L.
    l_module = self.l_entries[name]
    if l_module is None:
        l_matrix = flat_stddev
        l_matrix = tf.exp(l_matrix)
    else:
        # The flattened stddev forms the diagonal of L.
        l_matrix = tf.linalg.diag(diagonal=flat_stddev)

        raw_entries = tf.exp(l_module.apply(x=embedding))
        start = 0
        padded_slices = list()
        # Consecutive slices shrink by one entry each and are left-padded with a
        # growing number of zeros.
        for num_zeros, length in enumerate(range(num_action - 1, -1, -1), 1):
            padded = tf.pad(
                tensor=raw_entries[:, start: start + length],
                paddings=((0, 0), (num_zeros, 0))
            )
            padded_slices.append(padded)
            start += length

        l_matrix += tf.stack(values=padded_slices, axis=1)

    # P = LL^T
    p_matrix = tf.matmul(a=l_matrix, b=tf.transpose(a=l_matrix, perm=(0, 2, 1)))

    # A = -0.5 (a - mean)P(a - mean)
    flat_action = tf.reshape(tensor=action, shape=(-1, num_action))
    difference = flat_action - flat_mean
    advantage = tf.matmul(a=p_matrix, b=tf.expand_dims(input=difference, axis=2))
    advantage = tf.matmul(a=tf.expand_dims(input=difference, axis=1), b=advantage)
    advantage = tf.squeeze(input=(-advantage / 2.0), axis=2)

    # Q = A + V
    state_value = self.state_values[name].apply(x=embedding)
    q_value = state_value + advantage

    output_shape = (-1,) + self.actions_spec[name]['shape']
    return tf.reshape(tensor=q_value, shape=output_shape)