def state_value(self, *, states, horizons, internals, auxiliaries):
    """Return the state value V(s).

    Only the 'separate' mode has a dedicated state-value head (``self.s_value``);
    every other mode is delegated to the superclass implementation.
    """
    # Guard clause: without a dedicated state-value head, defer to the parent.
    if self.state_value_mode != 'separate':
        return super().state_value(
            states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries
        )

    # Deterministic forward pass through the shared network (no exploration noise).
    is_deterministic = tf_util.constant(value=True, dtype='bool')
    net_output, _ = self.network.apply(
        x=states, horizons=horizons, internals=internals,
        deterministic=is_deterministic, independent=True
    )

    # Normalize a bare tensor output into a TensorDict keyed by 'embedding'.
    if not isinstance(net_output, TensorDict):
        net_output = TensorDict(embedding=net_output)

    # Prefer a dedicated 'state-embedding' entry, falling back to the generic one.
    head_input = net_output.get('state-embedding', net_output['embedding'])
    return self.s_value.apply(x=head_input)
def state_values(self, *, states, horizons, internals, auxiliaries):
    """Return per-action state values as a TensorDict, one entry per action.

    Depending on ``self.state_value_mode``:
      - 'implicit':  V is the max over the Q-head outputs (dueling-free).
      - 'separate':  V comes from a single shared state-value head, the
        advantage head only shifts action values (dueling decomposition).
      - 'separate-per-action':  each action has its own state-value head.

    Args:
        states/horizons/internals: network inputs for a deterministic pass.
        auxiliaries: per-action auxiliary tensors (e.g. int action masks).
    """
    deterministic = tf_util.constant(value=True, dtype='bool')
    embedding, _ = self.network.apply(
        x=states, horizons=horizons, internals=internals, deterministic=deterministic,
        independent=True
    )
    # Normalize a bare tensor into a TensorDict keyed by 'embedding'.
    if not isinstance(embedding, TensorDict):
        embedding = TensorDict(embedding=embedding)

    if self.state_value_mode == 'implicit':

        def function(name, spec, a_value):
            # Per-action embedding if present, otherwise the shared embedding.
            if name is None:
                x = embedding.get('action-embedding', embedding['embedding'])
            else:
                x = embedding.get(name + '-embedding', embedding['embedding'])
            action_value = a_value.apply(x=x)
            # Restore (batch,) + action shape + num-choices layout.
            if spec.type == 'bool':
                shape = (-1,) + spec.shape + (2,)
            elif spec.type == 'int':
                shape = (-1,) + spec.shape + (spec.num_values,)
            action_value = tf.reshape(tensor=action_value, shape=shape)
            if spec.type == 'bool':
                # V(s) = max over the two boolean choices.
                return tf.math.maximum(x=action_value[..., 0], y=action_value[..., 1])
            elif spec.type == 'int':
                if self.config.enable_int_action_masking:
                    # Exclude invalid actions from the max by filling with dtype min.
                    mask = auxiliaries[name]['mask']
                    min_float = tf_util.get_dtype(type='float').min
                    min_float = tf.fill(dims=tf.shape(input=action_value), value=min_float)
                    action_value = tf.where(condition=mask, x=action_value, y=min_float)
                return tf.math.reduce_max(input_tensor=action_value, axis=-1)

        return self.actions_spec.fmap(
            function=function, cls=TensorDict, with_names=True, zip_values=(self.a_values,)
        )

    elif self.state_value_mode == 'separate':
        # Single shared state-value head; advantages are mean-centered per action.
        state_value = self.s_value.apply(
            x=embedding.get('state-embedding', embedding['embedding'])
        )

        def function(name, spec, a_value):
            if name is None:
                x = embedding.get('action-embedding', embedding['embedding'])
            else:
                x = embedding.get(name + '-embedding', embedding['embedding'])
            advantage_value = a_value.apply(x=x)
            if spec.type == 'bool':
                shape = (-1,) + spec.shape + (2,)
            elif spec.type == 'int':
                shape = (-1,) + spec.shape + (spec.num_values,)
            advantage_value = tf.reshape(tensor=advantage_value, shape=shape)
            # Dueling: subtract the per-state mean advantage for identifiability.
            mean = tf.math.reduce_mean(input_tensor=advantage_value, axis=-1, keepdims=True)
            # Broadcast the scalar state value across action-shape + choice axes.
            shape = (-1,) + tuple(1 for _ in range(spec.rank + 1))
            _state_value = tf.reshape(tensor=state_value, shape=shape)
            action_value = _state_value + (advantage_value - mean)
            if spec.type == 'bool':
                return tf.math.maximum(x=action_value[..., 0], y=action_value[..., 1])
            elif spec.type == 'int':
                if self.config.enable_int_action_masking:
                    mask = auxiliaries[name]['mask']
                    min_float = tf_util.get_dtype(type='float').min
                    min_float = tf.fill(dims=tf.shape(input=action_value), value=min_float)
                    action_value = tf.where(condition=mask, x=action_value, y=min_float)
                return tf.math.reduce_max(input_tensor=action_value, axis=-1)

        return self.actions_spec.fmap(
            function=function, cls=TensorDict, with_names=True, zip_values=(self.a_values,)
        )

    elif self.state_value_mode == 'separate-per-action':

        def function(name, spec, s_value):
            if name is None:
                x = embedding.get('state-embedding', embedding['embedding'])
            else:
                x = embedding.get(name + '-state-embedding', embedding['embedding'])
            state_value = s_value.apply(x=x)
            # No trailing choice axis here: one value per action component.
            if spec.type == 'bool':
                shape = (-1,) + spec.shape
            elif spec.type == 'int':
                shape = (-1,) + spec.shape
            return tf.reshape(tensor=state_value, shape=shape)

        # FIX: with_names=True was missing; function() consumes `name` as its
        # first argument (consistent with every other fmap call in this file),
        # so fmap must be told to pass action names through.
        return self.actions_spec.fmap(
            function=function, cls=TensorDict, with_names=True, zip_values=(self.s_values,)
        )
def action_values(self, *, states, horizons, internals, auxiliaries, actions):
    """Return Q(s, a) for the given actions, as a TensorDict per action.

    The given ``actions`` select one entry along the trailing choice axis of
    each action-value tensor; the three ``state_value_mode`` branches differ
    only in how that tensor is assembled (implicit Q-head, shared dueling
    state value, or per-action dueling state value).
    """
    # Deterministic forward pass (no exploration) through the shared network.
    deterministic = tf_util.constant(value=True, dtype='bool')
    embedding, _ = self.network.apply(
        x=states, horizons=horizons, internals=internals, deterministic=deterministic,
        independent=True
    )
    # Normalize a bare tensor output into a TensorDict keyed by 'embedding'.
    if not isinstance(embedding, TensorDict):
        embedding = TensorDict(embedding=embedding)

    if self.state_value_mode == 'implicit':

        def function(name, spec, a_value, action):
            # Per-action embedding if present, otherwise the shared embedding.
            if name is None:
                x = embedding.get('action-embedding', embedding['embedding'])
            else:
                x = embedding.get(name + '-embedding', embedding['embedding'])
            action_value = a_value.apply(x=x)
            # Restore (batch,) + action shape + num-choices layout.
            if spec.type == 'bool':
                shape = (-1,) + spec.shape + (2,)
            elif spec.type == 'int':
                shape = (-1,) + spec.shape + (spec.num_values,)
            action_value = tf.reshape(tensor=action_value, shape=shape)
            if spec.type == 'bool':
                # Index 0 holds the "true" value, index 1 the "false" value.
                return tf.where(
                    condition=action, x=action_value[..., 0], y=action_value[..., 1]
                )
            elif spec.type == 'int':
                # Select the chosen action along the last axis; expand/squeeze
                # at spec.rank + 1 keeps gather's batch_dims aligned.
                action = tf.expand_dims(input=action, axis=(spec.rank + 1))
                action_value = tf.gather(
                    params=action_value, indices=action, batch_dims=(spec.rank + 1)
                )
                return tf.squeeze(input=action_value, axis=(spec.rank + 1))

        return self.actions_spec.fmap(
            function=function, cls=TensorDict, with_names=True,
            zip_values=(self.a_values, actions)
        )

    elif self.state_value_mode == 'separate':
        # Single shared state-value head (dueling decomposition).
        state_value = self.s_value.apply(
            x=embedding.get('state-embedding', embedding['embedding'])
        )

        def function(name, spec, a_value, action):
            if name is None:
                x = embedding.get('action-embedding', embedding['embedding'])
            else:
                x = embedding.get(name + '-embedding', embedding['embedding'])
            advantage_value = a_value.apply(x=x)
            if spec.type == 'bool':
                shape = (-1,) + spec.shape + (2,)
            elif spec.type == 'int':
                shape = (-1,) + spec.shape + (spec.num_values,)
            advantage_value = tf.reshape(tensor=advantage_value, shape=shape)
            # Dueling: mean-center advantages, then add the broadcast state value.
            mean = tf.math.reduce_mean(input_tensor=advantage_value, axis=-1, keepdims=True)
            shape = (-1,) + tuple(1 for _ in range(spec.rank + 1))
            _state_value = tf.reshape(tensor=state_value, shape=shape)
            action_value = _state_value + (advantage_value - mean)
            if spec.type == 'bool':
                return tf.where(
                    condition=action, x=action_value[..., 0], y=action_value[..., 1]
                )
            elif spec.type == 'int':
                action = tf.expand_dims(input=action, axis=(spec.rank + 1))
                action_value = tf.gather(
                    params=action_value, indices=action, batch_dims=(spec.rank + 1)
                )
                return tf.squeeze(input=action_value, axis=(spec.rank + 1))

        return self.actions_spec.fmap(
            function=function, cls=TensorDict, with_names=True,
            zip_values=(self.a_values, actions)
        )

    elif self.state_value_mode == 'separate-per-action':

        def function(name, spec, s_value, a_value, action):
            # Each action has its own state-value AND advantage head.
            if name is None:
                state_value = s_value.apply(
                    x=embedding.get('state-embedding', embedding['embedding'])
                )
                advantage_value = a_value.apply(
                    x=embedding.get('action-embedding', embedding['embedding'])
                )
            else:
                state_value = s_value.apply(
                    x=embedding.get(name + '-state-embedding', embedding['embedding'])
                )
                advantage_value = a_value.apply(
                    x=embedding.get(name + '-embedding', embedding['embedding'])
                )
            if spec.type == 'bool':
                shape = (-1,) + spec.shape + (2,)
            elif spec.type == 'int':
                shape = (-1,) + spec.shape + (spec.num_values,)
            advantage_value = tf.reshape(tensor=advantage_value, shape=shape)
            mean = tf.math.reduce_mean(input_tensor=advantage_value, axis=-1, keepdims=True)
            # State value gains a trailing choice axis to broadcast against advantages.
            action_value = tf.expand_dims(input=state_value, axis=-1) + (advantage_value - mean)
            if spec.type == 'bool':
                return tf.where(
                    condition=action, x=action_value[..., 0], y=action_value[..., 1]
                )
            elif spec.type == 'int':
                action = tf.expand_dims(input=action, axis=(spec.rank + 1))
                action_value = tf.gather(
                    params=action_value, indices=action, batch_dims=(spec.rank + 1)
                )
                return tf.squeeze(input=action_value, axis=(spec.rank + 1))

        return self.actions_spec.fmap(
            function=function, cls=TensorDict, with_names=True,
            zip_values=(self.s_values, self.a_values, actions)
        )
def act(self, *, states, horizons, internals, auxiliaries, deterministic, independent):
    """Greedily select actions from the action values.

    First builds the full per-choice action-value tensors (per
    ``state_value_mode``), then picks the argmax choice per action, attaching
    summary/tracking ops as control dependencies. Returns (actions, internals).
    """
    embedding, internals = self.network.apply(
        x=states, horizons=horizons, internals=internals, deterministic=deterministic,
        independent=independent
    )
    # Normalize a bare tensor output into a TensorDict keyed by 'embedding'.
    if not isinstance(embedding, TensorDict):
        embedding = TensorDict(embedding=embedding)

    if self.state_value_mode == 'implicit':

        def function(name, spec, a_value):
            # Per-action embedding if present, otherwise the shared embedding.
            if name is None:
                x = embedding.get('action-embedding', embedding['embedding'])
            else:
                x = embedding.get(name + '-embedding', embedding['embedding'])
            action_value = a_value.apply(x=x)
            # Restore (batch,) + action shape + num-choices layout.
            if spec.type == 'bool':
                shape = (-1,) + spec.shape + (2,)
            elif spec.type == 'int':
                shape = (-1,) + spec.shape + (spec.num_values,)
            return tf.reshape(tensor=action_value, shape=shape)

        action_values = self.actions_spec.fmap(
            function=function, cls=TensorDict, with_names=True, zip_values=(self.a_values,)
        )

    elif self.state_value_mode == 'separate':
        # Single shared state-value head (dueling decomposition).
        state_value = self.s_value.apply(
            x=embedding.get('state-embedding', embedding['embedding'])
        )

        def function(name, spec, a_value):
            if name is None:
                x = embedding.get('action-embedding', embedding['embedding'])
            else:
                x = embedding.get(name + '-embedding', embedding['embedding'])
            advantage_value = a_value.apply(x=x)
            if spec.type == 'bool':
                shape = (-1,) + spec.shape + (2,)
            elif spec.type == 'int':
                shape = (-1,) + spec.shape + (spec.num_values,)
            advantage_value = tf.reshape(tensor=advantage_value, shape=shape)
            # Dueling: mean-center advantages, then add the broadcast state value.
            mean = tf.math.reduce_mean(input_tensor=advantage_value, axis=-1, keepdims=True)
            shape = (-1,) + tuple(1 for _ in range(spec.rank + 1))
            _state_value = tf.reshape(tensor=state_value, shape=shape)
            return _state_value + (advantage_value - mean)

        action_values = self.actions_spec.fmap(
            function=function, cls=TensorDict, with_names=True, zip_values=(self.a_values,)
        )

    elif self.state_value_mode == 'separate-per-action':

        def function(name, spec, s_value, a_value):
            # Each action has its own state-value AND advantage head.
            if name is None:
                state_value = s_value.apply(
                    x=embedding.get('state-embedding', embedding['embedding'])
                )
                advantage_value = a_value.apply(
                    x=embedding.get('action-embedding', embedding['embedding'])
                )
            else:
                state_value = s_value.apply(
                    x=embedding.get(name + '-state-embedding', embedding['embedding'])
                )
                advantage_value = a_value.apply(
                    x=embedding.get(name + '-embedding', embedding['embedding'])
                )
            if spec.type == 'bool':
                shape = (-1,) + spec.shape + (2,)
            elif spec.type == 'int':
                shape = (-1,) + spec.shape + (spec.num_values,)
            advantage_value = tf.reshape(tensor=advantage_value, shape=shape)
            mean = tf.math.reduce_mean(input_tensor=advantage_value, axis=-1, keepdims=True)
            # State value gains a trailing choice axis to broadcast against advantages.
            return tf.expand_dims(input=state_value, axis=-1) + (advantage_value - mean)

        action_values = self.actions_spec.fmap(
            function=function, cls=TensorDict, with_names=True,
            zip_values=(self.s_values, self.a_values)
        )

    # Second pass: greedy selection per action, with summaries/tracking as
    # control dependencies of the returned action tensor.
    def function(name, spec, action_value):
        if spec.type == 'bool':

            def fn_summary():
                # Mean action value per choice, averaged over batch + action dims.
                axis = range(spec.rank + 1)
                values = tf.math.reduce_mean(input_tensor=action_value, axis=axis)
                return [values[0], values[1]]

            if name is None:
                names = ['action-values/true', 'action-values/false']
            else:
                names = ['action-values/' + name + '-true', 'action-values/' + name + '-false']
            dependencies = self.summary(
                label='action-value', name=names, data=fn_summary, step='timesteps'
            )

            def fn_tracking():
                return tf.math.reduce_mean(input_tensor=action_value, axis=0)

            if name is None:
                n = 'action-values'
            else:
                n = name + '-values'
            # NOTE(review): this rebind replaces the summary dependencies with
            # the tracking dependencies, so the summary ops above are not part
            # of the control_dependencies below — confirm this is intended.
            dependencies = self.track(label='action-value', name=n, data=fn_tracking)
            with tf.control_dependencies(control_inputs=dependencies):
                # Greedy bool action: choose True when its value is larger.
                return (action_value[..., 0] > action_value[..., 1])

        elif spec.type == 'int':

            def fn_summary():
                axis = range(spec.rank + 1)
                values = tf.math.reduce_mean(input_tensor=action_value, axis=axis)
                return [values[n] for n in range(spec.num_values)]

            if name is None:
                prefix = 'action-values/action'
            else:
                prefix = 'action-values/' + name + '-action'
            names = [prefix + str(n) for n in range(spec.num_values)]
            dependencies = self.summary(
                label='action-value', name=names, data=fn_summary, step='timesteps'
            )

            def fn_tracking():
                return tf.math.reduce_mean(input_tensor=action_value, axis=0)

            if name is None:
                n = 'action-values'
            else:
                n = name + '-values'
            # NOTE(review): same rebind as in the bool branch — summary deps
            # are replaced by tracking deps; confirm intended.
            dependencies = self.track(label='action-value', name=n, data=fn_tracking)
            with tf.control_dependencies(control_inputs=dependencies):
                if self.config.enable_int_action_masking:
                    # Exclude invalid actions from the argmax via dtype-min fill.
                    mask = auxiliaries[name]['mask']
                    min_float = tf_util.get_dtype(type='float').min
                    min_float = tf.fill(dims=tf.shape(input=action_value), value=min_float)
                    action_value = tf.where(condition=mask, x=action_value, y=min_float)
                return tf.math.argmax(input=action_value, axis=-1, output_type=spec.tf_type())

    actions = self.actions_spec.fmap(
        function=function, cls=TensorDict, with_names=True, zip_values=(action_values,)
    )

    return actions, internals