def reference(self, *, states, horizons, internals, auxiliaries, actions, policy):
    """Return an all-zero float reference vector.

    The result has one entry per element of the leading dimension of
    ``actions.value()``. Only ``actions`` is read; ``states``, ``horizons``,
    ``internals``, ``auxiliaries`` and ``policy`` are accepted but unused.

    NOTE: a value-based implementation is currently disabled. It selected
    ``policy.state_value`` / ``policy.state_values`` (``self.value == 'state'``)
    or ``policy.action_value`` / ``policy.action_values``
    (``self.value == 'action'``) depending on ``self.early_reduce``, and
    concatenated the per-action values along axis 1 in the non-reduced case.
    """
    leading_shape = tf.shape(input=actions.value())[:1]
    return tf_util.zeros(shape=leading_shape, dtype='float')
def core_act(self, *, states, internals, auxiliaries, parallel, deterministic, independent):
    """Build a constant action tensor for every entry of ``self.actions_spec``.

    For each action the value is chosen by the first rule that applies:

    1. a user-specified value from ``self.action_values``;
    2. with int action masking enabled, the first unmasked choice;
    3. for non-bool actions with ``min_value``: the midpoint of
       ``min_value``/``max_value`` if both are given, else ``min_value``;
    4. for non-bool actions with only ``max_value``: ``max_value``;
    5. otherwise zero.

    ``parallel``, ``deterministic`` and ``independent`` are not read here.

    Returns:
        A pair ``(actions, internals)`` where ``internals`` is an empty
        ``TensorDict`` (the method asserts that no internals are passed in).
    """
    assert len(internals) == 0

    actions = TensorDict()
    for name, spec in self.actions_spec.items():
        # Full output shape: leading dimension of the states, followed by the action shape.
        leading = tf_util.cast(x=tf.shape(input=states.value())[:1], dtype='int')
        trailing = tf_util.constant(value=spec.shape, dtype='int')
        full_shape = tf.concat(values=(leading, trailing), axis=0)

        if self.action_values is not None and name in self.action_values:
            # User-specified value takes precedence over everything else.
            fill_value = tf_util.constant(value=self.action_values[name], dtype=spec.type)
            actions[name] = tf.fill(dims=full_shape, value=fill_value)

        elif self.config.enable_int_action_masking and spec.type == 'int' and \
                spec.num_values is not None:
            # Masked int action: pick the first choice that the mask allows.
            mask = auxiliaries[name]['mask']
            # All choices 0..num_values-1 on a trailing axis, tiled over full_shape.
            choices = tf_util.constant(
                value=list(range(spec.num_values)), dtype='int',
                shape=(tuple(1 for _ in spec.shape) + (1, spec.num_values))
            )
            one = tf_util.constant(value=1, dtype='int', shape=(1,))
            tiling = tf.concat(values=(full_shape, one), axis=0)
            choices = tf.tile(input=choices, multiples=tiling)
            # Flat list of all valid choices, in order.
            valid_choices = tf.boolean_mask(tensor=choices, mask=mask)
            int_mask = tf_util.cast(x=mask, dtype='int')
            # Number of valid choices per action entry; the exclusive cumulative sum
            # gives the offset of each entry's first valid choice in valid_choices.
            valid_count = tf.math.reduce_sum(input_tensor=int_mask, axis=(spec.rank + 1))
            valid_count = tf.reshape(tensor=valid_count, shape=(-1,))
            first_valid = tf.math.cumsum(x=valid_count, axis=0, exclusive=True)
            chosen = tf.gather(params=valid_choices, indices=first_valid)
            actions[name] = tf.reshape(tensor=chosen, shape=full_shape)

        elif spec.type != 'bool' and spec.min_value is not None:
            if spec.max_value is None:
                # Only a lower bound: use it.
                bounded = spec.min_value
            else:
                # Both bounds: use the midpoint.
                bounded = spec.min_value + 0.5 * (spec.max_value - spec.min_value)
            fill_value = tf_util.constant(value=bounded, dtype=spec.type)
            actions[name] = tf.fill(dims=full_shape, value=fill_value)

        elif spec.type != 'bool' and spec.max_value is not None:
            # Only an upper bound: use it.
            fill_value = tf_util.constant(value=spec.max_value, dtype=spec.type)
            actions[name] = tf.fill(dims=full_shape, value=fill_value)

        else:
            # No information available: fall back to zero.
            actions[name] = tf_util.zeros(shape=full_shape, dtype=spec.type)

    return actions, TensorDict()
def reference(self, *, states, horizons, internals, auxiliaries, actions, policy):
    """Return an all-zero float reference vector.

    The vector has as many entries as the leading dimension of
    ``actions.value()``. All other arguments are accepted but unused.

    NOTE: a disabled earlier implementation returned
    ``policy.act(..., deterministic=True, independent=True)`` instead.
    """
    num_entries = tf.shape(input=actions.value())[0]
    return tf_util.zeros(shape=(num_entries,), dtype='float')
def no_sync():
    """Decrement ``self.next_sync`` by ``one`` and return zero deltas.

    One zero tensor (float, same shape as the variable) is produced per entry
    of the enclosing scope's ``variables``; each is created under a control
    dependency on the counter decrement.
    """
    counter_decremented = self.next_sync.assign_sub(delta=one, read_value=False)

    with tf.control_dependencies(control_inputs=(counter_decremented,)):
        return [
            tf_util.zeros(shape=tf_util.shape(x=variable), dtype='float')
            for variable in variables
        ]
def independent_act(self, *, states, internals=None, auxiliaries=None):
    """Compute actions via ``core_act`` in independent mode.

    Args:
        states: State inputs; the leading dimension of ``states.value()`` is
            used as the batch size for input assertions.
        internals: Optional internal inputs. May only be omitted when
            ``self.internals_spec`` is empty.
        auxiliaries: Optional auxiliary inputs (e.g. action masks). May only
            be omitted when ``self.auxiliaries_spec`` is empty.

    Returns:
        An ``OrderedDict``: the flattened ``actions``/``internals`` pair when
        the model has internals, otherwise just the actions.
    """
    if internals is None:
        assert len(self.internals_spec) == 0
        internals = TensorDict()
    if auxiliaries is None:
        assert len(self.auxiliaries_spec) == 0
        auxiliaries = TensorDict()

    true = tf_util.constant(value=True, dtype='bool')
    batch_size = tf_util.cast(x=tf.shape(input=states.value())[0], dtype='int')

    # Input assertions (only built when enabled in the config)
    assertions = list()
    if self.config.create_tf_assertions:
        checked_inputs = (
            (self.states_spec, states,
             'Agent.independent_act: invalid {issue} for {name} state input.'),
            (self.internals_spec, internals,
             'Agent.independent_act: invalid {issue} for {name} internal input.'),
            (self.auxiliaries_spec, auxiliaries,
             'Agent.independent_act: invalid {issue} for {name} input.'),
        )
        for input_spec, value, message in checked_inputs:
            assertions.extend(
                input_spec.tf_assert(x=value, batch_size=batch_size, message=message)
            )

        # Mask assertions: every int action needs at least one valid choice
        if self.config.enable_int_action_masking:
            for name, spec in self.actions_spec.items():
                if spec.type != 'int':
                    continue
                any_valid = tf.math.reduce_any(
                    input_tensor=auxiliaries[name]['mask'], axis=(spec.rank + 1)
                )
                assertions.append(tf.debugging.assert_equal(
                    x=tf.reduce_all(input_tensor=any_valid), y=true,
                    message="Agent.independent_act: at least one action has to be valid."
                ))

    with tf.control_dependencies(control_inputs=assertions):
        # Core act
        # NOTE(review): no `deterministic` argument is passed here - confirm that the
        # core_act implementation being dispatched to does not require one.
        parallel = tf_util.zeros(shape=(1,), dtype='int')
        actions, internals = self.core_act(
            states=states, internals=internals, auxiliaries=auxiliaries, parallel=parallel,
            independent=True
        )
        # Skip action assertions

        # SavedModel requires flattened output
        if len(self.internals_spec) > 0:
            return OrderedDict(TensorDict(actions=actions, internals=internals))
        return OrderedDict(actions)
def iterative_body(self, x, indices, remaining, current_x, current_internals):
    """Perform one step of the iterative temporal loop.

    Gathers the inputs at ``indices`` from ``x``, applies
    ``self.iterative_apply`` and advances the per-entry bookkeeping. Entries
    whose ``remaining`` counter is already zero keep their current internals
    and are not advanced.

    Returns:
        The updated loop variables
        ``(x, indices, remaining, next_x, next_internals)``.
    """
    batch_shape = tf_util.cast(x=tf.shape(input=current_x)[:1], dtype='int')
    zeros = tf_util.zeros(shape=batch_shape, dtype='int')
    ones = tf_util.ones(shape=batch_shape, dtype='int')
    batch_size = batch_shape[0]  # not read below; retained from the original

    # The incoming current_x only provides the batch shape; the step input is
    # re-gathered from x.
    current_x = tf.gather(params=x, indices=indices)
    next_x, next_internals = self.iterative_apply(x=current_x, internals=current_internals)

    with tf.control_dependencies(control_inputs=(current_x, next_x)):
        is_finished = tf.math.equal(x=remaining, y=zeros)

        def keep_if_finished(current, following):
            # Broadcast the per-entry flag over the trailing dimensions of the internal.
            flag = is_finished
            for _ in range(tf_util.rank(x=current) - 1):
                flag = tf.expand_dims(input=flag, axis=1)
            return tf.where(condition=flag, x=current, y=following)

        if isinstance(next_internals, dict):
            for name, current_internal, next_internal in current_internals.zip_items(
                    next_internals):
                next_internals[name] = keep_if_finished(current_internal, next_internal)
        else:
            next_internals = keep_if_finished(current_internals, next_internals)

        # Count down unfinished entries, then advance the index of every entry
        # that still has steps left after this one.
        remaining -= tf.where(condition=is_finished, x=zeros, y=ones)
        has_no_steps_left = tf.math.equal(x=remaining, y=zeros)
        indices += tf.where(condition=has_no_steps_left, x=zeros, y=ones)

    return x, indices, remaining, next_x, next_internals
def fn_terminal():
    """Build the grouped op that finalises an episode for the ``parallel`` slot.

    Resets the stored previous internals to their initial values, optionally
    writes episode-length/-reward summaries, zeroes the episode length and
    reward entries and increments ``self.episodes``.

    ``parallel``, ``expanded_parallel`` and ``one`` come from the enclosing
    scope.
    """
    operations = list()

    # Reset internals to their initial values
    def as_constant(spec, initial):
        return tf_util.constant(value=initial, dtype=spec.type)

    initials = self.internals_spec.fmap(
        function=as_constant, cls=TensorDict, zip_values=self.initial_internals
    )
    for name, previous, initial in self.previous_internals.zip_items(initials):
        reset_value = tf.tensor_scatter_nd_update(
            tensor=previous, indices=expanded_parallel,
            updates=tf.expand_dims(input=initial, axis=0)
        )
        operations.append(previous.assign(value=reset_value))
        # (disabled alternative: previous.scatter_update with
        # tf.IndexedSlices(values=initial, indices=parallel))

    # Episode length/reward summaries (before episode reward reset / episodes increment)
    dependencies = list()
    if self.summaries == 'all' or 'reward' in self.summaries:
        with self.summarizer.as_default():
            summarized = (
                ('episode-length', self.episode_length),
                ('episode-reward', self.episode_reward),
            )
            for tag, episode_stat in summarized:
                x = tf.gather(params=episode_stat, indices=parallel)
                dependencies.append(tf.summary.scalar(name=tag, data=x, step=self.episodes))

    with tf.control_dependencies(control_inputs=dependencies):
        # Reset episode length/reward
        # (disabled alternative: scatter_update with tf.IndexedSlices, as above)
        for episode_stat, dtype in ((self.episode_length, 'int'), (self.episode_reward, 'float')):
            zeros = tf_util.zeros(shape=(1,), dtype=dtype)
            reset_value = tf.tensor_scatter_nd_update(
                tensor=episode_stat, indices=expanded_parallel, updates=zeros
            )
            operations.append(episode_stat.assign(value=reset_value))

        # Increment episodes counter
        operations.append(self.episodes.assign_add(delta=one, read_value=False))

    return tf.group(*operations)
def variable(self, *, name, spec, initializer, is_trainable, is_saved, initialization_scale=None):
    """Validate the arguments and create a ``tf.Variable`` described by ``spec``.

    Args:
        name: Variable name (``str``).
        spec: Fully specified ``TensorSpec`` giving type and shape.
        initializer: Either an explicit value (Python scalar of the spec's
            type, ``np.ndarray`` or ``tf.Tensor`` matching the spec) or one of
            ``'constant'``, ``'normal'``, ``'normal-relu'``, ``'ones'``,
            ``'orthogonal'``, ``'orthogonal-relu'``, ``'zeros'``.
        is_trainable: Whether the variable is trainable; only float variables
            may be trainable.
        is_saved: Stored on the variable as the ``is_saved`` attribute.
        initialization_scale: Only accepted for the ``'constant'``,
            ``'orthogonal'`` and ``'orthogonal-relu'`` initializers: the fill
            value for ``'constant'``, a positive multiplier for the orthogonal
            ones.

    Returns:
        The created ``tf.Variable``.

    Raises:
        TensorforceError: If any argument fails validation.
    """
    assert self.is_initialized is False

    # name
    if not isinstance(name, str):
        raise TensorforceError.type(name='variable', argument='name', dtype=type(name))

    # spec
    if not isinstance(spec, TensorSpec):
        raise TensorforceError.dtype(name='variable', argument='spec', dtype=type(spec))
    if spec.is_underspecified():
        raise TensorforceError.value(
            name='variable', argument='spec', value=spec, hint='underspecified'
        )

    # initializer: either an explicit value or a known initializer name
    named_initializers = (
        'constant', 'normal', 'normal-relu', 'ones', 'orthogonal', 'orthogonal-relu', 'zeros'
    )
    is_explicit_value = isinstance(initializer, (spec.py_type(), np.ndarray, tf.Tensor))
    if not is_explicit_value and initializer not in named_initializers:
        raise TensorforceError.value(
            name='variable', argument='initializer', value=initializer
        )
    if isinstance(initializer, np.ndarray) and initializer.dtype != spec.np_type():
        raise TensorforceError.type(
            name='variable', argument='initializer', dtype=initializer.dtype
        )
    if isinstance(initializer, tf.Tensor) and \
            tf_util.dtype(x=initializer) != spec.tf_type():
        raise TensorforceError.type(
            name='variable', argument='initializer', dtype=tf_util.dtype(x=initializer)
        )

    # initialization_scale: only meaningful for constant/orthogonal initializers
    if initialization_scale is not None:
        if is_explicit_value or \
                initializer not in ('constant', 'orthogonal', 'orthogonal-relu'):
            raise TensorforceError.invalid(
                name='variable', argument='initialization_scale',
                condition='initializer not orthogonal'
            )
        if not isinstance(initialization_scale, spec.py_type()):
            raise TensorforceError.type(
                name='variable', argument='initialization_scale',
                dtype=type(initialization_scale), hint='!= float'
            )

    # is_trainable
    if not isinstance(is_trainable, bool):
        raise TensorforceError.type(
            name='variable', argument='is_trainable', dtype=type(is_trainable)
        )
    if is_trainable and spec.type != 'float':
        raise TensorforceError.value(
            name='variable', argument='is_trainable', value=is_trainable,
            condition='spec.type != float'
        )

    # is_saved
    if not isinstance(is_saved, bool):
        raise TensorforceError.type(name='variable', argument='is_saved', dtype=type(is_saved))

    # Variable initializer: turn the argument into a concrete initial value
    if isinstance(initializer, spec.py_type()):
        initializer = tf_util.constant(value=initializer, dtype=spec.type, shape=spec.shape)

    elif isinstance(initializer, np.ndarray):
        if initializer.shape != spec.shape:
            raise TensorforceError.mismatch(
                name='Module.variable', value1='shape', value2='initializer'
            )
        initializer = tf_util.constant(value=initializer, dtype=spec.type)

    elif isinstance(initializer, tf.Tensor):
        # A tensor is used as-is once its shape has been checked.
        if tf_util.shape(x=initializer) != spec.shape:
            raise TensorforceError.mismatch(
                name='Module.variable', value1='shape', value2='initializer'
            )

    elif not isinstance(initializer, str):
        raise TensorforceError("Invalid variable initializer: {}".format(initializer))

    elif initializer.startswith('normal'):
        if spec.type != 'float':
            raise TensorforceError(
                message="Invalid variable initializer value for non-float variable: {}."
                .format(initializer)
            )
        # Scale by the product of all but the last dimension; the non-relu
        # variant additionally adds the last dimension. Capped at 0.1.
        fan = util.product(xs=spec.shape[:-1])
        if not initializer.endswith('-relu'):
            fan = fan + spec.shape[-1]
        stddev = min(0.1, np.sqrt(2.0 / fan))
        initializer = tf.random.normal(shape=spec.shape, stddev=stddev, dtype=spec.tf_type())

    elif initializer.startswith('orthogonal'):
        if spec.type != 'float':
            raise TensorforceError(
                message="Invalid variable initializer value for non-float variable: {}."
                .format(initializer)
            )
        if spec.rank < 2:
            raise TensorforceError(
                message="Invalid variable initializer value for 0/1-rank variable: {}."
                .format(initializer)
            )
        # SVD of a random matrix flattened to 2D; take whichever factor has
        # the spec's last dimension as its column count.
        flat_shape = (util.product(xs=spec.shape[:-1]), spec.shape[-1])
        normal = np.random.normal(size=flat_shape)
        u, _, v = np.linalg.svd(a=normal, full_matrices=False)
        orthogonal = u if u.shape[1] == spec.shape[-1] else v
        if initializer.endswith('-relu'):
            orthogonal = orthogonal * np.sqrt(2.0)
        if initialization_scale is not None and initialization_scale != 1.0:
            if initialization_scale <= 0.0:
                raise TensorforceError.value(
                    name='variable', argument='initialization_scale',
                    value=initialization_scale, hint='<= 0.0'
                )
            orthogonal = orthogonal * initialization_scale
        initializer = tf_util.constant(value=orthogonal.reshape(spec.shape), dtype=spec.type)

    elif initializer == 'zeros':
        initializer = tf_util.zeros(shape=spec.shape, dtype=spec.type)

    elif initializer == 'ones':
        initializer = tf_util.ones(shape=spec.shape, dtype=spec.type)

    elif initializer == 'constant':
        # The constant value is taken from initialization_scale.
        fill_value = tf_util.constant(value=initialization_scale, dtype=spec.type)
        initializer = tf.fill(dims=spec.shape, value=fill_value)

    # Variable
    variable = tf.Variable(
        initial_value=initializer, trainable=is_trainable, validate_shape=True, name=name,
        dtype=spec.tf_type(), shape=spec.shape
    )
    variable.is_saved = is_saved

    return variable
def apply(self, *, x, horizons, internals):
    """Apply the layer over the per-entry time horizons given by ``horizons``.

    ``horizons[:, 0]`` is used as the start index into ``x`` and
    ``horizons[:, 1]`` as the number of steps for each entry; both are clipped
    to ``self.horizon + 1`` steps (the 0th step is included).

    Depending on ``self.temporal_processing``:

    * ``'cumulative'``: the inputs of all steps are stacked along axis 1 and
      passed to ``self.cumulative_apply``.
    * ``'iterative'``: ``self.iterative_body`` is run step by step in a
      ``tf.while_loop``, carrying the internals along.

    Changes relative to the previous version:

    * both while loops now pass ``maximum_iterations`` as int32 (the
      cumulative loop previously used int64, unlike the iterative loop);
    * the loop bookkeeping assertions are only created when a loop actually
      ran - with a constant zero horizon ``final_indices`` and
      ``final_remaining`` do not exist and the assertion block used to fail
      with an unbound-variable error.

    Returns:
        The output of ``super().apply`` for ``'cumulative'`` processing, or a
        pair ``(output, internals)`` for ``'iterative'`` processing.
    """
    zero = tf_util.constant(value=0, dtype='int')
    one = tf_util.constant(value=1, dtype='int')
    batch_size = tf_util.cast(x=tf.shape(input=horizons)[0], dtype='int')
    zeros = tf_util.zeros(shape=(batch_size,), dtype='int')
    ones = tf_util.ones(shape=(batch_size,), dtype='int')

    # including 0th step
    horizon = self.horizon.value() + one
    # in case of longer horizon than necessary (e.g. main vs baseline policy)
    surplus = tf.maximum(x=(horizons[:, 1] - horizon), y=zeros)
    starts = horizons[:, 0] + surplus
    lengths = horizons[:, 1] - surplus
    horizon = tf.minimum(x=horizon, y=tf.math.reduce_max(input_tensor=lengths, axis=0))
    output_spec = self.output_spec()

    # Only bound to tensors when a while loop runs; checked before building assertions.
    final_indices = None
    final_remaining = None

    if self.temporal_processing == 'cumulative':
        if self.horizon.is_constant(value=0):
            # NOTE(review): the keyword arguments (xs, lengths) match cumulative_apply,
            # while iterative_apply is called with (x, internals) elsewhere - confirm
            # which method is intended here.
            x = self.iterative_apply(xs=x, lengths=ones)

        else:
            def body(x, indices, remaining, xs):
                # Append the current step's inputs along the time axis (axis 1).
                current_x = tf.gather(params=x, indices=indices)
                current_x = tf.expand_dims(input=current_x, axis=1)
                xs = tf.concat(values=(xs, current_x), axis=1)
                # Count down unfinished entries, then advance those with steps left.
                remaining -= tf.where(
                    condition=tf.math.equal(x=remaining, y=zeros), x=zeros, y=ones
                )
                indices += tf.where(
                    condition=tf.math.equal(x=remaining, y=zeros), x=zeros, y=ones
                )
                return x, indices, remaining, xs

            # Start with zero time steps; the loop body concatenates one step per iteration.
            initial_xs = tf_util.zeros(
                shape=((batch_size, 0) + output_spec.shape), dtype=output_spec.type
            )

            _, final_indices, final_remaining, xs = tf.while_loop(
                cond=tf_util.always_true, body=body,
                loop_vars=(x, starts, lengths, initial_xs),
                maximum_iterations=tf_util.int32(x=horizon)
            )

            x = self.cumulative_apply(xs=xs, lengths=lengths)

    elif self.temporal_processing == 'iterative':
        if self.horizon.is_constant(value=0):
            # NOTE(review): final_internals is not propagated to the returned
            # `internals` on this path - confirm this is intended.
            x, final_internals = self.iterative_apply(x=x, internals=internals)

        else:
            initial_x = tf_util.zeros(
                shape=((batch_size,) + output_spec.shape), dtype=output_spec.type
            )

            # The while loop works on positional loop variables, so the internals are
            # converted to args before the loop and back to kwargs afterwards.
            signature = self.input_signature(function='iterative_body')
            internals = signature['current_internals'].kwargs_to_args(kwargs=internals)

            _, final_indices, final_remaining, x, final_internals = tf.while_loop(
                cond=tf_util.always_true, body=self.iterative_body,
                loop_vars=(x, starts, lengths, initial_x, internals),
                maximum_iterations=tf_util.int32(x=horizon)
            )

            internals = signature['current_internals'].args_to_kwargs(args=final_internals)

    assertions = list()
    if self.config.create_tf_assertions and final_indices is not None:
        # After the loop, every entry must sit on its last index and have no steps left.
        assertions.append(tf.debugging.assert_equal(
            x=final_indices, y=(tf.math.cumsum(x=lengths) - ones)
        ))
        assertions.append(tf.debugging.assert_equal(
            x=tf.math.reduce_sum(input_tensor=final_remaining), y=zero
        ))

    with tf.control_dependencies(control_inputs=assertions):
        if self.temporal_processing == 'cumulative':
            return tf_util.identity(input=super().apply(x=x))
        elif self.temporal_processing == 'iterative':
            return tf_util.identity(input=super().apply(x=x)), internals