def tf_sample(self, parameters, deterministic):
    """
    Produces an action from the distribution described by `parameters`.

    Args:
        parameters: Sequence unpacked as (mean, stddev, <ignored third entry>).
        deterministic: Condition passed to `tf.where`; where true the mean is
            used as action, otherwise a random sample around the mean.

    Returns:
        The action tensor, clipped to the action spec's 'min_value'/'max_value'
        if the spec contains 'min_value'.
    """
    float_dtype = util.tf_dtype(dtype='float')
    mean, stddev, _ = parameters

    # Stochastic candidate: standard normal noise scaled by stddev, shifted by mean
    noise = tf.random.normal(shape=tf.shape(input=mean), dtype=float_dtype)
    stochastic = mean + stddev * noise

    # Deterministic candidate is the mean itself
    action = tf.where(condition=deterministic, x=mean, y=stochastic)

    # Unbounded action: nothing to clip
    if 'min_value' not in self.action_spec:
        return action

    lower = tf.constant(value=self.action_spec['min_value'], dtype=float_dtype)
    upper = tf.constant(value=self.action_spec['max_value'], dtype=float_dtype)
    return tf.clip_by_value(t=action, clip_value_min=lower, clip_value_max=upper)
def tf_parametrize(self, x, mask):
    """
    Computes the distribution parameters for input `x`.

    Args:
        x: Input passed to the `deviations` (and, if present, `value`) layer.
        mask: Condition for `tf.where`; entries where it is false are replaced
            by the minimum float value before the softmax.

    Returns:
        Tuple (logits, probabilities, states_value, action_values), where
        logits are the log of the probabilities clamped below at epsilon.
    """
    float_dtype = util.tf_dtype(dtype='float')
    epsilon = tf.constant(value=util.epsilon, dtype=float_dtype)

    action_shape = self.action_spec['shape']
    values_shape = (-1,) + action_shape + (self.action_spec['num_values'],)
    state_value_shape = (-1,) + action_shape + (1,)

    # Deviations, reshaped to one entry per action value
    action_values = tf.reshape(tensor=self.deviations.apply(x=x), shape=values_shape)

    # Filler used for masked-out entries
    min_float = tf.fill(dims=tf.shape(input=action_values), value=float_dtype.min)

    if self.value is not None:
        # Separate state-value layer: add it to the mean-centered deviations
        states_value = self.value.apply(x=x)
        if len(self.embedding_shape) == 1:
            states_value = tf.reshape(tensor=states_value, shape=state_value_shape)
        mean_action_value = tf.math.reduce_mean(
            input_tensor=action_values, axis=-1, keepdims=True
        )
        action_values = states_value + action_values - mean_action_value
        states_value = tf.squeeze(input=states_value, axis=-1)
    else:
        # No state-value layer: state value is the log-sum-exp of the masked values
        action_values = tf.where(condition=mask, x=action_values, y=min_float)
        states_value = tf.reduce_logsumexp(input_tensor=action_values, axis=-1)

    action_values = tf.where(condition=mask, x=action_values, y=min_float)

    # Softmax probabilities and their ("normalized") log
    probabilities = tf.nn.softmax(logits=action_values, axis=-1)
    logits = tf.math.log(x=tf.maximum(x=probabilities, y=epsilon))

    Module.update_tensor(name=(self.name + '-probabilities'), tensor=probabilities)

    return logits, probabilities, states_value, action_values
def apply_update():
    """
    Updates the moving mean and variance from the current input.

    Reads `self` and `x` from the enclosing scope. On the first call (per
    `self.after_first_call`) the batch statistics are taken as-is; afterwards
    they are blended with the stored moving statistics using the decay raised
    to the power of the size of the first dimension of `x`.

    Returns:
        Tuple (mean, variance): the results of assigning the new values to
        `self.moving_mean` and `self.moving_variance`.
    """
    float_dtype = util.tf_dtype(dtype='float')
    one = tf.constant(value=1.0, dtype=float_dtype)

    # Configured axes are shifted by one to skip the leading dimension
    reduce_axes = tuple(axis + 1 for axis in self.axes)

    base_decay = self.decay.value()
    num_instances = tf.dtypes.cast(x=tf.shape(input=x)[0], dtype=float_dtype)
    decay = tf.math.pow(x=base_decay, y=num_instances)

    # Mean: blended with the stored moving mean once initialized
    batch_mean = tf.math.reduce_mean(input_tensor=x, axis=reduce_axes, keepdims=True)
    blended_mean = decay * self.moving_mean + (one - decay) * batch_mean
    new_mean = tf.where(condition=self.after_first_call, x=blended_mean, y=batch_mean)

    # Variance: measured around the (possibly blended) new mean
    squared_diff = tf.math.squared_difference(x=x, y=new_mean)
    batch_variance = tf.reduce_mean(
        input_tensor=squared_diff, axis=reduce_axes, keepdims=True
    )
    blended_variance = decay * self.moving_variance + (one - decay) * batch_variance
    new_variance = tf.where(
        condition=self.after_first_call, x=blended_variance, y=batch_variance
    )

    # Flip the first-call flag only after both statistics have been computed
    with tf.control_dependencies(control_inputs=(new_mean, new_variance)):
        mark_called = self.after_first_call.assign(
            value=tf.constant(value=True, dtype=util.tf_dtype(dtype='bool')),
            read_value=False
        )

    # Store the statistics only after the flag has been set
    with tf.control_dependencies(control_inputs=(mark_called,)):
        stored_variance = self.moving_variance.assign(value=new_variance)
        stored_mean = self.moving_mean.assign(value=new_mean)

    return stored_mean, stored_variance
def tf_retrieve_timesteps(self, n):
    """
    Retrieves `n` randomly sampled timesteps from memory.

    Indices are drawn uniformly from the timesteps between the start of the
    oldest episode (one past `self.terminal_indices[0]`) and the current
    `self.memory_index`, counted backwards from the most recent timestep and
    wrapped around `self.capacity`.

    Args:
        n: Number of timesteps to retrieve; asserted to be at most the number
            of available timesteps.

    Returns:
        The result of `self.retrieve_indices` for the sampled indices.
    """
    long_dtype = util.tf_dtype(dtype='long')
    one = tf.constant(value=1, dtype=long_dtype)
    capacity = tf.constant(value=self.capacity, dtype=long_dtype)

    # Start index of oldest episode
    oldest_episode_start = self.terminal_indices[0] + one

    # Number of timesteps (minus/plus one to prevent zero but allow capacity)
    num_timesteps = self.memory_index - oldest_episode_start - one
    num_timesteps = tf.math.floormod(x=num_timesteps, y=capacity) + one

    # Check whether memory contains enough timesteps
    assertion = tf.debugging.assert_less_equal(x=n, y=num_timesteps)

    # Randomly sampled timestep indices
    with tf.control_dependencies(control_inputs=(assertion,)):
        indices = tf.random.uniform(shape=(n,), maxval=num_timesteps, dtype=long_dtype)
        indices = tf.math.floormod(x=(self.memory_index - one - indices), y=capacity)

    # Retrieve timestep indices
    timesteps = self.retrieve_indices(indices=indices)

    return timesteps
def tf_retrieve_timesteps(self, n, past_padding, future_padding):
    """
    Returns the indices of the `n` most recent timesteps, shifted back by
    `future_padding` and wrapped around `self.capacity`.

    Args:
        n: Number of timestep indices to retrieve; asserted to be at most the
            number of stored timesteps minus both paddings.
        past_padding: Number of timesteps reserved before the retrieved range.
        future_padding: Number of timesteps reserved after the retrieved range.

    Returns:
        Tensor of `n` timestep indices.
    """
    capacity = tf.constant(value=self.capacity, dtype=util.tf_dtype(dtype='long'))

    # Check whether memory contains enough timesteps
    num_timesteps = tf.minimum(x=self.buffer_index, y=capacity) - past_padding - future_padding
    assertion = tf.debugging.assert_less_equal(x=n, y=num_timesteps)

    # Most recent timestep indices range
    # TODO: assertions in memory as warning
    with tf.control_dependencies(control_inputs=(assertion,)):
        indices = tf.range(start=(self.buffer_index - n), limit=self.buffer_index)
        indices = tf.math.floormod(x=(indices - future_padding), y=capacity)

    return indices
def tf_step(self, x, conjugate, residual, squared_residual):
    """
    Performs one iteration of the conjugate gradient loop.

    Args:
        x: Current solution estimate $x_t$ (list of tensors).
        conjugate: Current conjugate $c_t$ (list of tensors).
        residual: Current residual $r_t$ (list of tensors).
        squared_residual: Current squared residual $r_t^2$.

    Returns:
        Tuple (next_x, next_conjugate, next_residual, next_squared_residual)
        to be used as arguments of the following iteration.
    """
    float_dtype = util.tf_dtype(dtype='float')

    def inner_product(left, right):
        # Sum of element-wise products over all tensor pairs
        return tf.add_n(
            inputs=[tf.reduce_sum(input_tensor=(l * r)) for l, r in zip(left, right)]
        )

    # Ac := A * c_t
    matvec = self.fn_x(conjugate)

    # TODO: reference?
    damping = self.damping.value()

    def undamped():
        return matvec

    def damped():
        return [Ac + damping * c for Ac, c in zip(matvec, conjugate)]

    # Damping term is only added if the damping factor is non-zero
    zero = tf.constant(value=0.0, dtype=float_dtype)
    damping_is_zero = tf.math.equal(x=damping, y=zero)
    matvec = self.cond(pred=damping_is_zero, true_fn=undamped, false_fn=damped)

    # cAc := c_t^T * Ac
    curvature = inner_product(conjugate, matvec)

    # \alpha := r_t^2 / cAc (denominator clamped below at epsilon)
    epsilon = tf.constant(value=util.epsilon, dtype=float_dtype)
    alpha = squared_residual / tf.maximum(x=curvature, y=epsilon)

    # x_{t+1} := x_t + \alpha * c_t
    next_x = [estimate + alpha * c for estimate, c in zip(x, conjugate)]

    # r_{t+1} := r_t - \alpha * Ac
    next_residual = [r - alpha * Ac for r, Ac in zip(residual, matvec)]

    # r_{t+1}^2 := r_{t+1}^T * r_{t+1}
    next_squared_residual = inner_product(next_residual, next_residual)

    # \beta = r_{t+1}^2 / r_t^2 (denominator clamped below at epsilon)
    beta = next_squared_residual / tf.maximum(x=squared_residual, y=epsilon)

    # c_{t+1} := r_{t+1} + \beta * c_t
    next_conjugate = [r + beta * c for r, c in zip(next_residual, conjugate)]

    return next_x, next_conjugate, next_residual, next_squared_residual
def create_tf_operations(self, config):
    """
    Sets up the generic TensorFlow placeholders and the optimizer for a model.

    Creates placeholders for every state and action in the configuration, for
    reward, terminal and the deterministic flag, and instantiates the
    configured optimizer (or sets `self.optimizer` to None).

    Args:
        config: Model configuration, which must contain entries for states
            and actions.

    Raises:
        TensorForceError: If an action is continuous/discrete but the model
            class does not allow that kind of action.
    """
    self.action_taken = {}
    self.internal_inputs = []
    self.internal_outputs = []
    self.internal_inits = []

    model_class = self.__class__

    # Placeholders
    with tf.variable_scope('placeholder'):

        # States: one placeholder per state, with a leading batch dimension
        self.state = {}
        for name, state in config.states.items():
            state_shape = (None,) + tuple(state.shape)
            self.state[name] = tf.placeholder(
                dtype=util.tf_dtype(state.type), shape=state_shape, name=name
            )

        # Actions: float placeholder if continuous, int placeholder otherwise
        self.action = {}
        self.discrete_actions = []
        self.continuous_actions = []
        for name, action in config.actions:
            if action.continuous:
                if not model_class.allows_continuous_actions:
                    raise TensorForceError("Error: Model does not support continuous actions.")
                action_type = 'float'
            else:
                if not model_class.allows_discrete_actions:
                    raise TensorForceError("Error: Model does not support discrete actions.")
                action_type = 'int'
            self.action[name] = tf.placeholder(
                dtype=util.tf_dtype(action_type), shape=(None,), name=name
            )

        # Reward & terminal
        self.reward = tf.placeholder(dtype=tf.float32, shape=(None,), name='reward')
        self.terminal = tf.placeholder(dtype=tf.bool, shape=(None,), name='terminal')

        # Deterministic action flag
        self.deterministic = tf.placeholder(dtype=tf.bool, shape=(), name='deterministic')

    # Optimizer
    if config.optimizer is None:
        self.optimizer = None
    else:
        learning_rate = config.learning_rate
        with tf.variable_scope('optimization'):
            optimizer_fn = util.function(config.optimizer, optimizers)
            extra_args = config.optimizer_args or ()
            extra_kwargs = config.optimizer_kwargs or {}
            self.optimizer = optimizer_fn(learning_rate, *extra_args, **extra_kwargs)