Example #1
    def tf_parametrize(self, x):
        log_epsilon = tf.constant(value=log(util.epsilon), dtype=util.tf_dtype(dtype='float'))
        shape = (-1,) + self.action_spec['shape']

        # Mean
        mean = self.mean.apply(x=x)
        mean = tf.reshape(tensor=mean, shape=shape)

        # Log standard deviation
        log_stddev = self.log_stddev.apply(x=x)
        log_stddev = tf.reshape(tensor=log_stddev, shape=shape)

        # Clip log_stddev for numerical stability
        # epsilon < 1.0, hence negative
        log_stddev = tf.clip_by_value(
            t=log_stddev, clip_value_min=log_epsilon, clip_value_max=-log_epsilon
        )

        # Standard deviation
        stddev = tf.exp(x=log_stddev)

        Module.update_tensor(name=(self.name + '-mean'), tensor=mean)
        Module.update_tensor(name=(self.name + '-stddev'), tensor=stddev)
        mean, log_stddev = self.add_summary(
            label=('distributions', 'gaussian'), name='mean', tensor=mean,
            pass_tensors=(mean, log_stddev)
        )
        stddev, log_stddev = self.add_summary(
            label=('distributions', 'gaussian'), name='stddev', tensor=stddev,
            pass_tensors=(stddev, log_stddev)
        )

        return mean, stddev, log_stddev
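
A minimal standalone sketch (NumPy only, hypothetical values; eps stands in for util.epsilon) of the clipping step above: bounding log_stddev to [log(eps), -log(eps)] confines the resulting standard deviation to roughly [eps, 1/eps].

import numpy as np

eps = 1e-6                                        # stand-in for util.epsilon
log_stddev = np.array([-50.0, -3.0, 0.0, 3.0, 50.0])
clipped = np.clip(log_stddev, np.log(eps), -np.log(eps))
stddev = np.exp(clipped)
print(stddev)   # approx [1e-06 0.0498 1.0 20.1 1e+06]: extremes saturate at eps, 1/eps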
Example #2
    def tf_parametrize(self, x, mask):
        epsilon = tf.constant(value=util.epsilon, dtype=util.tf_dtype(dtype='float'))
        shape = (-1,) + self.action_spec['shape'] + (self.action_spec['num_values'],)

        # Logits
        logits = self.logits.apply(x=x)
        logits = tf.reshape(tensor=logits, shape=shape)
        min_float = tf.fill(dims=tf.shape(input=logits), value=util.tf_dtype(dtype='float').min)
        logits = tf.where(condition=mask, x=logits, y=min_float)

        # States value
        states_value = tf.reduce_logsumexp(input_tensor=logits, axis=-1)

        # Softmax for corresponding probabilities
        probabilities = tf.nn.softmax(logits=logits, axis=-1)

        # "Normalized" logits
        logits = tf.log(x=tf.maximum(x=probabilities, y=epsilon))

        # Logits as pass_tensor since used for sampling
        Module.update_tensor(name=(self.name + '-probabilities'), tensor=probabilities)
        logits, probabilities, states_value = self.add_summary(
            label=('distributions', 'categorical'), name='probabilities', tensor=probabilities,
            pass_tensors=(logits, probabilities, states_value), enumerate_last_rank=True
        )

        return logits, probabilities, states_value
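
A minimal standalone sketch (NumPy only, hypothetical values) of the masking trick above: invalid actions are filled with the most negative float so that softmax assigns them zero probability, and log(max(p, eps)) re-derives logits that stay bounded.

import numpy as np

eps = 1e-6                                        # stand-in for util.epsilon
logits = np.array([2.0, 0.5, -1.0, 1.0])
mask = np.array([True, False, True, True])
masked = np.where(mask, logits, np.finfo(np.float32).min)
probabilities = np.exp(masked - masked.max())
probabilities /= probabilities.sum()              # softmax over the last axis
normalized_logits = np.log(np.maximum(probabilities, eps))
print(probabilities)       # masked action receives probability 0.0
print(normalized_logits)   # bounded below by log(eps) ~ -13.8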
Example #3
    def tf_baseline_loss(self, states, internals, reward, reference=None):
        """
        Creates the TensorFlow operations for calculating the baseline loss of a batch.

        Args:
            states: Dict of state tensors.
            internals: List of prior internal state tensors.
            reward: Reward tensor.
            reference: Optional reference tensor(s), in case of a comparative loss.

        Returns:
            Loss tensor.
        """
        Module.update_tensors(**states, **internals, reward=reward)
        if self.baseline_mode == 'states':
            loss = self.baseline.total_loss(states=states,
                                            internals=internals,
                                            reward=reward)

        elif self.baseline_mode == 'network':
            embedding = self.network.apply(x=states, internals=internals)
            embedding = tf.stop_gradient(input=embedding)
            Module.update_tensors(embedding=embedding)
            loss = self.baseline.total_loss(
                states=OrderedDict(embedding=embedding),
                internals=internals,
                reward=reward)

        regularization_loss = self.baseline.regularize()
        if regularization_loss is not None:
            loss += regularization_loss

        return loss
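
The 'network' branch above relies on tf.stop_gradient so that the baseline loss does not backpropagate into the shared network that produced the embedding. A minimal sketch of that mechanism, using TF2 eager mode and hypothetical values for brevity (the code above is TF1-style graph code):

import tensorflow as tf

w = tf.Variable(2.0)                      # stand-in for a shared network weight
with tf.GradientTape() as tape:
    embedding = w * 3.0                   # "network" output
    embedding = tf.stop_gradient(embedding)
    loss = tf.square(embedding - 1.0)     # "baseline" loss on the embedding
print(tape.gradient(loss, w))             # None: no gradient reaches w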
Example #4
    def tf_core_update(self):
        Module.update_tensor(name='update', tensor=self.global_update)

        true = tf.constant(value=True, dtype=util.tf_dtype(dtype='bool'))
        one = tf.constant(value=1, dtype=util.tf_dtype(dtype='long'))

        # Retrieve batch
        batch_size = self.update_batch_size.value()
        if self.update_unit == 'timesteps':
            # Timestep-based batch
            # Dependency horizon
            past_horizon = self.policy.past_horizon(is_optimization=True)
            past_horizon = tf.math.maximum(
                x=past_horizon, y=self.baseline_policy.past_horizon(is_optimization=True)
            )
            future_horizon = self.estimator.future_horizon()
            indices = self.memory.retrieve_timesteps(
                n=batch_size, past_horizon=past_horizon, future_horizon=future_horizon
            )
        elif self.update_unit == 'episodes':
            # Episode-based batch
            indices = self.memory.retrieve_episodes(n=batch_size)

        # Optimization
        optimized = self.optimize(indices=indices)

        # Increment update
        with tf.control_dependencies(control_inputs=(optimized,)):
            assignment = self.global_update.assign_add(delta=one, read_value=False)

        with tf.control_dependencies(control_inputs=(assignment,)):
            return util.identity_operation(x=true)
Example #5
    def __init__(self, name, action_spec, embedding_size, summary_labels=None):
        super().__init__(name=name,
                         action_spec=action_spec,
                         embedding_size=embedding_size,
                         summary_labels=summary_labels)

        action_size = util.product(xs=self.action_spec['shape'], empty=0)
        input_spec = dict(type='float', shape=(self.embedding_size, ))
        self.mean = self.add_module(name='mean',
                                    module='linear',
                                    modules=layer_modules,
                                    size=action_size,
                                    input_spec=input_spec)
        self.log_stddev = self.add_module(name='log-stddev',
                                          module='linear',
                                          modules=layer_modules,
                                          size=action_size,
                                          input_spec=input_spec)

        Module.register_tensor(name=(self.name + '-mean'),
                               spec=dict(type='float',
                                         shape=self.action_spec['shape']),
                               batched=True)
        Module.register_tensor(name=(self.name + '-stddev'),
                               spec=dict(type='float',
                                         shape=self.action_spec['shape']),
                               batched=True)
Example #6
    def tf_parametrize(self, x, mask):
        epsilon = tf.constant(value=util.epsilon, dtype=util.tf_dtype(dtype='float'))
        shape = (-1,) + self.action_spec['shape'] + (self.action_spec['num_values'],)
        value_shape = (-1,) + self.action_spec['shape'] + (1,)

        # Deviations
        action_values = self.deviations.apply(x=x)
        action_values = tf.reshape(tensor=action_values, shape=shape)
        min_float = tf.fill(
            dims=tf.shape(input=action_values), value=util.tf_dtype(dtype='float').min
        )

        # States value
        if self.value is None:
            action_values = tf.where(condition=mask, x=action_values, y=min_float)
            states_value = tf.reduce_logsumexp(input_tensor=action_values, axis=-1)
        else:
            states_value = self.value.apply(x=x)
            if len(self.embedding_shape) == 1:
                states_value = tf.reshape(tensor=states_value, shape=value_shape)
            action_values = states_value + action_values - tf.math.reduce_mean(
                input_tensor=action_values, axis=-1, keepdims=True
            )
            states_value = tf.squeeze(input=states_value, axis=-1)
            action_values = tf.where(condition=mask, x=action_values, y=min_float)

        # Softmax for corresponding probabilities
        probabilities = tf.nn.softmax(logits=action_values, axis=-1)

        # "Normalized" logits
        logits = tf.math.log(x=tf.maximum(x=probabilities, y=epsilon))

        Module.update_tensor(name=(self.name + '-probabilities'), tensor=probabilities)

        return logits, probabilities, states_value, action_values
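
A minimal standalone sketch (NumPy only, hypothetical values) of the dueling-style recentering in the else-branch above: action values become the state value plus mean-centered deviations, so their mean recovers the state value exactly.

import numpy as np

states_value = np.array([[1.5]])              # V(s), shape (batch, 1)
deviations = np.array([[0.2, -0.4, 1.0]])     # raw per-action deviations
action_values = states_value + deviations - deviations.mean(axis=-1, keepdims=True)
print(action_values)                # [[1.4333 0.8333 2.2333]]
print(action_values.mean(axis=-1))  # [1.5], i.e. V(s) is preserved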
Example #7
    def tf_apply(self, x):
        if len(self.tensors) == 1:
            # Covers both a plain '*' string and a single-element sequence
            if self.tensors[0] == '*':
                return x
            else:
                return Module.retrieve_tensor(name=self.tensors[0])

        tensors = list()
        for tensor in self.tensors:
            if tensor == '*':
                tensors.append(x)
            else:
                tensors.append(Module.retrieve_tensor(name=tensor))

        shape = self.output_spec['shape']
        for n, tensor in enumerate(tensors):
            for axis in range(util.rank(x=tensor), len(shape)):
                tensor = tf.expand_dims(input=tensor, axis=axis)
            tensors[n] = tensor

        if self.aggregation == 'concat':
            x = tf.concat(values=tensors, axis=(self.axis + 1))

        elif self.aggregation == 'product':
            x = tf.stack(values=tensors, axis=(self.axis + 1))
            x = tf.reduce_prod(input_tensor=x, axis=(self.axis + 1))

        elif self.aggregation == 'stack':
            x = tf.stack(values=tensors, axis=(self.axis + 1))

        elif self.aggregation == 'sum':
            x = tf.stack(values=tensors, axis=(self.axis + 1))
            x = tf.reduce_sum(input_tensor=x, axis=(self.axis + 1))

        return x
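
A minimal standalone sketch (NumPy only, hypothetical values) of the four aggregation modes above; the layer's axis + 1 offset accounts for the batch dimension, which NumPy handles identically here.

import numpy as np

a = np.array([[1.0, 2.0]])                         # batch of one, two features
b = np.array([[3.0, 4.0]])
print(np.concatenate([a, b], axis=1))              # concat:  [[1. 2. 3. 4.]]
print(np.stack([a, b], axis=1))                    # stack:   shape (1, 2, 2)
print(np.sum(np.stack([a, b], axis=1), axis=1))    # sum:     [[4. 6.]]
print(np.prod(np.stack([a, b], axis=1), axis=1))   # product: [[3. 8.]]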
Example #8
    def api_update(self):
        # Set global tensors
        Module.update_tensors(
            deterministic=tf.constant(value=True,
                                      dtype=util.tf_dtype(dtype='bool')),
            independent=tf.constant(value=False,
                                    dtype=util.tf_dtype(dtype='bool')),
            optimization=tf.constant(value=True,
                                     dtype=util.tf_dtype(dtype='bool')),
            timestep=self.global_timestep,
            episode=self.global_episode,
            update=self.global_update)

        # Core update: retrieve update operation
        updated = self.core_update()

        with tf.control_dependencies(control_inputs=(updated, )):
            # Function-level identity operation for retrieval (plus enforce dependency)
            timestep = util.identity_operation(
                x=self.global_timestep, operation_name='timestep-output')
            episode = util.identity_operation(x=self.global_episode,
                                              operation_name='episode-output')
            update = util.identity_operation(x=self.global_update,
                                             operation_name='update-output')

        return timestep, episode, update
Example #9
    def __init__(self,
                 name,
                 action_spec,
                 embedding_shape,
                 summary_labels=None):
        super().__init__(name=name,
                         action_spec=action_spec,
                         embedding_shape=embedding_shape,
                         summary_labels=summary_labels)

        input_spec = dict(type='float', shape=self.embedding_shape)

        if len(self.embedding_shape) == 1:
            action_size = util.product(xs=self.action_spec['shape'], empty=0)
            self.alpha = self.add_module(name='alpha',
                                         module='linear',
                                         modules=layer_modules,
                                         size=action_size,
                                         input_spec=input_spec)
            self.beta = self.add_module(name='beta',
                                        module='linear',
                                        modules=layer_modules,
                                        size=action_size,
                                        input_spec=input_spec)

        else:
            if len(self.embedding_shape) < 1 or len(self.embedding_shape) > 3:
                raise TensorforceError.value(name=name,
                                             argument='embedding_shape',
                                             value=self.embedding_shape,
                                             hint='invalid rank')
            if self.embedding_shape[:-1] == self.action_spec['shape'][:-1]:
                size = self.action_spec['shape'][-1]
            elif self.embedding_shape[:-1] == self.action_spec['shape']:
                size = 0
            else:
                raise TensorforceError.value(
                    name=name,
                    argument='embedding_shape',
                    value=self.embedding_shape,
                    hint='not flattened and incompatible with action shape')
            self.alpha = self.add_module(name='alpha',
                                         module='linear',
                                         modules=layer_modules,
                                         size=size,
                                         input_spec=input_spec)
            self.beta = self.add_module(name='beta',
                                        module='linear',
                                        modules=layer_modules,
                                        size=size,
                                        input_spec=input_spec)

        Module.register_tensor(name=(self.name + '-alpha'),
                               spec=dict(type='float',
                                         shape=self.action_spec['shape']),
                               batched=True)
        Module.register_tensor(name=(self.name + '-beta'),
                               spec=dict(type='float',
                                         shape=self.action_spec['shape']),
                               batched=True)
Example #10
    def tf_states_value(self,
                        states,
                        internals,
                        auxiliaries,
                        reduced=True,
                        include_per_action=False):
        if self.value is None:
            return ActionValue.tf_states_value(
                self=self,
                states=states,
                internals=internals,
                auxiliaries=auxiliaries,
                reduced=reduced,
                include_per_action=include_per_action)

        else:
            if not reduced or include_per_action:
                raise TensorforceError.invalid(name='policy.states_value',
                                               argument='reduced')

            embedding = self.network.apply(x=states, internals=internals)
            Module.update_tensor(name=self.name, tensor=embedding)

            states_value = self.value.apply(x=embedding)
            return states_value
Example #11
    def tf_parametrize(self, x):
        # Softplus to ensure alpha and beta >= 1
        one = tf.constant(value=1.0, dtype=util.tf_dtype(dtype='float'))
        epsilon = tf.constant(value=util.epsilon, dtype=util.tf_dtype(dtype='float'))
        log_epsilon = tf.constant(value=log(util.epsilon), dtype=util.tf_dtype(dtype='float'))
        shape = (-1,) + self.action_spec['shape']

        # Alpha
        alpha = self.alpha.apply(x=x)
        # epsilon < 1.0, hence negative
        alpha = tf.clip_by_value(t=alpha, clip_value_min=log_epsilon, clip_value_max=-log_epsilon)
        alpha = tf.math.softplus(features=alpha) + one
        if len(self.embedding_shape) == 1:
            alpha = tf.reshape(tensor=alpha, shape=shape)

        # Beta
        beta = self.beta.apply(x=x)
        # epsilon < 1.0, hence negative
        beta = tf.clip_by_value(t=beta, clip_value_min=log_epsilon, clip_value_max=-log_epsilon)
        beta = tf.math.softplus(features=beta) + one
        if len(self.embedding_shape) == 1:
            beta = tf.reshape(tensor=beta, shape=shape)

        # Alpha + Beta
        alpha_beta = tf.maximum(x=(alpha + beta), y=epsilon)

        # Log norm
        log_norm = tf.math.lgamma(x=alpha) + tf.math.lgamma(x=beta) - tf.math.lgamma(x=alpha_beta)

        Module.update_tensor(name=(self.name + '-alpha'), tensor=alpha)
        Module.update_tensor(name=(self.name + '-beta'), tensor=beta)

        return alpha, beta, alpha_beta, log_norm
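
A minimal standalone sketch (stdlib math only, hypothetical values) of the Beta parametrization above: softplus(x) + 1 guarantees alpha, beta >= 1, and lgamma(a) + lgamma(b) - lgamma(a + b) is exactly log B(a, b), the distribution's log normalizer.

from math import exp, lgamma, log, log1p

for raw in (-5.0, 0.0, 5.0):
    alpha = log1p(exp(raw)) + 1.0        # softplus + 1, always >= 1
    print(round(alpha, 4))               # 1.0067, 1.6931, 6.0067

a, b = 2.0, 3.0
log_norm = lgamma(a) + lgamma(b) - lgamma(a + b)
# B(2, 3) = 1! * 2! / 4! = 1/12, so log_norm should equal log(1/12)
print(abs(log_norm - log(1.0 / 12.0)) < 1e-12)   # True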
Example #12
    def tf_parametrize(self, x):
        log_epsilon = tf.constant(value=log(util.epsilon),
                                  dtype=util.tf_dtype(dtype='float'))
        shape = (-1, ) + self.action_spec['shape']

        # Mean
        mean = self.mean.apply(x=x)
        if len(self.embedding_shape) == 1:
            mean = tf.reshape(tensor=mean, shape=shape)

        # Log standard deviation
        log_stddev = self.log_stddev.apply(x=x)
        if len(self.embedding_shape) == 1:
            log_stddev = tf.reshape(tensor=log_stddev, shape=shape)

        # Clip log_stddev for numerical stability
        # epsilon < 1.0, hence negative
        log_stddev = tf.clip_by_value(t=log_stddev,
                                      clip_value_min=log_epsilon,
                                      clip_value_max=-log_epsilon)

        # Standard deviation
        stddev = tf.exp(x=log_stddev)

        Module.update_tensor(name=(self.name + '-mean'), tensor=mean)
        Module.update_tensor(name=(self.name + '-stddev'), tensor=stddev)

        return mean, stddev, log_stddev
Example #13
    def tf_sample_actions(self, states, internals, auxiliaries, deterministic, return_internals):
        if return_internals:
            embedding, internals = self.network.apply(
                x=states, internals=internals, return_internals=return_internals
            )
        else:
            embedding = self.network.apply(
                x=states, internals=internals, return_internals=return_internals
            )

        Module.update_tensor(name=self.name, tensor=embedding)

        actions = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            action = distribution.sample(parameters=parameters, deterministic=deterministic)

            entropy = distribution.entropy(parameters=parameters)
            entropy = tf.reshape(tensor=entropy, shape=(-1, util.product(xs=spec['shape'])))
            mean_entropy = tf.reduce_mean(input_tensor=entropy, axis=1)
            actions[name] = self.add_summary(
                label='entropy', name=(name + '-entropy'), tensor=mean_entropy, pass_tensors=action
            )

        if return_internals:
            return actions, internals
        else:
            return actions
Example #14
    def tf_core_observe(self, states, internals, auxiliaries, actions,
                        terminal, reward):
        zero = tf.constant(value=0, dtype=util.tf_dtype(dtype='long'))

        # Experience
        experienced = self.core_experience(states=states,
                                           internals=internals,
                                           auxiliaries=auxiliaries,
                                           actions=actions,
                                           terminal=terminal,
                                           reward=reward)

        # If no periodic update
        if self.update_frequency == 'never':
            return experienced

        # Periodic update
        with tf.control_dependencies(control_inputs=(experienced, )):
            batch_size = self.update_batch_size.value()
            frequency = self.update_frequency.value()
            start = self.update_start.value()

            if self.update_unit == 'timesteps':
                # Timestep-based batch
                one = tf.constant(value=1, dtype=util.tf_dtype(dtype='long'))
                past_horizon = self.policy.dependency_horizon(
                    is_optimization=True)
                if self.baseline_policy is not None:
                    past_horizon = tf.math.maximum(
                        x=past_horizon,
                        y=self.baseline_policy.dependency_horizon(
                            is_optimization=True))
                future_horizon = self.estimator.horizon.value() + one
                start = tf.math.maximum(x=start,
                                        y=(batch_size + past_horizon +
                                           future_horizon))
                timestep = Module.retrieve_tensor(name='timestep')
                timestep = timestep - self.estimator.capacity
                is_frequency = tf.math.equal(x=tf.mod(x=timestep, y=frequency),
                                             y=zero)
                at_least_start = tf.math.greater_equal(x=timestep, y=start)

            elif self.update_unit == 'episodes':
                # Episode-based batch
                start = tf.math.maximum(x=start, y=batch_size)
                episode = Module.retrieve_tensor(name='episode')
                is_frequency = tf.math.equal(x=tf.mod(x=episode, y=frequency),
                                             y=zero)
                # Only update once per episode increment
                terminal = tf.concat(values=((zero, ), terminal), axis=0)
                is_frequency = tf.math.logical_and(x=is_frequency,
                                                   y=(terminal[-1] > zero))
                at_least_start = tf.math.greater_equal(x=episode, y=start)

            is_updated = self.cond(pred=tf.math.logical_and(x=is_frequency,
                                                            y=at_least_start),
                                   true_fn=self.core_update,
                                   false_fn=util.no_operation)

        return is_updated
Example #15
    def tf_initialize(self):
        super().tf_initialize()

        if self.unit is None:
            step = None
        elif self.unit == 'timesteps':
            step = Module.retrieve_tensor(name='timestep')
        elif self.unit == 'episodes':
            step = Module.retrieve_tensor(name='episode')
        elif self.unit == 'updates':
            step = Module.retrieve_tensor(name='update')

        default = self.get_parameter_value(step=step)

        # Temporarily leave module variable scope, otherwise placeholder name is unnecessarily long
        if self.device is not None:
            raise TensorforceError.unexpected()

        self.scope.__exit__(None, None, None)

        self.parameter_input = self.add_placeholder(name=self.name,
                                                    dtype=self.dtype,
                                                    shape=self.shape,
                                                    batched=False,
                                                    default=default)

        self.scope.__enter__()
Example #16
    def tf_parametrize(self, x):
        one = tf.constant(value=1.0, dtype=util.tf_dtype(dtype='float'))
        epsilon = tf.constant(value=util.epsilon,
                              dtype=util.tf_dtype(dtype='float'))
        shape = (-1, ) + self.action_spec['shape']

        # Logit
        logit = self.logit.apply(x=x)
        if len(self.embedding_shape) == 1:
            logit = tf.reshape(tensor=logit, shape=shape)

        # States value
        states_value = logit

        # Sigmoid for corresponding probability
        probability = tf.sigmoid(x=logit)

        # Clip probability for numerical stability
        probability = tf.clip_by_value(t=probability,
                                       clip_value_min=epsilon,
                                       clip_value_max=(one - epsilon))

        # "Normalized" logits
        true_logit = tf.math.log(x=probability)
        false_logit = tf.math.log(x=(one - probability))

        Module.update_tensor(name=(self.name + '-probability'),
                             tensor=probability)

        return true_logit, false_logit, probability, states_value
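
A minimal standalone sketch (NumPy only, hypothetical values; eps stands in for util.epsilon) of the Bernoulli parametrization above: clipping the probability keeps both log(p) and log(1 - p) finite even for extreme logits.

import numpy as np

eps = 1e-6                                    # stand-in for util.epsilon
logit = np.array([-40.0, 0.0, 40.0])
probability = 1.0 / (1.0 + np.exp(-logit))    # sigmoid
probability = np.clip(probability, eps, 1.0 - eps)
print(np.log(probability))        # true logit: finite, >= log(eps)
print(np.log(1.0 - probability))  # false logit: finite, no -inf at p = 1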
Example #17
    def tf_parametrize(self, x):
        one = tf.constant(value=1.0, dtype=util.tf_dtype(dtype='float'))
        epsilon = tf.constant(value=util.epsilon,
                              dtype=util.tf_dtype(dtype='float'))
        shape = (-1, ) + self.action_spec['shape']

        # Logit
        logit = self.logit.apply(x=x)
        logit = tf.reshape(tensor=logit, shape=shape)

        # States value
        states_value = logit

        # Sigmoid for corresponding probability
        probability = tf.sigmoid(x=logit)

        # Clip probability for numerical stability
        probability = tf.clip_by_value(t=probability,
                                       clip_value_min=epsilon,
                                       clip_value_max=(one - epsilon))

        # "Normalized" logits
        true_logit = tf.log(x=probability)
        false_logit = tf.log(x=(one - probability))

        Module.update_tensor(name=(self.name + '-probability'),
                             tensor=probability)
        true_logit, false_logit, probability, states_value = self.add_summary(
            label=('distributions', 'bernoulli'),
            name='probability',
            tensor=probability,
            pass_tensors=(true_logit, false_logit, probability, states_value))

        return true_logit, false_logit, probability, states_value
Example #18
    def tf_sample_actions(self, states, internals, auxiliaries, temperature,
                          return_internals):
        if return_internals:
            embedding, internals = self.network.apply(
                x=states,
                internals=internals,
                return_internals=return_internals)
        else:
            embedding = self.network.apply(x=states,
                                           internals=internals,
                                           return_internals=return_internals)

        Module.update_tensor(name=self.name, tensor=embedding)

        actions = OrderedDict()
        for name, spec, distribution, temp in util.zip_items(
                self.actions_spec, self.distributions, temperature):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            actions[name] = distribution.sample(parameters=parameters,
                                                temperature=temp)

        if return_internals:
            return actions, internals
        else:
            return actions
Example #19
    def __init__(self,
                 name,
                 action_spec,
                 embedding_size,
                 infer_states_value=True,
                 summary_labels=None):
        super().__init__(name=name,
                         action_spec=action_spec,
                         embedding_size=embedding_size,
                         summary_labels=summary_labels)
        shape = self.action_spec['shape']
        num_values = self.action_spec['num_values']
        action_size = util.product(xs=shape)
        input_spec = dict(type='float', shape=(self.embedding_size, ))
        self.deviations = self.add_module(name='deviations',
                                          module='linear',
                                          modules=layer_modules,
                                          size=(action_size * num_values),
                                          input_spec=input_spec)
        if infer_states_value:
            self.value = None
        else:
            self.value = self.add_module(name='value',
                                         module='linear',
                                         modules=layer_modules,
                                         size=action_size,
                                         input_spec=input_spec)

        Module.register_tensor(name=(self.name + '-probabilities'),
                               spec=dict(type='float',
                                         shape=(shape + (num_values, ))),
                               batched=True)
Example #20
    def __init__(self,
                 name,
                 dtype,
                 unit=None,
                 shape=(),
                 min_value=None,
                 max_value=None,
                 summary_labels=None):
        super().__init__(name=name, summary_labels=summary_labels)

        assert unit in (None, 'timesteps', 'episodes', 'updates')
        self.unit = unit

        spec = dict(type=dtype, shape=shape)
        spec = util.valid_value_spec(value_spec=spec, return_normalized=True)
        self.dtype = spec['type']
        self.shape = spec['shape']

        assert min_value is None or max_value is None or min_value < max_value
        if self.dtype == 'bool':
            if min_value is not None or max_value is not None:
                raise TensorforceError.unexpected()
        elif self.dtype in ('int', 'long'):
            if (min_value is not None and not isinstance(min_value, int)) or \
                    (max_value is not None and not isinstance(max_value, int)):
                raise TensorforceError.unexpected()
        elif self.dtype == 'float':
            if (min_value is not None and not isinstance(min_value, float)) or \
                    (max_value is not None and not isinstance(max_value, float)):
                raise TensorforceError.unexpected()
        else:
            assert False

        assert self.min_value() is None or self.max_value() is None or \
            self.min_value() <= self.max_value()
        if min_value is not None:
            if self.min_value() is None:
                raise TensorforceError.value(name=self.name,
                                             argument='lower bound',
                                             value=self.min_value(),
                                             hint=('not >= ' + str(min_value)))
            elif self.min_value() < min_value:
                raise TensorforceError.value(name=self.name,
                                             argument='lower bound',
                                             value=self.min_value(),
                                             hint=('< ' + str(min_value)))
        if max_value is not None:
            if self.max_value() is None:
                raise TensorforceError.value(name=self.name,
                                             argument='upper bound',
                                             value=self.max_value(),
                                             hint=('not <= ' + str(max_value)))
            elif self.max_value() > max_value:
                raise TensorforceError.value(name=self.name,
                                             argument='upper bound',
                                             value=self.max_value(),
                                             hint=('> ' + str(max_value)))

        Module.register_tensor(name=self.name, spec=spec, batched=False)
Example #21
    def tf_loss_per_instance(
        self, states, internals, actions, terminal, reward, next_states, next_internals,
        reference=None
    ):
        # Really state value instead of q value?
        # Michael: doubling this function because NAF needs V'(s) not Q'(s), see comment below
        embedding = self.network.apply(x=states, internals=internals)

        # Both networks can use the same internals, could that be a problem?
        # Otherwise need to handle internals indices correctly everywhere
        target_internals = OrderedDict()
        for name, internal in next_internals.items():
            target_internals['target-' + name] = internal
        Module.update_tensors(**target_internals)
        target_embedding = self.target_network.apply(x=next_states, internals=target_internals)

        deltas = list()
        for name in sorted(self.distributions):
            distribution = self.distributions[name]
            target_distribution = self.target_distributions[name]

            parameters = distribution.parametrize(x=embedding)
            target_parameters = target_distribution.parametrize(x=target_embedding)

            q_value = self.tf_q_value(
                embedding=embedding, parameters=parameters, action=actions[name], name=name
            )

            # Notice, this is V', not Q' because NAF outputs V(s) separately
            next_state_value = target_distribution.states_value(parameters=target_parameters)

            delta = self.tf_q_delta(
                q_value=q_value, next_q_value=next_state_value, terminal=terminal, reward=reward
            )

            collapsed_size = util.product(xs=util.shape(delta)[1:])
            delta = tf.reshape(tensor=delta, shape=(-1, collapsed_size))

            deltas.append(delta)

        # Per-instance surrogate loss: mean delta over actions, squared below unless the Huber loss applies
        loss_per_instance = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1)

        # Optional Huber loss
        huber_loss = self.huber_loss.value()

        def no_huber_loss():
            return tf.square(x=loss_per_instance)

        def apply_huber_loss():
            return tf.where(
                condition=(tf.abs(x=loss_per_instance) <= huber_loss),
                x=(0.5 * tf.square(x=loss_per_instance)),
                y=(huber_loss * (tf.abs(x=loss_per_instance) - 0.5 * huber_loss))
            )

        zero = tf.constant(value=0.0, dtype=util.tf_dtype(dtype='float'))
        skip_huber_loss = tf.math.equal(x=huber_loss, y=zero)
        return self.cond(pred=skip_huber_loss, true_fn=no_huber_loss, false_fn=apply_huber_loss)
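
A minimal standalone sketch (NumPy only, hypothetical values) of the Huber branch above: quadratic inside the threshold, linear outside, with the two pieces meeting at the boundary.

import numpy as np

def huber(x, threshold):
    return np.where(
        np.abs(x) <= threshold,
        0.5 * np.square(x),                        # quadratic region
        threshold * (np.abs(x) - 0.5 * threshold)  # linear region
    )

x = np.array([-3.0, -1.0, 0.0, 1.0, 3.0])
print(huber(x, threshold=1.0))   # [2.5 0.5 0.  0.5 2.5]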
Example #22
    def tf_apply(self, x):

        def no_update():
            return self.moving_mean, self.moving_variance

        def apply_update():
            one = tf.constant(value=1.0, dtype=util.tf_dtype(dtype='float'))
            axes = tuple(1 + axis for axis in self.axes)

            decay = self.decay.value()
            batch_size = tf.dtypes.cast(x=tf.shape(input=x)[0], dtype=util.tf_dtype(dtype='float'))
            decay = tf.math.pow(x=decay, y=batch_size)

            mean = tf.math.reduce_mean(input_tensor=x, axis=axes, keepdims=True)
            mean = tf.where(
                condition=self.after_first_call,
                x=(decay * self.moving_mean + (one - decay) * mean), y=mean
            )

            variance = tf.reduce_mean(
                input_tensor=tf.math.squared_difference(x=x, y=mean), axis=axes, keepdims=True
            )
            variance = tf.where(
                condition=self.after_first_call,
                x=(decay * self.moving_variance + (one - decay) * variance), y=variance
            )

            with tf.control_dependencies(control_inputs=(mean, variance)):
                assignment = self.after_first_call.assign(
                    value=tf.constant(value=True, dtype=util.tf_dtype(dtype='bool')),
                    read_value=False
                )

            with tf.control_dependencies(control_inputs=(assignment,)):
                variance = self.moving_variance.assign(value=variance)
                mean = self.moving_mean.assign(value=mean)

            return mean, variance

        optimization = Module.retrieve_tensor(name='optimization')
        update_on_optimization = tf.where(
            condition=self.after_first_call, x=self.update_on_optimization, y=optimization
        )
        update_on_optimization = self.update_on_optimization.assign(value=update_on_optimization)
        skip_update = tf.math.logical_or(
            x=Module.retrieve_tensor(name='independent'),
            y=tf.math.not_equal(x=update_on_optimization, y=optimization)
        )

        mean, variance = self.cond(pred=skip_update, true_fn=no_update, false_fn=apply_update)

        epsilon = tf.constant(value=util.epsilon, dtype=util.tf_dtype(dtype='float'))
        reciprocal_stddev = tf.math.rsqrt(x=tf.maximum(x=variance, y=epsilon))

        x = (x - tf.stop_gradient(input=mean)) * tf.stop_gradient(input=reciprocal_stddev)

        return x
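
A minimal standalone sketch (plain Python, hypothetical values) of the moving-statistics update above: the configured decay is raised to the batch size, so larger batches move the running statistics further per call.

decay, batch_size = 0.99, 32
effective_decay = decay ** batch_size    # ~0.725 rather than 0.99

moving_mean, batch_mean = 0.0, 1.0
moving_mean = effective_decay * moving_mean + (1.0 - effective_decay) * batch_mean
print(effective_decay, moving_mean)      # ~0.725, ~0.275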
Example #23
    def __init__(self, name, dtype, shape=(), summary_labels=None):
        super().__init__(name=name, summary_labels=summary_labels)

        spec = dict(type=dtype, shape=shape)
        spec = util.valid_value_spec(value_spec=spec, return_normalized=True)
        self.dtype = spec['type']
        self.shape = spec['shape']

        Module.register_tensor(name=self.name, spec=spec, batched=False)
Example #24
    def get_output_spec(self, input_spec):
        if len(self.tensors) == 1:
            return Module.get_tensor_spec(name=self.tensors[0])

        # Get tensor types and shapes
        dtypes = list()
        shapes = list()
        for tensor in self.tensors:
            # Tensor specification
            if tensor == '*':
                spec = input_spec
            else:
                spec = Module.get_tensor_spec(name=tensor)
            dtypes.append(spec['type'])
            shapes.append(spec['shape'])

        # Check tensor types
        if all(dtype == dtypes[0] for dtype in dtypes):
            dtype = dtypes[0]
        else:
            raise TensorforceError.value(name='tensor types', value=dtypes)

        if self.aggregation == 'concat':
            if any(len(shape) != len(shapes[0]) for shape in shapes):
                raise TensorforceError.value(name='tensor shapes', value=shapes)
            elif any(
                shape[n] != shapes[0][n] for shape in shapes for n in range(len(shape))
                if n != self.axis
            ):
                raise TensorforceError.value(name='tensor shapes', value=shapes)
            shape = tuple(
                sum(shape[n] for shape in shapes) if n == self.axis else shapes[0][n]
                for n in range(len(shapes[0]))
            )

        elif self.aggregation == 'stack':
            if any(len(shape) != len(shapes[0]) for shape in shapes):
                raise TensorforceError.value(name='tensor shapes', value=shapes)
            elif any(shape[n] != shapes[0][n] for shape in shapes for n in range(len(shape))):
                raise TensorforceError.value(name='tensor shapes', value=shapes)
            shape = tuple(
                len(shapes) if n == self.axis else shapes[0][n - int(n > self.axis)]
                for n in range(len(shapes[0]) + 1)
            )

        else:
            # Check and unify tensor shapes
            for shape in shapes:
                if len(shape) != len(shapes[0]):
                    raise TensorforceError.value(name='tensor shapes', value=shapes)
                if any(x != y and x != 1 and y != 1 for x, y in zip(shape, shapes[0])):
                    raise TensorforceError.value(name='tensor shapes', value=shapes)
            shape = tuple(max(shape[n] for shape in shapes) for n in range(len(shapes[0])))

        # Missing num_values, min/max_value!!!
        return dict(type=dtype, shape=shape)
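
A minimal standalone sketch (NumPy only, hypothetical shapes) of the broadcast check in the final else-branch above: per-axis sizes must match or equal 1, and the output shape takes the per-axis maximum, which matches NumPy/TensorFlow broadcasting.

import numpy as np

shapes = [(4, 1, 3), (1, 5, 3)]
assert all(len(s) == len(shapes[0]) for s in shapes)
assert all(x == y or x == 1 or y == 1 for s in shapes for x, y in zip(s, shapes[0]))
output_shape = tuple(max(s[n] for s in shapes) for n in range(len(shapes[0])))
print(output_shape)                   # (4, 5, 3)
print(np.broadcast_shapes(*shapes))   # (4, 5, 3), NumPy agrees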
Example #25
    def tf_value(self):
        parameter = tf.identity(input=self.parameter_input)

        parameter = self.add_summary(label='parameters', name='value', tensor=parameter)

        # Required for TensorFlow optimizers learning_rate
        if Module.global_tensors is not None:
            Module.update_tensor(name=self.name, tensor=parameter)

        return parameter
Example #26
    def tf_optimize_baseline(self, indices):
        # Retrieve states, internals, actions and reward
        dependency_horizon = self.baseline_policy.dependency_horizon(is_optimization=True)
        # horizon change: see timestep-based batch sampling
        starts, lengths, states, internals = self.memory.predecessors(
            indices=indices, horizon=dependency_horizon, sequence_values='states',
            initial_values='internals'
        )
        Module.update_tensors(dependency_starts=starts, dependency_lengths=lengths)
        auxiliaries, actions, reward = self.memory.retrieve(
            indices=indices, values=('auxiliaries', 'actions', 'reward')
        )

        # Reward estimation
        reward = self.estimator.estimate1(
            baseline=self.baseline_policy, memory=self.memory, indices=indices, reward=reward
        )

        # Optimizer arguments
        variables = self.baseline_policy.get_variables(only_trainable=True)
        if self.shared_baseline_network:
            variables += self.policy.network.get_variables(only_trainable=True)

        arguments = dict(
            states=states, internals=internals, auxiliaries=auxiliaries, actions=actions,
            reward=reward
        )

        fn_loss = self.baseline_loss

        def fn_kl_divergence(states, internals, auxiliaries, actions, reward, other=None):
            return self.baseline_policy.kl_divergence(
                states=states, internals=internals, auxiliaries=auxiliaries, other=other
            )

        source_variables = self.policy.get_variables(only_trainable=True)

        if self.global_model is None:
            global_variables = None
        else:
            global_variables = self.global_model.baseline_policy.get_variables(only_trainable=True)

        if self.baseline_objective is None:
            kwargs = dict()
        else:
            kwargs = self.baseline_objective.optimizer_arguments(policy=self.baseline_policy)

        # Optimization
        optimized = self.baseline_optimizer.minimize(
            variables=variables, arguments=arguments, fn_loss=fn_loss,
            fn_kl_divergence=fn_kl_divergence, source_variables=source_variables,
            global_variables=global_variables, **kwargs
        )

        return optimized
Example #27
    def tf_core_act(self, states, internals, auxiliaries):
        zero = tf.constant(value=0, dtype=util.tf_dtype(dtype='long'))

        # Dependency horizon
        dependency_horizon = self.policy.dependency_horizon(
            is_optimization=False)
        dependency_horizon = tf.math.maximum(
            x=dependency_horizon,
            y=self.baseline_policy.dependency_horizon(is_optimization=False))

        # TODO: handle arbitrary non-optimization horizons!
        assertion = tf.debugging.assert_equal(x=dependency_horizon, y=zero)
        with tf.control_dependencies(control_inputs=(assertion, )):
            some_state = next(iter(states.values()))
            if util.tf_dtype(dtype='long') in (tf.int32, tf.int64):
                batch_size = tf.shape(input=some_state,
                                      out_type=util.tf_dtype(dtype='long'))[0]
            else:
                batch_size = tf.dtypes.cast(x=tf.shape(input=some_state)[0],
                                            dtype=util.tf_dtype(dtype='long'))
            starts = tf.range(start=batch_size,
                              dtype=util.tf_dtype(dtype='long'))
            lengths = tf.ones(shape=(batch_size, ),
                              dtype=util.tf_dtype(dtype='long'))
            Module.update_tensors(dependency_starts=starts,
                                  dependency_lengths=lengths)

        # Separate baseline internals
        # if self.separate_baseline_internals:
        #     baseline_internals = OrderedDict()
        #     for name in iter(internals):
        #         if name.startswith('baseline-'):
        #             baseline_internals[name] = internals.pop(name)

        # Policy act
        actions, next_internals = self.policy.act(states=states,
                                                  internals=internals,
                                                  auxiliaries=auxiliaries,
                                                  return_internals=True)

        # TODO: entropy etc summaries!

        if any(name not in next_internals for name in internals):
            # Baseline policy act to retrieve next internals
            _, baseline_internals = self.baseline_policy.act(
                states=states,
                internals=internals,
                auxiliaries=auxiliaries,
                return_internals=True)
            assert all(name not in next_internals
                       for name in baseline_internals)
            next_internals.update(baseline_internals)

        return actions, next_internals
Example #28
    def __init__(self, name, dtype, shape=(), unit=None, summary_labels=None):
        super().__init__(name=name, summary_labels=summary_labels)

        assert unit in (None, 'timesteps', 'episodes', 'updates')

        spec = dict(type=dtype, shape=shape)
        spec = util.valid_value_spec(value_spec=spec, return_normalized=True)
        self.dtype = spec['type']
        self.shape = spec['shape']
        self.unit = unit

        Module.register_tensor(name=self.name, spec=spec, batched=False)
Example #29
    def __init__(self,
                 name,
                 action_spec,
                 embedding_shape,
                 summary_labels=None):
        super().__init__(name=name,
                         action_spec=action_spec,
                         embedding_shape=embedding_shape,
                         summary_labels=summary_labels)

        input_spec = dict(type='float', shape=self.embedding_shape)

        if len(self.embedding_shape) == 1:
            action_size = util.product(xs=self.action_spec['shape'], empty=0)
            self.mean = self.add_module(name='mean',
                                        module='linear',
                                        modules=layer_modules,
                                        size=action_size,
                                        input_spec=input_spec)
            self.log_stddev = self.add_module(name='log-stddev',
                                              module='linear',
                                              modules=layer_modules,
                                              size=action_size,
                                              input_spec=input_spec)

        else:
            if len(self.embedding_shape) < 1 or len(self.embedding_shape) > 3:
                raise TensorforceError.unexpected()
            if self.embedding_shape[:-1] == self.action_spec['shape'][:-1]:
                size = self.action_spec['shape'][-1]
            elif self.embedding_shape[:-1] == self.action_spec['shape']:
                size = 0
            else:
                raise TensorforceError.unexpected()
            self.mean = self.add_module(name='mean',
                                        module='linear',
                                        modules=layer_modules,
                                        size=size,
                                        input_spec=input_spec)
            self.log_stddev = self.add_module(name='log-stddev',
                                              module='linear',
                                              modules=layer_modules,
                                              size=size,
                                              input_spec=input_spec)

        Module.register_tensor(name=(self.name + '-mean'),
                               spec=dict(type='float',
                                         shape=self.action_spec['shape']),
                               batched=True)
        Module.register_tensor(name=(self.name + '-stddev'),
                               spec=dict(type='float',
                                         shape=self.action_spec['shape']),
                               batched=True)
Example #30
    def tf_core_observe(self, states, internals, auxiliaries, actions, terminal, reward):
        zero = tf.constant(value=0, dtype=util.tf_dtype(dtype='long'))
        one = tf.constant(value=1, dtype=util.tf_dtype(dtype='long'))

        # Experience
        experienced = self.core_experience(
            states=states, internals=internals, auxiliaries=auxiliaries, actions=actions,
            terminal=terminal, reward=reward
        )

        # If no periodic update
        if self.update_frequency is None:
            return experienced

        # Periodic update
        with tf.control_dependencies(control_inputs=(experienced,)):
            batch_size = self.update_batch_size.value()
            frequency = self.update_frequency.value()
            start = self.update_start.value()

            if self.update_unit == 'timesteps':
                # Timestep-based batch
                policy_horizon = self.policy.past_horizon(is_optimization=True)
                baseline_horizon = self.baseline_policy.past_horizon(is_optimization=True) - \
                    self.estimator.future_horizon()
                past_horizon = tf.math.maximum(x=policy_horizon, y=baseline_horizon)
                future_horizon = self.estimator.future_horizon()
                start = tf.math.maximum(
                    x=start, y=(frequency + past_horizon + future_horizon + one)
                )
                unit = Module.retrieve_tensor(name='timestep')

            elif self.update_unit == 'episodes':
                # Episode-based batch
                start = tf.math.maximum(x=start, y=frequency)
                unit = Module.retrieve_tensor(name='episode')

            unit = unit - start
            is_frequency = tf.math.equal(x=tf.math.mod(x=unit, y=frequency), y=zero)
            is_frequency = tf.math.logical_and(x=is_frequency, y=(unit > self.last_update))

            def perform_update():
                assignment = self.last_update.assign(value=unit, read_value=False)
                with tf.control_dependencies(control_inputs=(assignment,)):
                    return self.core_update()

            is_updated = self.cond(
                pred=is_frequency, true_fn=perform_update, false_fn=util.no_operation
            )

        return is_updated
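
A minimal standalone sketch (plain Python, hypothetical values, with last_update assumed to start at -1) of the schedule above: after shifting by start, an update fires whenever the unit count reaches a fresh multiple of frequency.

start, frequency, last_update = 10, 4, -1

for timestep in range(20):
    unit = timestep - start
    if unit % frequency == 0 and unit > last_update:
        last_update = unit
        print('update at timestep', timestep)   # fires at 10, 14, 18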