Example #1
        def true_fn():
            reset_values = self.estimator.reset(baseline=self.baseline_policy)

            new_overwritten_values = OrderedDict()
            for name, value1, value2 in util.zip_items(overwritten_values, reset_values):
                if util.is_nested(name=name):
                    new_overwritten_values[name] = OrderedDict()
                    for inner_name, inner_value1, inner_value2 in util.zip_items(value1, value2):
                        new_overwritten_values[name][inner_name] = tf.concat(
                            values=(inner_value1, inner_value2), axis=0
                        )
                else:
                    new_overwritten_values[name] = tf.concat(values=(value1, value2), axis=0)
            return new_overwritten_values
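Most of these examples iterate dictionaries in lock-step via util.zip_items. A minimal sketch of its presumed behavior, assuming all arguments share the same keys (the helper below is an illustration, not the library's implementation):

from collections import OrderedDict

def zip_items(*dicts):
    # Presumed behavior: yield (key, value_1, ..., value_n) for dicts that
    # share the same keys, preserving the first dict's key order.
    assert all(d.keys() == dicts[0].keys() for d in dicts)
    for name in dicts[0]:
        yield (name,) + tuple(d[name] for d in dicts)

values = OrderedDict(a=1, b=2)
resets = OrderedDict(a=10, b=20)
for name, v1, v2 in zip_items(values, resets):
    print(name, v1, v2)  # prints: a 1 10, then b 2 20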
Example #2
    def tf_sample_actions(self, states, internals, auxiliaries, temperature,
                          return_internals):
        if return_internals:
            embedding, internals = self.network.apply(
                x=states,
                internals=internals,
                return_internals=return_internals)
        else:
            embedding = self.network.apply(x=states,
                                           internals=internals,
                                           return_internals=return_internals)

        Module.update_tensor(name=self.name, tensor=embedding)

        actions = OrderedDict()
        for name, spec, distribution, temp in util.zip_items(
                self.actions_spec, self.distributions, temperature):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            actions[name] = distribution.sample(parameters=parameters,
                                                temperature=temp)

        if return_internals:
            return actions, internals
        else:
            return actions
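For intuition on the temperature argument: a common implementation divides the logits by the temperature before sampling, so low temperatures approach greedy selection and temperature 1.0 recovers the unscaled distribution. A self-contained sketch for a categorical action (sample_categorical is a hypothetical helper, not the distribution.sample shown above):

import tensorflow as tf

def sample_categorical(logits, temperature):
    # Scale logits by 1/temperature; guard against division by zero.
    scaled = logits / tf.maximum(temperature, 1e-6)
    return tf.random.categorical(logits=scaled, num_samples=1)[:, 0]

logits = tf.constant([[1.0, 2.0, 0.5]])
print(sample_categorical(logits, temperature=0.1))  # almost always action 1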
Example #3
    def body(indices, remaining, current_x, current_aggregates):
        current_x = tf.gather(params=x, indices=indices)
        next_x, next_aggregates = self.iterative_step(
            x=current_x, previous=current_aggregates
        )
        with tf.control_dependencies(control_inputs=(current_x, next_x)):
            is_finished = tf.math.equal(x=remaining, y=zeros)
            if isinstance(next_aggregates, dict):
                for name, current_aggregate, next_aggregate in util.zip_items(
                    current_aggregates, next_aggregates
                ):
                    condition = is_finished
                    for _ in range(util.rank(x=current_aggregate) - 1):
                        condition = tf.expand_dims(input=condition, axis=1)
                    next_aggregates[name] = tf.where(
                        condition=condition, x=current_aggregate, y=next_aggregate
                    )
            else:
                condition = is_finished
                for _ in range(util.rank(x=current_aggregates) - 1):
                    condition = tf.expand_dims(input=condition, axis=1)
                next_aggregates = tf.where(
                    condition=condition, x=current_aggregates, y=next_aggregates
                )
            remaining -= tf.where(condition=is_finished, x=zeros, y=ones)
            indices += tf.where(
                condition=tf.math.equal(x=remaining, y=zeros), x=zeros, y=ones
            )
        return indices, remaining, next_x, next_aggregates
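The body above is shaped for tf.while_loop: finished instances are masked with tf.where so they keep their last value while the rest keep stepping. A self-contained sketch of that masking pattern (deliberately simplified, with a plain increment standing in for the library's iterative_step):

import tensorflow as tf

zeros = tf.zeros(shape=(3,), dtype=tf.int32)
ones = tf.ones(shape=(3,), dtype=tf.int32)

def cond(remaining, aggregates):
    return tf.math.reduce_any(input_tensor=remaining > 0)

def body(remaining, aggregates):
    is_finished = tf.math.equal(x=remaining, y=zeros)
    # Finished instances keep their aggregate; the others take one more step.
    aggregates = tf.where(condition=is_finished, x=aggregates, y=aggregates + 1)
    remaining -= tf.where(condition=is_finished, x=zeros, y=ones)
    return remaining, aggregates

remaining = tf.constant([2, 3, 1])  # per-instance steps left
aggregates = tf.zeros(shape=(3,), dtype=tf.int32)
remaining, aggregates = tf.while_loop(cond=cond, body=body, loop_vars=(remaining, aggregates))
print(aggregates.numpy())  # [2 3 1]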
Example #4

    def tf_loss_per_instance(
        self, states, internals, actions, terminal, reward, next_states, next_internals,
        reference=None
    ):
        embedding = self.network.apply(x=states, internals=internals)

        log_probs = list()
        for name, distribution, action in util.zip_items(self.distributions, actions):
            parameters = distribution.parametrize(x=embedding)
            log_prob = distribution.log_probability(parameters=parameters, action=action)
            collapsed_size = util.product(xs=util.shape(log_prob)[1:])
            log_prob = tf.reshape(tensor=log_prob, shape=(-1, collapsed_size))
            log_probs.append(log_prob)

        log_probs = tf.concat(values=log_probs, axis=1)
        if reference is None:
            old_log_probs = tf.stop_gradient(input=log_probs)
        else:
            old_log_probs = reference

        # The ratio equals 1.0 wherever log_probs == old_log_probs, but unlike
        # a constant it still carries a gradient with respect to log_probs.
        prob_ratios = tf.exp(x=(log_probs - old_log_probs))
        prob_ratio_per_instance = tf.reduce_mean(input_tensor=prob_ratios, axis=1)

        likelihood_ratio_clipping = self.likelihood_ratio_clipping.value()

        clipped_prob_ratio_per_instance = tf.clip_by_value(
            t=prob_ratio_per_instance,
            clip_value_min=(1.0 / (1.0 + likelihood_ratio_clipping)),
            clip_value_max=(1.0 + likelihood_ratio_clipping)
        )
        return -tf.minimum(
            x=(prob_ratio_per_instance * reward),
            y=(clipped_prob_ratio_per_instance * reward)
        )
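One detail worth checking numerically: the lower clip bound above is 1 / (1 + epsilon) rather than the 1 - epsilon of the textbook PPO objective, which makes the clipping range symmetric in log-space. A quick check with epsilon = 0.2:

import tensorflow as tf

eps = 0.2
ratio = tf.constant([0.5, 1.0, 1.5])
clipped = tf.clip_by_value(t=ratio, clip_value_min=1.0 / (1.0 + eps), clip_value_max=1.0 + eps)
print(clipped.numpy())  # [0.8333 1.  1.2], i.e. clipped to [1/1.2, 1.2] rather than [0.8, 1.2]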
Example #5
    def tf_entropy(self,
                   states,
                   internals,
                   auxiliaries,
                   reduced=True,
                   include_per_action=False):
        entropies = self.entropies(states=states,
                                   internals=internals,
                                   auxiliaries=auxiliaries)

        for name, spec, entropy in util.zip_items(self.actions_spec,
                                                  entropies):
            entropies[name] = tf.reshape(
                tensor=entropy, shape=(-1, util.product(xs=spec['shape'])))

        entropy = tf.concat(values=tuple(entropies.values()), axis=1)

        if reduced:
            entropy = tf.math.reduce_mean(input_tensor=entropy, axis=1)
            if include_per_action:
                for name in self.actions_spec:
                    entropies[name] = tf.math.reduce_mean(
                        input_tensor=entropies[name], axis=1)

        if include_per_action:
            entropies['*'] = entropy
            return entropies
        else:
            return entropy
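A pattern note that applies to examples #5 through #8: each one flattens the per-action tensor to shape (batch, action size), concatenates along axis 1, optionally reduces over that axis (mean for entropy and action value, sum for log-probability and KL divergence), and, when include_per_action is set, returns a dict whose '*' entry holds the combined result.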
Example #6
    def tf_actions_value(self,
                         states,
                         internals,
                         auxiliaries,
                         actions,
                         reduced=True,
                         include_per_action=False):
        actions_values = self.actions_values(states=states,
                                             internals=internals,
                                             auxiliaries=auxiliaries,
                                             actions=actions)

        for name, spec, actions_value in util.zip_items(
                self.actions_spec, actions_values):
            actions_values[name] = tf.reshape(
                tensor=actions_value,
                shape=(-1, util.product(xs=spec['shape'])))

        actions_value = tf.concat(values=tuple(actions_values.values()),
                                  axis=1)
        if reduced:
            actions_value = tf.math.reduce_mean(input_tensor=actions_value,
                                                axis=1)
            if include_per_action:
                for name in self.actions_spec:
                    actions_values[name] = tf.math.reduce_mean(
                        input_tensor=actions_values[name], axis=1)

        if include_per_action:
            actions_values['*'] = actions_value
            return actions_values
        else:
            return actions_value
Example #7
    def tf_log_probability(self,
                           states,
                           internals,
                           auxiliaries,
                           actions,
                           reduced=True,
                           include_per_action=False):
        log_probabilities = self.log_probabilities(states=states,
                                                   internals=internals,
                                                   auxiliaries=auxiliaries,
                                                   actions=actions)

        for name, spec, log_probability in util.zip_items(
                self.actions_spec, log_probabilities):
            log_probabilities[name] = tf.reshape(
                tensor=log_probability,
                shape=(-1, util.product(xs=spec['shape'])))

        log_probability = tf.concat(values=tuple(log_probabilities.values()),
                                    axis=1)
        if reduced:
            log_probability = tf.math.reduce_sum(input_tensor=log_probability,
                                                 axis=1)

        if include_per_action:
            log_probabilities['*'] = log_probability
            return log_probabilities
        else:
            return log_probability
Example #8
    def tf_kl_divergence(self,
                         states,
                         internals,
                         auxiliaries,
                         other=None,
                         reduced=True,
                         include_per_action=False):
        kl_divergences = self.kl_divergences(states=states,
                                             internals=internals,
                                             auxiliaries=auxiliaries,
                                             other=other)

        for name, spec, kl_divergence in util.zip_items(
                self.actions_spec, kl_divergences):
            kl_divergences[name] = tf.reshape(
                tensor=kl_divergence,
                shape=(-1, util.product(xs=spec['shape'])))

        kl_divergence = tf.concat(values=tuple(kl_divergences.values()),
                                  axis=1)
        if reduced:
            kl_divergence = tf.math.reduce_sum(input_tensor=kl_divergence,
                                               axis=1)

        if include_per_action:
            kl_divergences['*'] = kl_divergence
            return kl_divergences
        else:
            return kl_divergence
Example #9
    def tf_kl_divergences(self, states, internals, auxiliaries, other=None):
        assert other is None or isinstance(other, ParametrizedDistributions)

        embedding = self.network.apply(x=states, internals=internals)
        if other is not None:
            other_embedding = other.network.apply(x=states, internals=internals)

        kl_divergences = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            if other is None:
                other_parameters = tuple(tf.stop_gradient(input=value) for value in parameters)
            elif spec['type'] == 'int':
                other_parameters = other.distributions[name].parametrize(
                    x=other_embedding, mask=mask
                )
            else:
                other_parameters = other.distributions[name].parametrize(x=other_embedding)
            kl_divergences[name] = distribution.kl_divergence(
                parameters1=other_parameters, parameters2=parameters  # KL argument order unverified
            )

        return kl_divergences
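The argument order matters because KL divergence is asymmetric. A self-contained illustration using the closed form for univariate Gaussians (plain math, not a library call):

import tensorflow as tf

def kl_gaussians(mu_p, sigma_p, mu_q, sigma_q):
    # KL(p || q) = log(sigma_q / sigma_p)
    #              + (sigma_p^2 + (mu_p - mu_q)^2) / (2 * sigma_q^2) - 1/2
    return (tf.math.log(sigma_q / sigma_p)
            + (sigma_p ** 2 + (mu_p - mu_q) ** 2) / (2.0 * sigma_q ** 2) - 0.5)

print(kl_gaussians(0.0, 1.0, 1.0, 2.0).numpy())  # ~0.4431
print(kl_gaussians(1.0, 2.0, 0.0, 1.0).numpy())  # ~1.3069, not the same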
Example #10
    def tf_sample_actions(self, states, internals, auxiliaries, deterministic, return_internals):
        if return_internals:
            embedding, internals = self.network.apply(
                x=states, internals=internals, return_internals=return_internals
            )
        else:
            embedding = self.network.apply(
                x=states, internals=internals, return_internals=return_internals
            )

        Module.update_tensor(name=self.name, tensor=embedding)

        actions = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            action = distribution.sample(parameters=parameters, deterministic=deterministic)

            entropy = distribution.entropy(parameters=parameters)
            entropy = tf.reshape(tensor=entropy, shape=(-1, util.product(xs=spec['shape'])))
            mean_entropy = tf.reduce_mean(input_tensor=entropy, axis=1)
            actions[name] = self.add_summary(
                label='entropy', name=(name + '-entropy'), tensor=mean_entropy, pass_tensors=action
            )

        if return_internals:
            return actions, internals
        else:
            return actions
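Note on this sampling variant: add_summary appears to act as a pass-through here, recording the per-action mean entropy under '<name>-entropy' while returning the sampled action (pass_tensors=action) unchanged.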
Example #11
    def tf_states_values(self, states, internals, auxiliaries):
        embedding = self.network.apply(x=states, internals=internals)
        Module.update_tensor(name=self.name, tensor=embedding)

        states_values = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                parameters = distribution.parametrize(x=embedding, mask=mask)
            else:
                parameters = distribution.parametrize(x=embedding)
            states_values[name] = distribution.states_value(parameters=parameters)

        return states_values
Example #12
    def tf_kldiv_reference(self, states, internals, auxiliaries):
        embedding = self.network.apply(x=states, internals=internals)

        kldiv_reference = OrderedDict()
        for name, spec, distribution in util.zip_items(self.actions_spec,
                                                       self.distributions):
            if spec['type'] == 'int':
                mask = auxiliaries[name + '_mask']
                kldiv_reference[name] = distribution.parametrize(x=embedding,
                                                                 mask=mask)
            else:
                kldiv_reference[name] = distribution.parametrize(x=embedding)

        return kldiv_reference
Example #13
    def tf_act(self, states, internals, auxiliaries):
        actions_values = self.actions_values(states=states,
                                             internals=internals,
                                             auxiliaries=auxiliaries)

        actions = OrderedDict()
        for name, spec, action_values in util.zip_items(
                self.actions_spec, actions_values):
            actions[name] = tf.math.argmax(input=action_values,
                                           axis=-1,
                                           output_type=util.tf_dtype(
                                               spec['type']))

        return actions
Example #14

    def tf_reference(
        self, states, internals, actions, terminal, reward, next_states, next_internals
    ):
        embedding = self.network.apply(x=states, internals=internals)

        log_probs = list()
        for name, distribution, action in util.zip_items(self.distributions, actions):
            parameters = distribution.parametrize(x=embedding)
            log_prob = distribution.log_probability(parameters=parameters, action=action)
            collapsed_size = util.product(xs=util.shape(log_prob)[1:])
            log_prob = tf.reshape(tensor=log_prob, shape=(-1, collapsed_size))
            log_probs.append(log_prob)

        log_probs = tf.concat(values=log_probs, axis=1)
        return tf.stop_gradient(input=log_probs)
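The stop-gradient snapshot returned here is presumably what feeds the reference argument of tf_loss_per_instance in example #4, pinning the old policy's log-probabilities while the ratio is differentiated with respect to the current ones.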
Example #15
    def tf_entropy(self, states, internals, auxiliaries, mean=True):
        entropies = self.entropies(states=states,
                                   internals=internals,
                                   auxiliaries=auxiliaries)

        for name, spec, entropy in util.zip_items(self.actions_spec,
                                                  entropies):
            entropies[name] = tf.reshape(
                tensor=entropy, shape=(-1, util.product(xs=spec['shape'])))

        entropy = tf.concat(values=tuple(entropies.values()), axis=1)
        if mean:
            entropy = tf.math.reduce_mean(input_tensor=entropy, axis=1)

        return entropy
Example #16
    def tf_states_values(self, states, internals, auxiliaries):
        if not all(spec['type'] in ('bool', 'int')
                   for spec in self.actions_spec.values()):
            raise NotImplementedError

        actions_values = self.actions_values(states=states,
                                             internals=internals,
                                             auxiliaries=auxiliaries)

        states_values = OrderedDict()
        for name, spec, action_values in util.zip_items(
                self.actions_spec, actions_values):
            states_values[name] = tf.math.reduce_max(
                input_tensor=action_values, axis=-1)

        return states_values
Example #17
    def tf_states_value(self, states, internals, auxiliaries, mean=True):
        states_values = self.states_values(states=states,
                                           internals=internals,
                                           auxiliaries=auxiliaries)

        for name, spec, states_value in util.zip_items(self.actions_spec,
                                                       states_values):
            states_values[name] = tf.reshape(
                tensor=states_value,
                shape=(-1, util.product(xs=spec['shape'])))

        states_value = tf.concat(values=tuple(states_values.values()), axis=1)
        if mean:
            states_value = tf.math.reduce_mean(input_tensor=states_value,
                                               axis=1)

        return states_value
Example #18
    def tf_log_probability(self,
                           states,
                           internals,
                           auxiliaries,
                           actions,
                           mean=True):
        log_probabilities = self.log_probabilities(states=states,
                                                   internals=internals,
                                                   auxiliaries=auxiliaries,
                                                   actions=actions)

        for name, spec, log_probability in util.zip_items(
                self.actions_spec, log_probabilities):
            log_probabilities[name] = tf.reshape(
                tensor=log_probability,
                shape=(-1, util.product(xs=spec['shape'])))

        log_probability = tf.concat(values=tuple(log_probabilities.values()),
                                    axis=1)
        if mean:
            log_probability = tf.math.reduce_mean(input_tensor=log_probability,
                                                  axis=1)

        return log_probability
Example #19
    def tf_kl_divergence(self,
                         states,
                         internals,
                         auxiliaries,
                         other=None,
                         mean=True):
        kl_divergences = self.kl_divergences(states=states,
                                             internals=internals,
                                             auxiliaries=auxiliaries,
                                             other=other)

        for name, spec, kl_divergence in util.zip_items(
                self.actions_spec, kl_divergences):
            kl_divergences[name] = tf.reshape(
                tensor=kl_divergence,
                shape=(-1, util.product(xs=spec['shape'])))

        kl_divergence = tf.concat(values=tuple(kl_divergences.values()),
                                  axis=1)
        if mean:
            kl_divergence = tf.math.reduce_mean(input_tensor=kl_divergence,
                                                axis=1)

        return kl_divergence