Example #1

# NOTE: Imports reconstructed for context; exact module paths depend on the
# TensorForce version this snippet comes from.
import tensorflow as tf

from tensorforce import util
from tensorforce.core.networks import Network
from tensorforce.core.optimizers import Synchronization
from tensorforce.models import DistributionModel


class QModel(DistributionModel):
    """
    Q-value model.
    """
    def __init__(self, states_spec, actions_spec, network_spec, config):
        self.target_sync_frequency = config.target_sync_frequency
        self.target_update_weight = config.target_update_weight

        self.double_q_model = config.double_q_model

        assert config.huber_loss is None or config.huber_loss > 0.0
        self.huber_loss = config.huber_loss

        super(QModel, self).__init__(states_spec=states_spec,
                                     actions_spec=actions_spec,
                                     network_spec=network_spec,
                                     config=config)

    def initialize(self, custom_getter):
        super(QModel, self).initialize(custom_getter)

        # Target network
        self.target_network = Network.from_spec(
            spec=self.network_spec,
            kwargs=dict(scope='target', summary_labels=self.summary_labels))

        # Target network optimizer
        self.target_optimizer = Synchronization(
            sync_frequency=self.target_sync_frequency,
            update_weight=self.target_update_weight)

    def tf_q_value(self, embedding, distr_params, action, name):
        # Mainly for NAF.
        return self.distributions[name].state_action_value(
            distr_params=distr_params, action=action)

    def tf_q_delta(self, q_value, next_q_value, terminal, reward):
        """
        Creates the deltas (or advantage) of the Q values.

        :return: A list of deltas per action
        """
        for _ in range(util.rank(q_value) - 1):
            terminal = tf.expand_dims(input=terminal, axis=1)
            reward = tf.expand_dims(input=reward, axis=1)

        multiples = (1, ) + util.shape(q_value)[1:]
        terminal = tf.tile(input=terminal, multiples=multiples)
        reward = tf.tile(input=reward, multiples=multiples)

        zeros = tf.zeros_like(tensor=next_q_value)
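        # Terminal transitions contribute no bootstrap value, so the target is
        # just the reward; otherwise it is reward + discount * next_q_value.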
        next_q_value = tf.where(condition=terminal,
                                x=zeros,
                                y=(self.discount * next_q_value))

        return reward + next_q_value - q_value  # tf.stop_gradient(q_target)

    def tf_loss_per_instance(self, states, internals, actions, terminal,
                             reward, update):
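        # The sampled batch consists of consecutive timesteps, so states[:-1]
        # are the current states and states[1:] their successors (fed to the
        # target network below).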
        embedding = self.network.apply(
            x={name: state[:-1]
               for name, state in states.items()},
            internals=[internal[:-1] for internal in internals],
            update=update)

        # Both networks use the same internals here; could that be a problem?
        # Otherwise we would need to handle the internals indices correctly everywhere
        target_embedding = self.target_network.apply(
            x={name: state[1:]
               for name, state in states.items()},
            internals=[internal[1:] for internal in internals],
            update=update)

        deltas = list()
        for name, distribution in self.distributions.items():

            distr_params = distribution.parameterize(x=embedding)
            target_distr_params = distribution.parameterize(
                x=target_embedding)  # TODO: separate distribution parameters?

            q_value = self.tf_q_value(embedding=embedding,
                                      distr_params=distr_params,
                                      action=actions[name][:-1],
                                      name=name)

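            # Double Q-learning decouples action selection from evaluation: the
            # greedy action is taken from the online network's distribution
            # parameters and evaluated with the target network below; without
            # double Q, both selection and evaluation use the target network.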
            if self.double_q_model:
                action_taken = distribution.sample(distr_params=distr_params,
                                                   deterministic=True)
            else:
                action_taken = distribution.sample(
                    distr_params=target_distr_params, deterministic=True)

            next_q_value = distribution.state_action_value(
                distr_params=target_distr_params, action=action_taken)

            delta = self.tf_q_delta(q_value=q_value,
                                    next_q_value=next_q_value,
                                    terminal=terminal[:-1],
                                    reward=reward[:-1])

            collapsed_size = util.prod(util.shape(delta)[1:])
            delta = tf.reshape(tensor=delta, shape=(-1, collapsed_size))

            deltas.append(delta)

        # Per-instance surrogate loss: the mean of the per-action Q deltas
        # (temporal-difference errors), squared or Huber-transformed below
        loss_per_instance = tf.reduce_mean(
            input_tensor=tf.concat(values=deltas, axis=1), axis=1)

        # Optional Huber loss
        if self.huber_loss is not None and self.huber_loss > 0.0:
            return tf.where(
                condition=(tf.abs(x=loss_per_instance) <= self.huber_loss),
                x=(0.5 * tf.square(x=loss_per_instance)),
                y=(self.huber_loss *
                   (tf.abs(x=loss_per_instance) - 0.5 * self.huber_loss)))
        else:
            return tf.square(x=loss_per_instance)

    def tf_optimization(self, states, internals, actions, terminal, reward,
                        update):
        optimization = super(QModel, self).tf_optimization(states=states,
                                                           internals=internals,
                                                           actions=actions,
                                                           terminal=terminal,
                                                           reward=reward,
                                                           update=update)

        target_optimization = self.target_optimizer.minimize(
            time=self.timestep,
            variables=self.target_network.get_variables(),
            source_variables=self.network.get_variables())

        return tf.group(optimization, target_optimization)

    def get_variables(self, include_non_trainable=False):
        model_variables = super(QModel, self).get_variables(
            include_non_trainable=include_non_trainable)

        if include_non_trainable:
            # Target network and optimizer variables only included if 'include_non_trainable' set
            target_variables = self.target_network.get_variables(
                include_non_trainable=include_non_trainable)

            target_optimizer_variables = self.target_optimizer.get_variables()

            return model_variables + target_variables + target_optimizer_variables

        else:
            return model_variables

    def get_summaries(self):
        return (super(QModel, self).get_summaries() +
                self.target_network.get_summaries())
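
For intuition, the following stand-alone sketch reproduces the arithmetic of tf_q_delta and the Huber branch above in plain NumPy on a toy batch; the helper names and values are illustrative only and are not part of TensorForce.

import numpy as np


def q_delta(q_value, next_q_value, terminal, reward, discount=0.99):
    # Zero the bootstrap term for terminal transitions, as in tf_q_delta.
    next_q_value = np.where(terminal, 0.0, discount * next_q_value)
    return reward + next_q_value - q_value


def huber(delta, kappa=1.0):
    # Quadratic near zero, linear in the tails, as in tf_loss_per_instance.
    abs_delta = np.abs(delta)
    return np.where(abs_delta <= kappa,
                    0.5 * np.square(delta),
                    kappa * (abs_delta - 0.5 * kappa))


q = np.array([1.0, 0.5, 2.0])        # Q(s, a) from the online network
next_q = np.array([1.2, 0.0, 1.5])   # Q(s', a') from the target network
terminal = np.array([False, True, False])
reward = np.array([0.0, 1.0, -0.5])

print(huber(q_delta(q, next_q, terminal, reward)))
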
Example #2

# NOTE: Imports reconstructed for context; exact module paths depend on the
# TensorForce version this snippet comes from.
import numpy as np
import tensorflow as tf

from tensorforce import TensorForceError, util
from tensorforce.core.networks import Network
from tensorforce.core.optimizers import Synchronization
from tensorforce.models import DistributionModel


class QModel(DistributionModel):
    """
    Q-value model.
    """
    def __init__(
            self,
            states_spec,
            actions_spec,
            network_spec,
            device,
            scope,
            saver_spec,
            summary_spec,
            distributed_spec,
            optimizer,
            discount,
            normalize_rewards,
            variable_noise,
            distributions_spec,
            entropy_regularization,
            target_sync_frequency,
            target_update_weight,
            double_q_model,
            huber_loss,
            # TEMP: Random sampling fix
            random_sampling_fix):
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight

        self.double_q_model = double_q_model

        assert huber_loss is None or huber_loss > 0.0
        self.huber_loss = huber_loss

        # TEMP: Random sampling fix
        self.random_sampling_fix = random_sampling_fix

        super(QModel, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            network_spec=network_spec,
            device=device,
            scope=scope,
            saver_spec=saver_spec,
            summary_spec=summary_spec,
            distributed_spec=distributed_spec,
            optimizer=optimizer,
            discount=discount,
            normalize_rewards=normalize_rewards,
            variable_noise=variable_noise,
            distributions_spec=distributions_spec,
            entropy_regularization=entropy_regularization,
        )

    def initialize(self, custom_getter):
        super(QModel, self).initialize(custom_getter)

        # TEMP: Random sampling fix
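        # The fix introduces explicit 'next-<name>' placeholders so that
        # transitions can be sampled independently, instead of relying on the
        # consecutive [:-1]/[1:] slicing used otherwise in tf_loss_per_instance.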
        if self.random_sampling_fix:
            self.next_state_inputs = dict()
            for name, state in self.states_spec.items():
                self.next_state_inputs[name] = tf.placeholder(
                    dtype=util.tf_dtype(state['type']),
                    shape=(None, ) + tuple(state['shape']),
                    name=('next-' + name))

        # Target network
        self.target_network = Network.from_spec(
            spec=self.network_spec,
            kwargs=dict(scope='target', summary_labels=self.summary_labels))

        # Target network optimizer
        self.target_optimizer = Synchronization(
            sync_frequency=self.target_sync_frequency,
            update_weight=self.target_update_weight)

        # Target network distributions
        self.target_distributions = self.generate_distributions(
            self.actions_spec, self.distributions_spec, self.summary_labels)

    def tf_q_value(self, embedding, distr_params, action, name):
        # Mainly for NAF.
        return self.distributions[name].state_action_value(
            distr_params=distr_params, action=action)

    def tf_q_delta(self, q_value, next_q_value, terminal, reward):
        """
        Creates the deltas (or advantage) of the Q values.

        :return: A list of deltas per action
        """
        for _ in range(util.rank(q_value) - 1):
            terminal = tf.expand_dims(input=terminal, axis=1)
            reward = tf.expand_dims(input=reward, axis=1)

        multiples = (1, ) + util.shape(q_value)[1:]
        terminal = tf.tile(input=terminal, multiples=multiples)
        reward = tf.tile(input=reward, multiples=multiples)

        zeros = tf.zeros_like(tensor=next_q_value)
        next_q_value = tf.where(condition=terminal,
                                x=zeros,
                                y=(self.discount * next_q_value))

        return reward + next_q_value - q_value  # tf.stop_gradient(q_target)

    def tf_loss_per_instance(self, states, internals, actions, terminal,
                             reward, update):
        # TEMP: Random sampling fix
        if self.random_sampling_fix:
            next_states = self.get_states(states=self.next_state_inputs)
            next_states = {
                name: tf.stop_gradient(input=state)
                for name, state in next_states.items()
            }

            embedding, next_internals = self.network.apply(
                x=states,
                internals=internals,
                update=update,
                return_internals=True)

            # Both networks use the same internals here; could that be a problem?
            # Otherwise we would need to handle the internals indices correctly everywhere
            target_embedding = self.target_network.apply(
                x=next_states, internals=next_internals, update=update)

        else:
            embedding = self.network.apply(
                x={name: state[:-1]
                   for name, state in states.items()},
                internals=[internal[:-1] for internal in internals],
                update=update)

            # Both networks use the same internals here; could that be a problem?
            # Otherwise we would need to handle the internals indices correctly everywhere
            target_embedding = self.target_network.apply(
                x={name: state[1:]
                   for name, state in states.items()},
                internals=[internal[1:] for internal in internals],
                update=update)

            actions = {name: action[:-1] for name, action in actions.items()}
            terminal = terminal[:-1]
            reward = reward[:-1]

        deltas = list()
        for name, distribution in self.distributions.items():
            target_distribution = self.target_distributions[name]

            distr_params = distribution.parameterize(x=embedding)
            target_distr_params = target_distribution.parameterize(
                x=target_embedding)

            q_value = self.tf_q_value(embedding=embedding,
                                      distr_params=distr_params,
                                      action=actions[name],
                                      name=name)

            if self.double_q_model:
                action_taken = distribution.sample(distr_params=distr_params,
                                                   deterministic=True)
            else:
                action_taken = target_distribution.sample(
                    distr_params=target_distr_params, deterministic=True)

            next_q_value = target_distribution.state_action_value(
                distr_params=target_distr_params, action=action_taken)

            delta = self.tf_q_delta(q_value=q_value,
                                    next_q_value=next_q_value,
                                    terminal=terminal,
                                    reward=reward)

            collapsed_size = util.prod(util.shape(delta)[1:])
            delta = tf.reshape(tensor=delta, shape=(-1, collapsed_size))

            deltas.append(delta)

        # Per-instance surrogate loss: the mean of the per-action Q deltas
        # (temporal-difference errors), squared or Huber-transformed below
        loss_per_instance = tf.reduce_mean(
            input_tensor=tf.concat(values=deltas, axis=1), axis=1)

        # Optional Huber loss
        if self.huber_loss is not None and self.huber_loss > 0.0:
            return tf.where(
                condition=(tf.abs(x=loss_per_instance) <= self.huber_loss),
                x=(0.5 * tf.square(x=loss_per_instance)),
                y=(self.huber_loss *
                   (tf.abs(x=loss_per_instance) - 0.5 * self.huber_loss)))
        else:
            return tf.square(x=loss_per_instance)

    def tf_optimization(self, states, internals, actions, terminal, reward,
                        update):
        optimization = super(QModel, self).tf_optimization(states=states,
                                                           internals=internals,
                                                           actions=actions,
                                                           terminal=terminal,
                                                           reward=reward,
                                                           update=update)

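        # The target copy is synchronized with both the online network's
        # variables and its distribution parameter variables.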
        network_distributions_variables = self.get_distributions_variables(
            self.distributions)
        target_distributions_variables = self.get_distributions_variables(
            self.target_distributions)

        target_optimization = self.target_optimizer.minimize(
            time=self.timestep,
            variables=self.target_network.get_variables() +
            target_distributions_variables,
            source_variables=self.network.get_variables() +
            network_distributions_variables)

        return tf.group(optimization, target_optimization)

    def get_variables(self, include_non_trainable=False):
        model_variables = super(QModel, self).get_variables(
            include_non_trainable=include_non_trainable)

        if include_non_trainable:
            # Target network and optimizer variables only included if 'include_non_trainable' set
            target_variables = self.target_network.get_variables(
                include_non_trainable=include_non_trainable)
            target_distributions_variables = self.get_distributions_variables(
                self.target_distributions)
            target_optimizer_variables = self.target_optimizer.get_variables()

            return model_variables + target_variables + target_optimizer_variables + target_distributions_variables

        else:
            return model_variables

    def get_summaries(self):
        target_distributions_summaries = self.get_distributions_summaries(
            self.target_distributions)
        return (super(QModel, self).get_summaries() +
                self.target_network.get_summaries() +
                target_distributions_summaries)

    # TEMP: Random sampling fix
    def update(self,
               states,
               internals,
               actions,
               terminal,
               reward,
               return_loss_per_instance=False):
        fetches = [self.optimization]

        # Optionally fetch loss per instance
        if return_loss_per_instance:
            fetches.append(self.loss_per_instance)

        terminal = np.asarray(terminal)
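        # A 1-D terminal array indicates a batch of transitions; a scalar
        # indicates a single, unbatched observation.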
        batched = (terminal.ndim == 1)
        if batched:
            # TEMP: Random sampling fix
            if self.random_sampling_fix:
                feed_dict = {
                    state_input: states[name][0]
                    for name, state_input in self.state_inputs.items()
                }
                feed_dict.update({
                    state_input: states[name][1]
                    for name, state_input in self.next_state_inputs.items()
                })
            else:
                feed_dict = {
                    state_input: states[name]
                    for name, state_input in self.state_inputs.items()
                }
            feed_dict.update({
                internal_input: internals[n]
                for n, internal_input in enumerate(self.internal_inputs)
            })
            feed_dict.update({
                action_input: actions[name]
                for name, action_input in self.action_inputs.items()
            })
            feed_dict[self.terminal_input] = terminal
            feed_dict[self.reward_input] = reward
        else:
            # TEMP: Random sampling fix
            if self.random_sampling_fix:
                raise TensorForceError("Unbatched version not covered by fix.")
            else:
                feed_dict = {
                    state_input: (states[name], )
                    for name, state_input in self.state_inputs.items()
                }
            feed_dict.update({
                internal_input: (internals[n], )
                for n, internal_input in enumerate(self.internal_inputs)
            })
            feed_dict.update({
                action_input: (actions[name], )
                for name, action_input in self.action_inputs.items()
            })
            feed_dict[self.terminal_input] = (terminal, )
            feed_dict[self.reward_input] = (reward, )

        feed_dict[self.deterministic_input] = True
        feed_dict[self.update_input] = True

        fetched = self.monitored_session.run(fetches=fetches,
                                             feed_dict=feed_dict)

        if return_loss_per_instance:
            return fetched[1]
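
As a rough illustration of what the Synchronization optimizer above is configured to do (move the target variables a fraction update_weight towards the online variables every sync_frequency timesteps), here is a minimal NumPy sketch; the function is hypothetical and not part of the TensorForce API.

import numpy as np


def sync_target(target_vars, source_vars, timestep,
                sync_frequency=10000, update_weight=1.0):
    # Only update every `sync_frequency` timesteps.
    if timestep % sync_frequency != 0:
        return target_vars
    # Partial (soft) update; update_weight=1.0 amounts to a hard copy.
    return [t + update_weight * (s - t)
            for t, s in zip(target_vars, source_vars)]


online = [np.ones((2, 2)), np.zeros(2)]
target = [np.zeros((2, 2)), np.ones(2)]
target = sync_target(target, online, timestep=20000,
                     sync_frequency=10000, update_weight=0.1)
print(target)
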
Example #3

# NOTE: Imports reconstructed for context; exact module paths depend on the
# TensorForce version this snippet comes from. DDPGCriticNetwork is defined
# alongside DPGTargetModel in the TensorForce source and is assumed to be in
# scope here.
import tensorflow as tf

from tensorforce.core.networks import Network
from tensorforce.core.optimizers import Optimizer, Synchronization
from tensorforce.models import DistributionModel


class DPGTargetModel(DistributionModel):
    """
    Deterministic policy gradient model with target actor and critic networks (e.g. DDPG)
    """

    COMPONENT_CRITIC = "critic"
    COMPONENT_TARGET_NETWORK = "target_network"
    COMPONENT_TARGET_DISTRIBUTION = "target_distribution"

    def __init__(self, states, actions, scope, device, saver, summarizer,
                 execution, batching_capacity, variable_noise,
                 states_preprocessing, actions_exploration,
                 reward_preprocessing, update_mode, memory, optimizer,
                 discount, network, distributions, entropy_regularization,
                 critic_network, critic_optimizer, target_sync_frequency,
                 target_update_weight):

        self.critic_network_spec = critic_network
        self.critic_optimizer_spec = critic_optimizer

        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight

        # self.network is the actor; self.critic_network is the critic
        self.target_network = None
        self.target_network_optimizer = None

        self.critic_network = None
        self.critic_optimizer = None
        self.target_critic_network = None
        self.target_critic_optimizer = None

        super(DPGTargetModel,
              self).__init__(states=states,
                             actions=actions,
                             scope=scope,
                             device=device,
                             saver=saver,
                             summarizer=summarizer,
                             execution=execution,
                             batching_capacity=batching_capacity,
                             variable_noise=variable_noise,
                             states_preprocessing=states_preprocessing,
                             actions_exploration=actions_exploration,
                             reward_preprocessing=reward_preprocessing,
                             update_mode=update_mode,
                             memory=memory,
                             optimizer=optimizer,
                             discount=discount,
                             network=network,
                             distributions=distributions,
                             entropy_regularization=entropy_regularization,
                             requires_deterministic=True)

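        # DDPG requires next states stored in memory and deterministic action
        # sampling for the actor update.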
        assert self.memory_spec["include_next_states"]
        assert self.requires_deterministic

    def setup_components_and_tf_funcs(self, custom_getter=None):
        custom_getter = super(
            DPGTargetModel, self).setup_components_and_tf_funcs(custom_getter)

        # Target network
        self.target_network = Network.from_spec(
            spec=self.network_spec,
            kwargs=dict(scope='target-network',
                        summary_labels=self.summary_labels))

        # Target network optimizer
        self.target_network_optimizer = Synchronization(
            sync_frequency=self.target_sync_frequency,
            update_weight=self.target_update_weight)

        # Target network distributions
        self.target_distributions = self.create_distributions()

        # Critic
        size_t0 = self.critic_network_spec['size_t0']
        size_t1 = self.critic_network_spec['size_t1']

        self.critic_network = DDPGCriticNetwork(scope='critic',
                                                size_t0=size_t0,
                                                size_t1=size_t1)
        self.critic_optimizer = Optimizer.from_spec(
            spec=self.critic_optimizer_spec,
            kwargs=dict(summary_labels=self.summary_labels))

        self.target_critic_network = DDPGCriticNetwork(scope='target-critic',
                                                       size_t0=size_t0,
                                                       size_t1=size_t1)

        # Target critic optimizer
        self.target_critic_optimizer = Synchronization(
            sync_frequency=self.target_sync_frequency,
            update_weight=self.target_update_weight)

        self.fn_target_actions_and_internals = tf.make_template(
            name_='target-actions-and-internals',
            func_=self.tf_target_actions_and_internals,
            custom_getter_=custom_getter)

        self.fn_predict_target_q = tf.make_template(
            name_='predict-target-q',
            func_=self.tf_predict_target_q,
            custom_getter_=custom_getter)
        return custom_getter

    def tf_target_actions_and_internals(self,
                                        states,
                                        internals,
                                        deterministic=True):
        embedding, internals = self.target_network.apply(
            x=states,
            internals=internals,
            update=tf.constant(value=False),
            return_internals=True)

        actions = dict()
        for name in sorted(self.target_distributions):
            distribution = self.target_distributions[name]
            distr_params = distribution.parameterize(x=embedding)
            actions[name] = distribution.sample(
                distr_params=distr_params,
                deterministic=tf.logical_or(x=deterministic,
                                            y=self.requires_deterministic))

        return actions, internals

    def tf_loss_per_instance(self,
                             states,
                             internals,
                             actions,
                             terminal,
                             reward,
                             next_states,
                             next_internals,
                             update,
                             reference=None):
        q = self.critic_network.apply(dict(states=states, actions=actions),
                                      internals=internals,
                                      update=update)
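        # Minimizing -Q(s, mu(s)) is gradient ascent on the critic's value with
        # respect to the actor parameters (the deterministic policy gradient).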
        return -q

    def tf_predict_target_q(self, states, internals, terminal, actions, reward,
                            update):
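        # Target value: reward + discount * (1 - terminal) * Q_target(s', a'),
        # where a' comes from the target actor.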
        q_value = self.target_critic_network.apply(dict(states=states,
                                                        actions=actions),
                                                   internals=internals,
                                                   update=update)
        return reward + (
            1. - tf.cast(terminal, dtype=tf.float32)) * self.discount * q_value

    def tf_optimization(self,
                        states,
                        internals,
                        actions,
                        terminal,
                        reward,
                        next_states=None,
                        next_internals=None):
        update = tf.constant(value=True)

        # Predict actions from target actor
        next_target_actions, next_target_internals = self.fn_target_actions_and_internals(
            states=next_states, internals=next_internals, deterministic=True)

        # Predicted Q value of next states
        predicted_q = self.fn_predict_target_q(states=next_states,
                                               internals=next_internals,
                                               actions=next_target_actions,
                                               terminal=terminal,
                                               reward=reward,
                                               update=update)

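        # Treat the target value as a constant so that critic gradients do not
        # flow into the target networks.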
        predicted_q = tf.stop_gradient(input=predicted_q)

        real_q = self.critic_network.apply(dict(states=states,
                                                actions=actions),
                                           internals=internals,
                                           update=update)

        # Update critic
        def fn_critic_loss(predicted_q, real_q):
            return tf.reduce_mean(tf.square(real_q - predicted_q))

        critic_optimization = self.critic_optimizer.minimize(
            time=self.timestep,
            variables=self.critic_network.get_variables(),
            arguments=dict(predicted_q=predicted_q, real_q=real_q),
            fn_loss=fn_critic_loss)

        # Update actor
        predicted_actions, predicted_internals = self.fn_actions_and_internals(
            states=states, internals=internals, deterministic=True)

        optimization = super(DPGTargetModel, self).tf_optimization(
            states=states,
            internals=internals,
            actions=predicted_actions,
            terminal=terminal,
            reward=reward,
            next_states=next_states,
            next_internals=next_internals)

        # Update target actor (network) and critic
        network_distributions_variables = [
            variable for name in sorted(self.distributions)
            for variable in self.distributions[name].get_variables(
                include_nontrainable=False)
        ]

        target_distributions_variables = [
            variable for name in sorted(self.target_distributions)
            for variable in self.target_distributions[name].get_variables(
                include_nontrainable=False)
        ]

        target_optimization = self.target_network_optimizer.minimize(
            time=self.timestep,
            variables=self.target_network.get_variables() +
            target_distributions_variables,
            source_variables=self.network.get_variables() +
            network_distributions_variables)

        target_critic_optimization = self.target_critic_optimizer.minimize(
            time=self.timestep,
            variables=self.target_critic_network.get_variables(),
            source_variables=self.critic_network.get_variables())

        return tf.group(critic_optimization, optimization, target_optimization,
                        target_critic_optimization)

    def get_variables(self,
                      include_submodules=False,
                      include_nontrainable=False):
        model_variables = super(DPGTargetModel, self).get_variables(
            include_submodules=include_submodules,
            include_nontrainable=include_nontrainable)
        critic_variables = self.critic_network.get_variables(
            include_nontrainable=include_nontrainable)
        model_variables += critic_variables

        if include_nontrainable:
            critic_optimizer_variables = self.critic_optimizer.get_variables()

            for variable in critic_optimizer_variables:
                if variable in model_variables:
                    model_variables.remove(variable)

            model_variables += critic_optimizer_variables

        if include_submodules:
            target_variables = self.target_network.get_variables(
                include_nontrainable=include_nontrainable)
            model_variables += target_variables

            target_distributions_variables = [
                variable for name in sorted(self.target_distributions)
                for variable in self.target_distributions[name].get_variables(
                    include_nontrainable=include_nontrainable)
            ]
            model_variables += target_distributions_variables

            target_critic_variables = self.target_critic_network.get_variables(
                include_nontrainable=include_nontrainable)
            model_variables += target_critic_variables

            if include_nontrainable:
                target_optimizer_variables = (
                    self.target_network_optimizer.get_variables())
                model_variables += target_optimizer_variables

                target_critic_optimizer_variables = (
                    self.target_critic_optimizer.get_variables())
                model_variables += target_critic_optimizer_variables

        return model_variables

    def get_components(self):
        result = dict(super(DPGTargetModel, self).get_components())
        result[DPGTargetModel.COMPONENT_CRITIC] = self.critic_network
        result[DPGTargetModel.COMPONENT_TARGET_NETWORK] = self.target_network
        for name in sorted(self.target_distributions):
            result["%s_%s" % (DPGTargetModel.COMPONENT_TARGET_DISTRIBUTION,
                              name)] = self.target_distributions[name]
        if len(self.target_distributions) == 1:
            only_name = next(iter(sorted(self.target_distributions)))
            result[DPGTargetModel.COMPONENT_TARGET_DISTRIBUTION] = (
                self.target_distributions[only_name])
        return result
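
To summarize the update performed in tf_optimization above, here is a minimal NumPy sketch of one DDPG step using toy linear "networks"; every name and value in it is illustrative and unrelated to TensorForce.

import numpy as np

# Toy linear "networks": Q(s, a) = w_critic . [s, a] and mu(s) = w_actor * s.
w_actor, w_critic = 0.1, np.array([0.2, 0.3])
w_target_actor, w_target_critic = 0.1, np.array([0.2, 0.3])
discount, tau, lr = 0.99, 0.01, 1e-2


def ddpg_step(s, a, r, terminal, s_next):
    global w_actor, w_target_actor
    # Critic target from target actor and target critic (treated as constant).
    a_next = w_target_actor * s_next
    target_q = r + discount * (1.0 - terminal) * w_target_critic.dot([s_next, a_next])
    # Critic update: gradient step on the squared TD error.
    q = w_critic.dot([s, a])
    w_critic[:] -= lr * 2.0 * (q - target_q) * np.array([s, a])
    # Actor update: ascend Q(s, mu(s)); dQ/dw_actor = dQ/da * da/dw_actor.
    w_actor += lr * w_critic[1] * s
    # Soft target updates (i.e. sync every step with update weight tau).
    w_target_critic[:] += tau * (w_critic - w_target_critic)
    w_target_actor += tau * (w_actor - w_target_actor)


ddpg_step(s=1.0, a=0.5, r=1.0, terminal=0.0, s_next=0.8)
print(w_actor, w_critic)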