Example #1
    def initialize(self, custom_getter):
        super(QDemoModel, self).initialize(custom_getter=custom_getter)

        self.demo_memory = Replay(states=self.states_spec,
                                  internals=self.internals_spec,
                                  actions=self.actions_spec,
                                  include_next_states=True,
                                  capacity=self.demo_memory_capacity,
                                  scope='demo-replay',
                                  summary_labels=self.summary_labels)

        # Import demonstration optimization.
        self.fn_import_demo_experience = tf.make_template(
            name_='import-demo-experience',
            func_=self.tf_import_demo_experience,
            custom_getter_=custom_getter)

        # Demonstration loss.
        self.fn_demo_loss = tf.make_template(name_='demo-loss',
                                             func_=self.tf_demo_loss,
                                             custom_getter_=custom_getter)

        # Combined loss.
        self.fn_combined_loss = tf.make_template(name_='combined-loss',
                                                 func_=self.tf_combined_loss,
                                                 custom_getter_=custom_getter)

        # Demonstration optimization.
        self.fn_demo_optimization = tf.make_template(
            name_='demo-optimization',
            func_=self.tf_demo_optimization,
            custom_getter_=custom_getter)
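
A note on the tf.make_template calls in this and the later QDemoModel examples: each template wraps one of the tf_* methods so that any variables created inside it are created once and then reused on later calls. A minimal standalone sketch of that behaviour, assuming the TensorFlow 1.x API and a hypothetical scale function that is not part of the library:

import tensorflow as tf  # TensorFlow 1.x

def scale(x):
    # A variable created via get_variable inside a template is built once and then reused.
    w = tf.get_variable(name='w', shape=(), initializer=tf.ones_initializer())
    return w * x

fn_scale = tf.make_template(name_='scale', func_=scale)

a = fn_scale(tf.constant(2.0))  # creates scale/w
b = fn_scale(tf.constant(3.0))  # reuses scale/w instead of creating a new variable

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run([a, b]))  # [2.0, 3.0]
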
Example #2
    def __init__(self, states_spec, actions_spec, network_spec, config):
        self.network_spec = network_spec
        config = config.copy()
        config.default(DQFDAgent.default_config)

        # DQFD always uses double DQN, which is a required key for a Q-model.
        config.obligatory(double_dqn=True)
        self.target_update_frequency = config.target_update_frequency
        self.demo_memory_capacity = config.demo_memory_capacity

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs. online training samples:
        # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(config.demo_sampling_ratio * config.batch_size / (1.0 - config.demo_sampling_ratio))

        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
                                         'demo_batch_size is positive. (Calculated {} based on current' \
                                         ' parameters)'.format(self.demo_batch_size)

        super(DQFDAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            config=config
        )

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop. It must be created after the super call so that
        # self.states_spec and self.actions_spec exist.
        self.demo_memory = Replay(self.demo_memory_capacity, self.states_spec, self.actions_spec)
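
For reference, the demo_batch_size expression used throughout these examples follows from solving p = n_demo / (n_demo + n_replay) for n_demo. A minimal sketch with hypothetical values, not taken from any snippet here:

# Hypothetical values, only to illustrate n_demo = p * n_replay / (1 - p).
demo_sampling_ratio = 0.2   # p: fraction of each update that should be expert data
batch_size = 32             # n_replay: online samples per update

demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))
assert demo_batch_size > 0, 'demo_batch_size must be positive'
print(demo_batch_size)      # 8 -> 8 / (8 + 32) = 0.2, matching the requested ratio
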
Example #3
 def __init__(self, config):
     config.default(PPOModel.default_config)
     super(PPOModel, self).__init__(config)
     self.epochs = config.epochs
     self.optimizer_batch_size = config.optimizer_batch_size
     # Use replay memory so memory logic can be used to sample batches
     self.memory = Replay(config.batch_size, config.states, config.actions,
                          config.random_sampling)
Example #4
    def __init__(self, config, model=None):
        config.default(DQFDAgent.default_config)
        super(DQFDAgent, self).__init__(config, model)
        self.target_update_frequency = config.target_update_frequency

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop
        self.demo_memory = Replay(config.demo_memory_capacity, config.states, config.actions)

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs. online training samples:
        # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(config.demo_sampling_ratio * config.batch_size / (1.0 - config.demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to make sure demo_batch_size is positive. (Calculated {} based on current parameters)'.format(self.demo_batch_size)
Example #5
 def __init__(self, config):
     config.default(PPOModel.default_config)
     super(PPOModel, self).__init__(config)
     self.optimizer_batch_size = config.optimizer_batch_size
     self.batch_size = config.batch_size
     self.updates = int(
         config.batch_size / self.optimizer_batch_size) * config.epochs
     if self.batch_size % self.optimizer_batch_size != 0:
         raise TensorForceError(
             'batch_size must be a multiple of optimizer_batch_size')
     # Use replay memory as a cache so it can be used to sample minibatches
     self.memory = Replay(config.batch_size, config.states, config.actions,
                          config.random_sampling)
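
The `updates` count in this example is simply (batch_size / optimizer_batch_size) * epochs, which is why batch_size must be a multiple of optimizer_batch_size. A quick sketch with hypothetical numbers:

# Hypothetical configuration values, chosen only to illustrate the computation.
batch_size = 4096
optimizer_batch_size = 128
epochs = 10

if batch_size % optimizer_batch_size != 0:
    raise ValueError('batch_size must be a multiple of optimizer_batch_size')

updates = int(batch_size / optimizer_batch_size) * epochs
print(updates)  # 320 minibatch updates per collected on-policy batch
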
Example #6
    def __init__(self, config):
        config.default(PPOModel.default_config)
        super(PPOModel, self).__init__(config)
        self.optimizer_batch_size = config.optimizer_batch_size
        # Use replay memory so memory logic can be used to sample batches

        if self.optimizer_batch_size > config.batch_size:
            raise Exception(
                "optimizer_batch_size > batch_size ({}, {})".format(
                    self.optimizer_batch_size, config.batch_size))
        self.updates = int(
            config.batch_size / self.optimizer_batch_size) * config.epochs
        self.memory = Replay(config.batch_size, config.states, config.actions,
                             config.random_sampling)
Example #7
    def __init__(self, config):
        config.default(DQFDAgent.default_config)
        super(DQFDAgent, self).__init__(config)
        self.target_update_frequency = config.target_update_frequency

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop
        self.demo_memory = Replay(config.demo_memory_capacity, config.states,
                                  config.actions)

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs. online training samples:
        # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(config.demo_sampling_ratio *
                                   config.batch_size /
                                   (1.0 - config.demo_sampling_ratio))
Example #8
    def setup_components_and_tf_funcs(self, custom_getter=None):
        """
        Constructs the extra Replay memory.
        """
        custom_getter = super(QDemoModel, self).setup_components_and_tf_funcs(custom_getter)

        self.demo_memory = Replay(
            states=self.states_spec,
            internals=self.internals_spec,
            actions=self.actions_spec,
            include_next_states=True,
            capacity=self.demo_memory_capacity,
            scope='demo-replay',
            summary_labels=self.summary_labels
        )

        # Import demonstration optimization.
        self.fn_import_demo_experience = tf.make_template(
            name_='import-demo-experience',
            func_=self.tf_import_demo_experience,
            custom_getter_=custom_getter
        )

        # Demonstration loss.
        self.fn_demo_loss = tf.make_template(
            name_='demo-loss',
            func_=self.tf_demo_loss,
            custom_getter_=custom_getter
        )

        # Combined loss.
        self.fn_combined_loss = tf.make_template(
            name_='combined-loss',
            func_=self.tf_combined_loss,
            custom_getter_=custom_getter
        )

        # Demonstration optimization.
        self.fn_demo_optimization = tf.make_template(
            name_='demo-optimization',
            func_=self.tf_demo_optimization,
            custom_getter_=custom_getter
        )

        return custom_getter
Example #9
    def __init__(self, config):
        config.default(DQFDAgent.default_config)
        super(DQFDAgent, self).__init__(config)
        self.target_update_frequency = config.target_update_frequency

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop
        self.demo_memory = Replay(config.demo_memory_capacity, config.states, config.actions)

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs. online training samples:
        # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(config.demo_sampling_ratio * config.batch_size / (1.0 - config.demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to make sure demo_batch_size is positive. ' \
                                         '(Calculated {} based on current parameters)'.format(self.demo_batch_size)
Example #10
class PPOModel(PolicyGradientModel):
    allows_discrete_actions = True
    allows_continuous_actions = True

    default_config = dict(
        entropy_penalty=0.01,
        loss_clipping=0.1,  # Trust region clipping
        epochs=10,  # Number of training epochs for SGD
        optimizer_batch_size=128,  # Batch size for optimiser
        random_sampling=True  # Sampling strategy for replay memory
    )

    def __init__(self, config):
        config.default(PPOModel.default_config)
        super(PPOModel, self).__init__(config)
        self.optimizer_batch_size = config.optimizer_batch_size
        # Use replay memory so memory logic can be used to sample batches

        self.updates = int(
            config.batch_size / self.optimizer_batch_size) * config.epochs
        self.memory = Replay(config.batch_size, config.states, config.actions,
                             config.random_sampling)

    def create_tf_operations(self, config):
        """
        Creates PPO training operations, i.e. the SGD update
        based on the trust region loss.
        :return:
        """
        super(PPOModel, self).create_tf_operations(config)

        with tf.variable_scope('update'):
            prob_ratios = list()
            entropy_penalties = list()
            kl_divergences = list()
            entropies = list()

            for name, action in self.action.items():
                distribution = self.distribution[name]
                prev_distribution = tuple(
                    tf.placeholder(dtype=tf.float32,
                                   shape=util.shape(x, unknown=None))
                    for x in distribution)
                self.internal_inputs.extend(prev_distribution)
                self.internal_outputs.extend(distribution)
                self.internal_inits.extend(
                    np.zeros(shape=util.shape(x)[1:]) for x in distribution)
                prev_distribution = self.distribution[
                    name].__class__.from_tensors(
                        parameters=prev_distribution,
                        deterministic=self.deterministic)

                shape_size = util.prod(config.actions[name].shape)

                # Standard policy gradient log likelihood computation
                log_prob = distribution.log_probability(action=action)
                prev_log_prob = prev_distribution.log_probability(
                    action=action)
                log_prob_diff = tf.minimum(x=(log_prob - prev_log_prob),
                                           y=10.0)
                prob_ratio = tf.exp(x=log_prob_diff)
                prob_ratio = tf.reshape(tensor=prob_ratio,
                                        shape=(-1, shape_size))
                prob_ratios.append(prob_ratio)

                entropy = distribution.entropy()

                entropy_penalty = -config.entropy_penalty * entropy
                entropy_penalty = tf.reshape(tensor=entropy_penalty,
                                             shape=(-1, shape_size))
                entropy_penalties.append(entropy_penalty)

                entropy = tf.reshape(tensor=entropy, shape=(-1, shape_size))
                entropies.append(entropy)

                kl_divergence = distribution.kl_divergence(prev_distribution)
                kl_divergence = tf.reshape(tensor=kl_divergence,
                                           shape=(-1, shape_size))
                kl_divergences.append(kl_divergence)

            # The surrogate loss in PPO is the minimum of clipped loss and
            # target advantage * prob_ratio, which is the CPO loss
            # Presentation on conservative policy iteration:
            # https://www.cs.cmu.edu/~jcl/presentation/RL/RL.ps
            prob_ratio = tf.reduce_mean(input_tensor=tf.concat(
                values=prob_ratios, axis=1),
                                        axis=1)
            prob_ratio = tf.clip_by_value(prob_ratio,
                                          1.0 - config.loss_clipping,
                                          1.0 + config.loss_clipping)
            self.loss_per_instance = -prob_ratio * self.reward
            self.surrogate_loss = tf.reduce_mean(
                input_tensor=self.loss_per_instance, axis=0)
            tf.losses.add_loss(self.surrogate_loss)

            # Mean over actions, mean over batch
            entropy_penalty = tf.reduce_mean(input_tensor=tf.concat(
                values=entropy_penalties, axis=1),
                                             axis=1)
            self.entropy_penalty = tf.reduce_mean(input_tensor=entropy_penalty,
                                                  axis=0)
            tf.losses.add_loss(self.entropy_penalty)

            entropy = tf.reduce_mean(input_tensor=tf.concat(values=entropies,
                                                            axis=1),
                                     axis=1)
            self.entropy = tf.reduce_mean(input_tensor=entropy, axis=0)

            kl_divergence = tf.reduce_mean(input_tensor=tf.concat(
                values=kl_divergences, axis=1),
                                           axis=1)
            self.kl_divergence = tf.reduce_mean(input_tensor=kl_divergence,
                                                axis=0)

    def update(self, batch):
        """
        Compute update for one batch of experiences using general advantage estimation
        and the trust region update based on SGD on the clipped loss.

        :param batch: On policy batch of experiences.
        :return:
        """

        # Compute GAE.
        self.advantage_estimation(batch)

        if self.baseline:
            self.baseline.update(states=batch['states'],
                                 returns=batch['returns'])

        # Set memory contents to batch contents
        self.memory.set_memory(states=batch['states'],
                               actions=batch['actions'],
                               rewards=batch['rewards'],
                               terminals=batch['terminals'],
                               internals=batch['internals'])

        # PPO takes multiple passes over the on-policy batch.
        # We use a memory that samples random ranges (as opposed to keeping
        # track of indices and, e.g., first taking elements 0-15, then 16-31, etc.).
        for i in xrange(self.updates):
            self.logger.debug('Optimising PPO, update = {}'.format(i))
            batch = self.memory.get_batch(self.optimizer_batch_size)

            fetches = [
                self.optimize, self.loss, self.loss_per_instance,
                self.kl_divergence, self.entropy
            ]

            feed_dict = {
                state: batch['states'][name]
                for name, state in self.state.items()
            }
            feed_dict.update({
                action: batch['actions'][name]
                for name, action in self.action.items()
            })
            feed_dict[self.reward] = batch['rewards']
            feed_dict[self.terminal] = batch['terminals']
            feed_dict.update({
                internal: batch['internals'][n]
                for n, internal in enumerate(self.internal_inputs)
            })

            loss, loss_per_instance, kl_divergence, entropy = self.session.run(
                fetches=fetches, feed_dict=feed_dict)[1:5]

            self.logger.debug('Loss = {}'.format(loss))
            self.logger.debug('KL divergence = {}'.format(kl_divergence))
            self.logger.debug('Entropy = {}'.format(entropy))

        return loss, loss_per_instance
Example #11
    def __init__(
            self,
            states_spec,
            actions_spec,
            batched_observe=1000,
            scope='dqfd',
            # parameters specific to LearningAgents
            summary_spec=None,
            network_spec=None,
            device=None,
            session_config=None,
            saver_spec=None,
            distributed_spec=None,
            optimizer=None,
            discount=0.99,
            variable_noise=None,
            states_preprocessing_spec=None,
            explorations_spec=None,
            reward_preprocessing_spec=None,
            distributions_spec=None,
            entropy_regularization=None,
            # parameters specific to MemoryAgents
            batch_size=32,
            memory=None,
            first_update=10000,
            update_frequency=4,
            repeat_update=1,
            # parameters specific to DQFD agents
            target_sync_frequency=10000,
            target_update_weight=1.0,
            huber_loss=None,
            expert_margin=0.5,
            supervised_weight=0.1,
            demo_memory_capacity=10000,
            demo_sampling_ratio=0.2):
        """
        Deep Q-learning from demonstration (DQFD) agent ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
        This agent uses DQN to pre-train from demonstration data in combination with a supervised loss.

        Args:
            target_sync_frequency: Interval between optimization calls synchronizing the target network.
            target_update_weight: Update weight, 1.0 meaning a full assignment to target network from training network.
            huber_loss: Optional float specifying Huber-loss clipping.
            expert_margin: Positive float specifying enforced supervised margin between expert action Q-value and other
                Q-values.
            supervised_weight: Weight of supervised loss term.
            demo_memory_capacity: Int describing capacity of expert demonstration memory.
            demo_sampling_ratio: Runtime sampling ratio of expert data.
        """
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight
        self.huber_loss = huber_loss

        self.expert_margin = expert_margin
        self.supervised_weight = supervised_weight

        super(DQFDAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            batched_observe=batched_observe,
            scope=scope,
            # parameters specific to LearningAgent
            summary_spec=summary_spec,
            network_spec=network_spec,
            discount=discount,
            device=device,
            session_config=session_config,
            saver_spec=saver_spec,
            distributed_spec=distributed_spec,
            optimizer=optimizer,
            variable_noise=variable_noise,
            states_preprocessing_spec=states_preprocessing_spec,
            explorations_spec=explorations_spec,
            reward_preprocessing_spec=reward_preprocessing_spec,
            distributions_spec=distributions_spec,
            entropy_regularization=entropy_regularization,
            # parameters specific to MemoryAgents
            batch_size=batch_size,
            memory=memory,
            first_update=first_update,
            update_frequency=update_frequency,
            repeat_update=repeat_update)

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs. online training samples:
        # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
        self.demo_memory_capacity = demo_memory_capacity
        self.demo_batch_size = int(demo_sampling_ratio * batch_size /
                                   (1.0 - demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
                                         'demo_batch_size is positive. (Calculated {} based on current' \
                                         ' parameters)'.format(self.demo_batch_size)

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop
        self.demo_memory = Replay(self.states_spec, self.actions_spec,
                                  self.demo_memory_capacity)
Example #12
class QDemoModel(QModel):
    """
    Model for deep Q-learning from demonstration. Principal structure similar to double
    deep Q-networks but uses additional loss terms for demo data.
    """
    def __init__(self, states, actions, scope, device, saver, summarizer,
                 execution, batching_capacity, variable_noise,
                 states_preprocessing, actions_exploration,
                 reward_preprocessing, update_mode, memory, optimizer,
                 discount, network, distributions, entropy_regularization,
                 target_sync_frequency, target_update_weight, double_q_model,
                 huber_loss, expert_margin, supervised_weight,
                 demo_memory_capacity, demo_batch_size):
        if any(action['type'] not in ('bool', 'int')
               for action in actions.values()):
            raise TensorForceError(
                "Invalid action type, only 'bool' and 'int' are valid!")

        self.expert_margin = expert_margin
        self.supervised_weight = supervised_weight
        self.demo_memory_capacity = demo_memory_capacity
        self.demo_batch_size = demo_batch_size

        super(QDemoModel,
              self).__init__(states=states,
                             actions=actions,
                             scope=scope,
                             device=device,
                             saver=saver,
                             summarizer=summarizer,
                             execution=execution,
                             batching_capacity=batching_capacity,
                             variable_noise=variable_noise,
                             states_preprocessing=states_preprocessing,
                             actions_exploration=actions_exploration,
                             reward_preprocessing=reward_preprocessing,
                             update_mode=update_mode,
                             memory=memory,
                             optimizer=optimizer,
                             discount=discount,
                             network=network,
                             distributions=distributions,
                             entropy_regularization=entropy_regularization,
                             target_sync_frequency=target_sync_frequency,
                             target_update_weight=target_update_weight,
                             double_q_model=double_q_model,
                             huber_loss=huber_loss)

    def initialize(self, custom_getter):
        super(QDemoModel, self).initialize(custom_getter=custom_getter)

        self.demo_memory = Replay(states=self.states_spec,
                                  internals=self.internals_spec,
                                  actions=self.actions_spec,
                                  include_next_states=True,
                                  capacity=self.demo_memory_capacity,
                                  scope='demo-replay',
                                  summary_labels=self.summary_labels)

        # Import demonstration optimization.
        self.fn_import_demo_experience = tf.make_template(
            name_='import-demo-experience',
            func_=self.tf_import_demo_experience,
            custom_getter_=custom_getter)

        # Demonstration loss.
        self.fn_demo_loss = tf.make_template(name_='demo-loss',
                                             func_=self.tf_demo_loss,
                                             custom_getter_=custom_getter)

        # Combined loss.
        self.fn_combined_loss = tf.make_template(name_='combined-loss',
                                                 func_=self.tf_combined_loss,
                                                 custom_getter_=custom_getter)

        # Demonstration optimization.
        self.fn_demo_optimization = tf.make_template(
            name_='demo-optimization',
            func_=self.tf_demo_optimization,
            custom_getter_=custom_getter)

    def tf_initialize(self):
        super(QDemoModel, self).tf_initialize()
        self.demo_memory.initialize()

    def tf_import_demo_experience(self, states, internals, actions, terminal,
                                  reward):
        """
        Imports a single experience to memory.
        """
        return self.demo_memory.store(states=states,
                                      internals=internals,
                                      actions=actions,
                                      terminal=terminal,
                                      reward=reward)

    def tf_demo_loss(self,
                     states,
                     actions,
                     terminal,
                     reward,
                     internals,
                     update,
                     reference=None):
        """
        Extends the q-model loss via the dqfd large-margin loss.
        """
        embedding = self.network.apply(x=states,
                                       internals=internals,
                                       update=update)
        deltas = list()

        for name, action in actions.items():
            distr_params = self.distributions[name].parameterize(x=embedding)
            state_action_value = self.distributions[name].state_action_value(
                distr_params=distr_params, action=action)

            # Create the supervised margin loss
            # Zero for the action taken, one for all other actions, now multiply by expert margin
            if self.actions_spec[name]['type'] == 'bool':
                num_actions = 2
                action = tf.cast(x=action, dtype=util.tf_dtype('int'))
            else:
                num_actions = self.actions_spec[name]['num_actions']

            one_hot = tf.one_hot(indices=action, depth=num_actions)
            ones = tf.ones_like(tensor=one_hot, dtype=tf.float32)
            inverted_one_hot = ones - one_hot

            # max_a [Q(s,a) + l(s,a_E,a)], where l(s,a_E,a) is 0 for the expert action and the margin value for all others
            state_action_values = self.distributions[name].state_action_value(
                distr_params=distr_params)
            state_action_values = state_action_values + inverted_one_hot * self.expert_margin
            supervised_selector = tf.reduce_max(
                input_tensor=state_action_values, axis=-1)

            # J_E(Q) = max_a [Q(s,a) + l(s,a_E,a)] - Q(s,a_E)
            delta = supervised_selector - state_action_value

            action_size = util.prod(self.actions_spec[name]['shape'])
            delta = tf.reshape(tensor=delta, shape=(-1, action_size))
            deltas.append(delta)

        loss_per_instance = tf.reduce_mean(input_tensor=tf.concat(
            values=deltas, axis=1),
                                           axis=1)
        loss_per_instance = tf.square(x=loss_per_instance)

        return tf.reduce_mean(input_tensor=loss_per_instance, axis=0)

    def tf_combined_loss(self,
                         states,
                         internals,
                         actions,
                         terminal,
                         reward,
                         next_states,
                         next_internals,
                         update,
                         reference=None):
        """
        Combines Q-loss and demo loss.
        """
        q_model_loss = self.fn_loss(states=states,
                                    internals=internals,
                                    actions=actions,
                                    terminal=terminal,
                                    reward=reward,
                                    next_states=next_states,
                                    next_internals=next_internals,
                                    update=update,
                                    reference=reference)

        demo_loss = self.fn_demo_loss(states=states,
                                      internals=internals,
                                      actions=actions,
                                      terminal=terminal,
                                      reward=reward,
                                      update=update,
                                      reference=reference)

        return q_model_loss + self.supervised_weight * demo_loss

    def tf_demo_optimization(self, states, internals, actions, terminal,
                             reward, next_states, next_internals):
        arguments = dict(time=self.global_timestep,
                         variables=self.get_variables(),
                         arguments=dict(states=states,
                                        internals=internals,
                                        actions=actions,
                                        terminal=terminal,
                                        reward=reward,
                                        next_states=next_states,
                                        next_internals=next_internals,
                                        update=tf.constant(value=True)),
                         fn_loss=self.fn_combined_loss)
        demo_optimization = self.optimizer.minimize(**arguments)

        arguments = self.target_optimizer_arguments()
        target_optimization = self.target_optimizer.minimize(**arguments)

        return tf.group(demo_optimization, target_optimization)

    def tf_optimization(self,
                        states,
                        internals,
                        actions,
                        terminal,
                        reward,
                        next_states=None,
                        next_internals=None):
        optimization = super(QDemoModel, self).tf_optimization(
            states=states,
            internals=internals,
            actions=actions,
            reward=reward,
            terminal=terminal,
            next_states=next_states,
            next_internals=next_internals)

        demo_batch = self.demo_memory.retrieve_timesteps(
            n=self.demo_batch_size)
        demo_optimization = self.fn_demo_optimization(**demo_batch)

        return tf.group(optimization, demo_optimization)

    def create_operations(self, states, internals, actions, terminal, reward,
                          deterministic, independent):
        # Import demo experience operation.
        self.import_demo_experience_output = self.fn_import_demo_experience(
            states=states,
            internals=internals,
            actions=actions,
            terminal=terminal,
            reward=reward)

        # Build the regular model operations via the parent class.
        super(QDemoModel, self).create_operations(states=states,
                                                  internals=internals,
                                                  actions=actions,
                                                  terminal=terminal,
                                                  reward=reward,
                                                  deterministic=deterministic,
                                                  independent=independent)

        # Demo optimization operation.
        demo_batch = self.demo_memory.retrieve_timesteps(
            n=self.demo_batch_size)
        self.demo_optimization_output = self.fn_demo_optimization(**demo_batch)

    def get_variables(self,
                      include_submodules=False,
                      include_nontrainable=False):
        """
        Returns the TensorFlow variables used by the model.

        Returns:
            List of variables.
        """
        model_variables = super(QDemoModel, self).get_variables(
            include_submodules=include_submodules,
            include_nontrainable=include_nontrainable)

        if include_nontrainable:
            demo_memory_variables = self.demo_memory.get_variables()
            model_variables += demo_memory_variables

        return model_variables

    def get_summaries(self):
        model_summaries = super(QDemoModel, self).get_summaries()
        demo_memory_summaries = self.demo_memory.get_summaries()

        return model_summaries + demo_memory_summaries

    def import_demo_experience(self, states, internals, actions, terminal,
                               reward):
        """
        Stores demonstrations in the demo memory.
        """
        fetches = self.import_demo_experience_output

        feed_dict = self.get_feed_dict(states=states,
                                       internals=internals,
                                       actions=actions,
                                       terminal=terminal,
                                       reward=reward)

        self.monitored_session.run(fetches=fetches, feed_dict=feed_dict)

    def demo_update(self):
        """
        Performs a demonstration update by calling the demo optimization operation.
        Note that the batch data does not have to be fetched from the demo memory as this is now part of
        the TensorFlow operation of the demo update.
        """
        fetches = self.demo_optimization_output

        self.monitored_session.run(fetches=fetches)
Пример #13
0
class PPOModel(PolicyGradientModel):
    allows_discrete_actions = True
    allows_continuous_actions = True

    default_config = dict(
        entropy_penalty=0.01,
        loss_clipping=0.2,  # Trust region clipping
        epochs=10,  # Number of training epochs for SGD
        optimizer_batch_size=128,  # Batch size for optimiser
        random_sampling=True  # Sampling strategy for replay memory
    )

    def __init__(self, config):
        config.default(PPOModel.default_config)
        super(PPOModel, self).__init__(config)
        self.optimizer_batch_size = config.optimizer_batch_size
        # Use replay memory so memory logic can be used to sample batches

        if self.optimizer_batch_size > config.batch_size:
            raise Exception(
                "optimizer_batch_size > batch_size ({}, {})".format(
                    self.optimizer_batch_size, config.batch_size))
        self.updates = int(
            config.batch_size / self.optimizer_batch_size) * config.epochs
        self.memory = Replay(config.batch_size, config.states, config.actions,
                             config.random_sampling)

    def create_tf_operations(self, config):
        """
        Creates PPO training operations, i.e. the SGD update
        based on the trust region loss.
        :return:
        """
        super(PPOModel, self).create_tf_operations(config)

        with tf.variable_scope('update'):
            prob_ratios = list()
            entropy_penalties = list()

            # for diagnostics
            kl_divergences = list()
            entropies = list()
            self.distribution_tensors = dict()
            self.prev_distribution_tensors = dict()

            for name, action in self.action.items():
                shape_size = util.prod(config.actions[name].shape)
                distribution = self.distribution[name]
                fixed_distribution = distribution.__class__.from_tensors(
                    tensors=[
                        tf.stop_gradient(x)
                        for x in distribution.get_tensors()
                    ],
                    deterministic=self.deterministic)

                # Standard policy gradient log likelihood computation
                log_prob = distribution.log_probability(action=action)
                fixed_log_prob = fixed_distribution.log_probability(
                    action=action)
                log_prob_diff = log_prob - fixed_log_prob
                prob_ratio = tf.exp(x=log_prob_diff)
                prob_ratio = tf.reshape(tensor=prob_ratio,
                                        shape=(-1, shape_size))
                prob_ratios.append(prob_ratio)

                entropy = distribution.entropy()
                entropy_penalty = -config.entropy_penalty * entropy
                entropy_penalty = tf.reshape(tensor=entropy_penalty,
                                             shape=(-1, shape_size))
                entropy_penalties.append(entropy_penalty)

                self.distribution_tensors[name] = list(
                    distribution.get_tensors())
                prev_distribution = list(
                    tf.placeholder(dtype=tf.float32,
                                   shape=util.shape(tensor, unknown=None))
                    for tensor in distribution.get_tensors())
                self.prev_distribution_tensors[name] = prev_distribution
                prev_distribution = distribution.from_tensors(
                    tensors=prev_distribution,
                    deterministic=self.deterministic)

                kl_divergence = prev_distribution.kl_divergence(
                    other=distribution)
                kl_divergence = tf.reshape(tensor=kl_divergence,
                                           shape=(-1, shape_size))
                kl_divergences.append(kl_divergence)

                entropy = tf.reshape(tensor=entropy, shape=(-1, shape_size))
                entropies.append(entropy)

            # The surrogate loss in PPO is the minimum of clipped loss and
            # target advantage * prob_ratio, which is the CPO loss
            # Presentation on conservative policy iteration:
            # https://www.cs.cmu.edu/~jcl/presentation/RL/RL.ps
            prob_ratio = tf.reduce_mean(input_tensor=tf.concat(
                values=prob_ratios, axis=1),
                                        axis=1)
            tf.summary.histogram('prob_ratio', prob_ratio)
            tf.summary.scalar('mean_prob_ratio',
                              tf.reduce_mean(input_tensor=prob_ratio, axis=0))

            clipped_prob_ratio = tf.clip_by_value(prob_ratio,
                                                  1.0 - config.loss_clipping,
                                                  1.0 + config.loss_clipping)
            self.loss_per_instance = -tf.minimum(
                x=(prob_ratio * self.reward),
                y=(clipped_prob_ratio * self.reward))
            self.surrogate_loss = tf.reduce_mean(
                input_tensor=self.loss_per_instance,
                axis=0,
                name='surrogate_loss')
            tf.losses.add_loss(self.surrogate_loss)

            # Mean over actions, mean over batch
            entropy_penalty = tf.reduce_mean(input_tensor=tf.concat(
                values=entropy_penalties, axis=1),
                                             axis=1)
            self.entropy_penalty = tf.reduce_mean(input_tensor=entropy_penalty,
                                                  axis=0,
                                                  name='entropy_penalty')
            tf.losses.add_loss(self.entropy_penalty)

            kl_divergence = tf.reduce_mean(input_tensor=tf.concat(
                values=kl_divergences, axis=1),
                                           axis=1)
            self.kl_divergence = tf.reduce_mean(input_tensor=kl_divergence,
                                                axis=0)
            tf.summary.scalar('kl_divergence', self.kl_divergence)

            entropy = tf.reduce_mean(input_tensor=tf.concat(values=entropies,
                                                            axis=1),
                                     axis=1)
            self.entropy = tf.reduce_mean(input_tensor=entropy, axis=0)
            tf.summary.scalar('entropy', self.entropy)

    def update(self, batch):
        """
        Compute update for one batch of experiences using general advantage estimation
        and the trust region update based on SGD on the clipped loss.

        :param batch: On policy batch of experiences.
        :return:
        """

        batch['rewards'], discounted_rewards = self.reward_estimation(
            states=batch['states'],
            rewards=batch['rewards'],
            terminals=batch['terminals'])
        if self.baseline:
            self.baseline.update(states=batch['states'],
                                 returns=discounted_rewards)

        # Set memory contents to batch contents
        self.memory.set_memory(states=batch['states'],
                               actions=batch['actions'],
                               rewards=batch['rewards'],
                               terminals=batch['terminals'],
                               internals=batch['internals'])

        # PPO takes multiple passes over the on-policy batch.
        # We use a memory that samples random ranges (as opposed to keeping
        # track of indices and, e.g., first taking elements 0-15, then 16-31, etc.).
        for i in xrange(self.updates):
            self.logger.debug('Optimising PPO, update = {}'.format(i))
            batch = self.memory.get_batch(self.optimizer_batch_size)

            feed_dict = {
                state: batch['states'][name]
                for name, state in self.state.items()
            }
            feed_dict.update({
                action: batch['actions'][name]
                for name, action in self.action.items()
            })
            feed_dict[self.reward] = batch['rewards']
            feed_dict[self.terminal] = batch['terminals']
            feed_dict.update({
                internal: batch['internals'][n]
                for n, internal in enumerate(self.internal_inputs)
            })

            if i == 0:  # First update, fetch previous distribution tensors
                assert self.updates >= 2
                assert 'optimize' not in self.distribution_tensors
                fetches = dict(optimize=self.optimize)
                fetches.update(self.distribution_tensors)
                prev_distribution_tensors = self.session.run(
                    fetches=fetches, feed_dict=feed_dict)
                prev_distribution_tensors.pop('optimize')

            elif i == self.updates - 1:  # Last update, fetch return and diagnostics values
                fetches = [
                    self.optimize, self.loss, self.loss_per_instance,
                    self.kl_divergence, self.entropy
                ]
                prev_distribution_tensors = {
                    placeholder: tensor
                    for name, placeholders in
                    self.prev_distribution_tensors.items()
                    for placeholder, tensor in zip(
                        placeholders, prev_distribution_tensors[name])
                }
                feed_dict.update(prev_distribution_tensors)
                with SummarySessionWrapper(self, fetches,
                                           feed_dict) as session:
                    _, loss, loss_per_instance, kl_divergence, entropy = session.run(
                    )

            else:  # Otherwise just optimize
                self.session.run(fetches=self.optimize, feed_dict=feed_dict)

        return loss, loss_per_instance
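
The surrogate loss built above is the standard PPO clipped objective: per sample, take the minimum of ratio * advantage and clip(ratio, 1 - epsilon, 1 + epsilon) * advantage, negate, and average (this example feeds the processed rewards in via self.reward). A small NumPy sketch with made-up ratios and advantages, purely illustrative:

import numpy as np

loss_clipping = 0.2                        # epsilon in the PPO paper
prob_ratio = np.array([0.7, 1.0, 1.5])     # hypothetical pi_new / pi_old per sample
advantage = np.array([1.0, -2.0, 0.5])     # hypothetical advantage estimates

clipped_ratio = np.clip(prob_ratio, 1.0 - loss_clipping, 1.0 + loss_clipping)

# Per-instance loss: negative of the pessimistic (minimum) surrogate objective.
loss_per_instance = -np.minimum(prob_ratio * advantage, clipped_ratio * advantage)
surrogate_loss = loss_per_instance.mean()
print(loss_per_instance, surrogate_loss)
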
Example #14
    def __init__(self,
                 states_spec,
                 actions_spec,
                 network_spec,
                 device=None,
                 scope='dqfd',
                 saver_spec=None,
                 summary_spec=None,
                 distributed_spec=None,
                 optimizer=None,
                 discount=0.99,
                 normalize_rewards=False,
                 variable_noise=None,
                 distributions_spec=None,
                 entropy_regularization=None,
                 target_sync_frequency=10000,
                 target_update_weight=1.0,
                 huber_loss=None,
                 preprocessing=None,
                 exploration=None,
                 reward_preprocessing=None,
                 batched_observe=1000,
                 batch_size=32,
                 memory=None,
                 first_update=10000,
                 update_frequency=4,
                 repeat_update=1,
                 expert_margin=0.5,
                 supervised_weight=0.1,
                 demo_memory_capacity=10000,
                 demo_sampling_ratio=0.2):
        """
        Deep Q-learning from demonstration (DQFD) agent ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
        This agent uses DQN to pre-train from demonstration data in combination with a supervised loss.

        Args:
            states_spec:
            actions_spec:
            network_spec:
            device:
            scope:
            saver_spec:
            summary_spec:
            distributed_spec:
            optimizer:
            discount:
            normalize_rewards:
            variable_noise:
            distributions_spec:
            entropy_regularization:
            target_sync_frequency:
            target_update_weight:
            double_q_model:
            huber_loss:
            preprocessing:
            exploration:
            reward_preprocessing:
            batched_observe:
            batch_size:
            memory:
            first_update:
            update_frequency:
            repeat_update:
            expert_margin:
            supervised_weight:
            demo_memory_capacity:
            demo_sampling_ratio:
        """
        if network_spec is None:
            raise TensorForceError("No network_spec provided.")

        if optimizer is None:
            self.optimizer = dict(type='adam', learning_rate=1e-3)
        else:
            self.optimizer = optimizer
        if memory is None:
            memory = dict(type='replay', capacity=100000)
        self.memory = memory

        self.network_spec = network_spec
        self.device = device
        self.scope = scope
        self.saver_spec = saver_spec
        self.summary_spec = summary_spec
        self.distributed_spec = distributed_spec
        self.discount = discount
        self.normalize_rewards = normalize_rewards
        self.variable_noise = variable_noise
        self.distributions_spec = distributions_spec
        self.entropy_regularization = entropy_regularization
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight
        self.huber_loss = huber_loss

        # DQFD always uses double DQN, which is a required key for a Q-model.
        self.double_q_model = True
        self.demo_memory_capacity = demo_memory_capacity
        self.expert_margin = expert_margin
        self.supervised_weight = supervised_weight

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs. online training samples:
        # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(demo_sampling_ratio * batch_size /
                                   (1.0 - demo_sampling_ratio))

        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
                                         'demo_batch_size is positive. (Calculated {} based on current' \
                                         ' parameters)'.format(self.demo_batch_size)

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop
        super(DQFDAgent,
              self).__init__(states_spec=states_spec,
                             actions_spec=actions_spec,
                             preprocessing=preprocessing,
                             exploration=exploration,
                             reward_preprocessing=reward_preprocessing,
                             batched_observe=batched_observe,
                             batch_size=batch_size,
                             memory=memory,
                             first_update=first_update,
                             update_frequency=update_frequency,
                             repeat_update=repeat_update)
        self.demo_memory = Replay(self.states_spec, self.actions_spec,
                                  self.demo_memory_capacity)
Example #15
class DQFDAgent(MemoryAgent):
    """
    Deep Q-learning from demonstration (DQFD) agent ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
    This agent uses DQN to pre-train from demonstration data.

    Configuration:

    Each agent requires the following configuration parameters:

    * `states`: dict containing one or more state definitions.
    * `actions`: dict containing one or more action definitions.
    * `preprocessing`: dict or list containing state preprocessing configuration.
    * `exploration`: dict containing action exploration configuration.

    Each model requires the following configuration parameters:

    * `discount`: float of discount factor (gamma).
    * `learning_rate`: float of learning rate (alpha).
    * `optimizer`: string of optimizer to use (e.g. 'adam').
    * `device`: string of tensorflow device name.
    * `tf_summary`: string directory to write tensorflow summaries. Default None
    * `tf_summary_level`: int indicating which tensorflow summaries to create.
    * `tf_summary_interval`: int number of calls to get_action until writing tensorflow summaries on update.
    * `log_level`: string containing the log level (e.g. 'info').
    * `distributed`: boolean indicating whether to use distributed tensorflow.
    * `global_model`: global model.
    * `session`: session to use.


    The `DQFDAgent` class additionally requires the following parameters:

    * `batch_size`: integer of the batch size.
    * `memory_capacity`: integer of maximum experiences to store.
    * `memory`: string indicating memory type ('replay' or 'prioritized_replay').
    * `min_replay_size`: integer of minimum replay size before the first update.
    * `update_rate`: float of the update rate (e.g. 0.25 = every 4 steps).
    * `target_network_update_rate`: float of target network update rate (e.g. 0.01 = every 100 steps).
    * `use_target_network`: boolean indicating whether to use a target network.
    * `update_repeat`: integer of how many times to repeat an update.
    * `update_target_weight`: float of update target weight (tau parameter).
    * `demo_sampling_ratio`: float, ratio of expert data used at runtime to train from.
    * `supervised_weight`: float, weight of large margin classifier loss.
    * `expert_margin`: float of difference in Q-values between expert action and other actions enforced
                       by the large margin function.
    * `clip_loss`: float; if not 0, uses the Huber loss with `clip_loss` as the linear bound.


    """
    def __init__(self,
                 states_spec,
                 actions_spec,
                 network_spec,
                 device=None,
                 scope='dqfd',
                 saver_spec=None,
                 summary_spec=None,
                 distributed_spec=None,
                 optimizer=None,
                 discount=0.99,
                 normalize_rewards=False,
                 variable_noise=None,
                 distributions_spec=None,
                 entropy_regularization=None,
                 target_sync_frequency=10000,
                 target_update_weight=1.0,
                 huber_loss=None,
                 preprocessing=None,
                 exploration=None,
                 reward_preprocessing=None,
                 batched_observe=1000,
                 batch_size=32,
                 memory=None,
                 first_update=10000,
                 update_frequency=4,
                 repeat_update=1,
                 expert_margin=0.5,
                 supervised_weight=0.1,
                 demo_memory_capacity=10000,
                 demo_sampling_ratio=0.2):
        """
        Deep Q-learning from demonstration (DQFD) agent ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
        This agent uses DQN to pre-train from demonstration data in combination with a supervised loss.

        Args:
            states_spec:
            actions_spec:
            network_spec:
            device:
            scope:
            saver_spec:
            summary_spec:
            distributed_spec:
            optimizer:
            discount:
            normalize_rewards:
            variable_noise:
            distributions_spec:
            entropy_regularization:
            target_sync_frequency:
            target_update_weight:
            double_q_model:
            huber_loss:
            preprocessing:
            exploration:
            reward_preprocessing:
            batched_observe:
            batch_size:
            memory:
            first_update:
            update_frequency:
            repeat_update:
            expert_margin:
            supervised_weight:
            demo_memory_capacity:
            demo_sampling_ratio:
        """
        if network_spec is None:
            raise TensorForceError("No network_spec provided.")

        if optimizer is None:
            self.optimizer = dict(type='adam', learning_rate=1e-3)
        else:
            self.optimizer = optimizer
        if memory is None:
            memory = dict(type='replay', capacity=100000)
        self.memory = memory

        self.network_spec = network_spec
        self.device = device
        self.scope = scope
        self.saver_spec = saver_spec
        self.summary_spec = summary_spec
        self.distributed_spec = distributed_spec
        self.discount = discount
        self.normalize_rewards = normalize_rewards
        self.variable_noise = variable_noise
        self.distributions_spec = distributions_spec
        self.entropy_regularization = entropy_regularization
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight
        self.huber_loss = huber_loss

        # DQFD always uses double DQN, which is a required key for a Q-model.
        self.double_q_model = True
        self.demo_memory_capacity = demo_memory_capacity
        self.expert_margin = expert_margin
        self.supervised_weight = supervised_weight

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs. online training samples:
        # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(demo_sampling_ratio * batch_size /
                                   (1.0 - demo_sampling_ratio))

        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
                                         'demo_batch_size is positive. (Calculated {} based on current' \
                                         ' parameters)'.format(self.demo_batch_size)

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop
        super(DQFDAgent,
              self).__init__(states_spec=states_spec,
                             actions_spec=actions_spec,
                             preprocessing=preprocessing,
                             exploration=exploration,
                             reward_preprocessing=reward_preprocessing,
                             batched_observe=batched_observe,
                             batch_size=batch_size,
                             memory=memory,
                             first_update=first_update,
                             update_frequency=update_frequency,
                             repeat_update=repeat_update)
        self.demo_memory = Replay(self.states_spec, self.actions_spec,
                                  self.demo_memory_capacity)

    def initialize_model(self, states_spec, actions_spec):
        return QDemoModel(
            states_spec=states_spec,
            actions_spec=actions_spec,
            network_spec=self.network_spec,
            device=self.device,
            scope=self.scope,
            saver_spec=self.saver_spec,
            summary_spec=self.summary_spec,
            distributed_spec=self.distributed_spec,
            optimizer=self.optimizer,
            discount=self.discount,
            normalize_rewards=self.normalize_rewards,
            variable_noise=self.variable_noise,
            distributions_spec=self.distributions_spec,
            entropy_regularization=self.entropy_regularization,
            target_sync_frequency=self.target_sync_frequency,
            target_update_weight=self.target_update_weight,
            double_q_model=self.double_q_model,
            huber_loss=self.huber_loss,
            # TEMP: Random sampling fix
            random_sampling_fix=True,
            expert_margin=self.expert_margin,
            supervised_weight=self.supervised_weight)

    def observe(self, reward, terminal):
        """
        Adds observations, updates via sampling from memories according to update rate.
        DQFD samples from the online replay memory and the demo memory with
        the fractions controlled by a hyper parameter p called 'expert sampling ratio.

        Args:
            reward:
            terminal:
        """
        super(DQFDAgent, self).observe(reward=reward, terminal=terminal)

        if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0:
            for _ in xrange(self.repeat_update):
                batch = self.demo_memory.get_batch(self.demo_batch_size)
                self.model.demonstration_update(batch=batch)

    def import_demonstrations(self, demonstrations):
        """
        Imports demonstrations, i.e. expert observations. Note that for large numbers of observations,
        set_demonstrations is more appropriate; it directly sets memory contents from arrays and expects
        a different layout.

        Args:
            demonstrations: List of observation dicts
        """
        for observation in demonstrations:
            if self.unique_state:
                state = dict(state=observation['states'])
            else:
                state = observation['states']
            if self.unique_action:
                action = dict(action=observation['actions'])
            else:
                action = observation['actions']

            self.demo_memory.add_observation(
                states=state,
                internals=observation['internals'],
                actions=action,
                terminal=observation['terminal'],
                reward=observation['reward'])

    def set_demonstrations(self, batch):
        """
        Set all demonstrations from batch data. Expects a dict whose values are arrays containing
        all states, actions, rewards, terminals and internals, respectively.

        Args:
            batch: Dict of demonstration data arrays.
        """
        self.demo_memory.set_memory(states=batch['states'],
                                    internals=batch['internals'],
                                    actions=batch['actions'],
                                    terminal=batch['terminal'],
                                    reward=batch['reward'])

    def pretrain(self, steps):
        """
        Computes pretrain updates.

        Args:
            steps: Number of updates to execute.

        """
        for _ in xrange(steps):
            # Sample from demo memory.
            batch = self.demo_memory.get_batch(batch_size=self.batch_size,
                                               next_states=True)

            # Update using both the double Q-learning loss and the supervised large-margin loss.
            self.model.demonstration_update(batch)
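
For orientation, here is a minimal usage sketch of an agent with the interface above (the specs, network layout and demonstration data are hypothetical placeholders, and the import path matches older Tensorforce releases): demonstrations are imported first, the supervised pre-training phase runs, and only afterwards does the regular act/observe loop start.

import numpy as np
from tensorforce.agents import DQFDAgent  # assumed import path for older Tensorforce versions

# Hypothetical single-state / single-action specs; adapt to your environment.
agent = DQFDAgent(
    states_spec=dict(shape=(4,), type='float'),
    actions_spec=dict(type='int', num_actions=2),
    network_spec=[dict(type='dense', size=32), dict(type='dense', size=32)],
    demo_memory_capacity=10000,
    demo_sampling_ratio=0.2)

# Expert transitions laid out as expected by import_demonstrations above.
demonstrations = [
    dict(states=np.random.rand(4), internals=[], actions=1, terminal=False, reward=1.0)
    for _ in range(1000)]
agent.import_demonstrations(demonstrations)

# Supervised + double-Q pre-training from the demo memory only.
agent.pretrain(steps=1000)
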
Example #16
0
class DQFDAgent(MemoryAgent):
    """
    Deep Q-learning from demonstration (DQFD) agent ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
    This agent uses DQN to pre-train from demonstration data via an additional supervised loss term.
    """
    def __init__(
            self,
            states_spec,
            actions_spec,
            batched_observe=1000,
            scope='dqfd',
            # parameters specific to LearningAgents
            summary_spec=None,
            network_spec=None,
            device=None,
            session_config=None,
            saver_spec=None,
            distributed_spec=None,
            optimizer=None,
            discount=0.99,
            variable_noise=None,
            states_preprocessing_spec=None,
            explorations_spec=None,
            reward_preprocessing_spec=None,
            distributions_spec=None,
            entropy_regularization=None,
            # parameters specific to MemoryAgents
            batch_size=32,
            memory=None,
            first_update=10000,
            update_frequency=4,
            repeat_update=1,
            # parameters specific to DQFD agents
            target_sync_frequency=10000,
            target_update_weight=1.0,
            huber_loss=None,
            expert_margin=0.5,
            supervised_weight=0.1,
            demo_memory_capacity=10000,
            demo_sampling_ratio=0.2):
        """
        Deep Q-learning from demonstration (DQFD) agent ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
        This agent uses DQN to pre-train from demonstration data in combination with a supervised loss.

        Args:
            target_sync_frequency: Interval between optimization calls synchronizing the target network.
            target_update_weight: Update weight, 1.0 meaning a full assignment to target network from training network.
            huber_loss: Optional float specifying Huber-loss clipping.
            expert_margin: Positive float specifying enforced supervised margin between expert action Q-value and other
                Q-values.
            supervised_weight: Weight of supervised loss term.
            demo_memory_capacity: Int describing capacity of expert demonstration memory.
            demo_sampling_ratio: Runtime sampling ratio of expert data.
        """
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight
        self.huber_loss = huber_loss

        self.expert_margin = expert_margin
        self.supervised_weight = supervised_weight

        super(DQFDAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            batched_observe=batched_observe,
            scope=scope,
            # parameters specific to LearningAgent
            summary_spec=summary_spec,
            network_spec=network_spec,
            discount=discount,
            device=device,
            session_config=session_config,
            saver_spec=saver_spec,
            distributed_spec=distributed_spec,
            optimizer=optimizer,
            variable_noise=variable_noise,
            states_preprocessing_spec=states_preprocessing_spec,
            explorations_spec=explorations_spec,
            reward_preprocessing_spec=reward_preprocessing_spec,
            distributions_spec=distributions_spec,
            entropy_regularization=entropy_regularization,
            # parameters specific to MemoryAgents
            batch_size=batch_size,
            memory=memory,
            first_update=first_update,
            update_frequency=update_frequency,
            repeat_update=repeat_update)

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert to online training samples:
        # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
        self.demo_memory_capacity = demo_memory_capacity
        self.demo_batch_size = int(demo_sampling_ratio * batch_size /
                                   (1.0 - demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
                                         'demo_batch_size is positive. (Calculated {} based on current' \
                                         ' parameters)'.format(self.demo_batch_size)

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop
        self.demo_memory = Replay(self.states_spec, self.actions_spec,
                                  self.demo_memory_capacity)

    def initialize_model(self):
        return QDemoModel(
            states_spec=self.states_spec,
            actions_spec=self.actions_spec,
            network_spec=self.network_spec,
            device=self.device,
            session_config=self.session_config,
            scope=self.scope,
            saver_spec=self.saver_spec,
            summary_spec=self.summary_spec,
            distributed_spec=self.distributed_spec,
            optimizer=self.optimizer,
            discount=self.discount,
            variable_noise=self.variable_noise,
            states_preprocessing_spec=self.states_preprocessing_spec,
            explorations_spec=self.explorations_spec,
            reward_preprocessing_spec=self.reward_preprocessing_spec,
            distributions_spec=self.distributions_spec,
            entropy_regularization=self.entropy_regularization,
            target_sync_frequency=self.target_sync_frequency,
            target_update_weight=self.target_update_weight,
            # DQFD always uses double dqn, which is a required key for a q-model.
            double_q_model=True,
            huber_loss=self.huber_loss,
            # TEMP: Random sampling fix
            random_sampling_fix=True,
            expert_margin=self.expert_margin,
            supervised_weight=self.supervised_weight)

    def observe(self, reward, terminal):
        """
        Adds observations and updates by sampling from memories according to the update rate.
        DQFD samples from the online replay memory and the demo memory with
        the fractions controlled by the hyperparameter p, the 'expert sampling ratio'.
        """
        super(DQFDAgent, self).observe(reward=reward, terminal=terminal)

        if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0:
            for _ in xrange(self.repeat_update):
                batch = self.demo_memory.get_batch(
                    batch_size=self.demo_batch_size, next_states=True)
                self.model.demonstration_update(
                    states={
                        name: np.stack((batch['states'][name], batch['next_states'][name]))
                        for name in batch['states']
                    },
                    internals=batch['internals'],
                    actions=batch['actions'],
                    terminal=batch['terminal'],
                    reward=batch['reward'])

    def import_demonstrations(self, demonstrations):
        """
        Imports demonstrations, i.e. expert observations. Note that for large numbers of observations,
        set_demonstrations is more appropriate; it directly sets memory contents from arrays and expects
        a different layout.

        Args:
            demonstrations: List of observation dicts
        """
        for observation in demonstrations:
            if self.unique_state:
                state = dict(state=observation['states'])
            else:
                state = observation['states']
            if self.unique_action:
                action = dict(action=observation['actions'])
            else:
                action = observation['actions']

            self.demo_memory.add_observation(
                states=state,
                internals=observation['internals'],
                actions=action,
                terminal=observation['terminal'],
                reward=observation['reward'])

    def set_demonstrations(self, batch):
        """
        Set all demonstrations from batch data. Expects a dict whose values are arrays containing
        all states, actions, rewards, terminals and internals, respectively.

        Args:
            batch: Dict of demonstration data arrays.
        """
        self.demo_memory.set_memory(states=batch['states'],
                                    internals=batch['internals'],
                                    actions=batch['actions'],
                                    terminal=batch['terminal'],
                                    reward=batch['reward'])

    def pretrain(self, steps):
        """
        Computes pre-train updates.

        Args:
            steps: Number of updates to execute.

        """
        for _ in xrange(steps):
            # Sample from demo memory.
            batch = self.demo_memory.get_batch(batch_size=self.batch_size,
                                               next_states=True)

            # Update using both the double Q-learning loss and the supervised large-margin loss.
            self.model.demonstration_update(
                states={
                    name: np.stack((batch['states'][name], batch['next_states'][name]))
                    for name in batch['states']
                },
                internals=batch['internals'],
                actions=batch['actions'],
                terminal=batch['terminal'],
                reward=batch['reward'])
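
A side note on the np.stack call above: it packs current and next states into one tensor with a leading dimension of 2, which the model is presumably expected to split again internally (cf. the random_sampling_fix flag). A small standalone illustration with hypothetical shapes:

import numpy as np

batch_size, state_dim = 8, 4
states = np.random.rand(batch_size, state_dim)       # s_t
next_states = np.random.rand(batch_size, state_dim)  # s_{t+1}

stacked = np.stack((states, next_states))
print(stacked.shape)  # (2, 8, 4): index 0 holds current states, index 1 holds next states
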
Example #17
0
class DQFDAgent(MemoryAgent):
    """
    Deep Q-learning from demonstration (DQFD) agent ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
    This agent uses DQN to pre-train from demonstration data.

    Configuration:

    Each agent requires the following configuration parameters:

    * `states`: dict containing one or more state definitions.
    * `actions`: dict containing one or more action definitions.
    * `preprocessing`: dict or list containing state preprocessing configuration.
    * `exploration`: dict containing action exploration configuration.

    Each model requires the following configuration parameters:

    * `discount`: float of discount factor (gamma).
    * `learning_rate`: float of learning rate (alpha).
    * `optimizer`: string of optimizer to use (e.g. 'adam').
    * `device`: string of tensorflow device name.
    * `tf_summary`: string directory to write tensorflow summaries. Default None
    * `tf_summary_level`: int indicating which tensorflow summaries to create.
    * `tf_summary_interval`: int number of calls to get_action until writing tensorflow summaries on update.
    * `log_level`: string containing log level (e.g. 'info').
    * `distributed`: boolean indicating whether to use distributed tensorflow.
    * `global_model`: global model.
    * `session`: session to use.


    The `DQFDAgent` class additionally requires the following parameters:


    * `batch_size`: integer of the batch size.
    * `memory_capacity`: integer of maximum experiences to store.
    * `memory`: string indicating memory type ('replay' or 'prioritized_replay').
    * `min_replay_size`: integer of minimum replay size before the first update.
    * `update_rate`: float of the update rate (e.g. 0.25 = every 4 steps).
    * `target_network_update_rate`: float of target network update rate (e.g. 0.01 = every 100 steps).
    * `use_target_network`: boolean indicating whether to use a target network.
    * `update_repeat`: integer of how many times to repeat an update.
    * `update_target_weight`: float of update target weight (tau parameter).
    * `demo_sampling_ratio`: float, ratio of expert data used at runtime to train from.
    * `supervised_weight`: float, weight of large margin classifier loss.
    * `expert_margin`: float of difference in Q-values between expert action and other actions enforced
                       by the large margin function.
    * `clip_loss`: float; if not 0, uses the Huber loss with clip_loss as the linear bound.


    """
    default_config = dict(
        # Agent
        preprocessing=None,
        exploration=None,
        reward_preprocessing=None,
        # Model
        optimizer=dict(
            type='adam',
            learning_rate=1e-3
        ),
        discount=0.99,
        normalize_rewards=False,
        variable_noise=None,  # not documented!!!
        # DistributionModel
        distributions=None,  # not documented!!!
        entropy_regularization=None,
        # QModel
        target_sync_frequency=10000,  # not documented!!!
        target_update_weight=1.0,  # not documented!!!
        huber_loss=0.0,  # not documented!!!
        # Logging
        log_level='info',
        model_directory=None,
        save_frequency=600,  # TensorFlow default
        summary_labels=['total-loss'],
        summary_frequency=120,  # TensorFlow default
        # TensorFlow distributed configuration
        cluster_spec=None,
        parameter_server=False,
        task_index=0,
        device=None,
        local_model=False,
        replica_model=False,
        scope='dqfd'
    )

    def __init__(self, states_spec, actions_spec, network_spec, config):
        self.network_spec = network_spec
        config = config.copy()
        config.default(DQFDAgent.default_config)

        # DQFD always uses double dqn, which is a required key for a q-model.
        config.obligatory(double_dqn=True)
        self.target_update_frequency = config.target_update_frequency
        self.demo_memory_capacity = config.demo_memory_capacity

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert to online training samples:
        # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(config.demo_sampling_ratio * config.batch_size / (1.0 - config.demo_sampling_ratio))

        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
                                         'demo_batch_size is positive. (Calculated {} based on current' \
                                         ' parameters)'.format(self.demo_batch_size)

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop
        self.demo_memory = Replay(self.demo_memory_capacity, self.states_spec, self.actions_spec)

        super(DQFDAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            config=config
        )

    def observe(self, reward, terminal):
        """
        Adds observations and updates by sampling from memories according to the update rate.
        DQFD samples from the online replay memory and the demo memory with
        the fractions controlled by the hyperparameter p, the 'expert sampling ratio'.

        Args:
            reward: Scalar reward of the current timestep.
            terminal: Boolean indicating whether the current state is terminal.
        """
        super(DQFDAgent, self).observe(reward=reward, terminal=terminal)

        if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0:
            for _ in xrange(self.repeat_update):
                batch = self.demo_memory.get_batch(self.demo_batch_size)
                self.model.demonstration_update(batch=batch)

    def import_demonstrations(self, demonstrations):
        """
        Imports demonstrations, i.e. expert observations. Note that for large numbers of observations,
        set_demonstrations is more appropriate; it directly sets memory contents from arrays and expects
        a different layout.

        Args:
            demonstrations: List of observation dicts
        """
        for observation in demonstrations:
            if self.unique_state:
                state = dict(state=observation['states'])
            else:
                state = observation['states']
            if self.unique_action:
                action = dict(action=observation['actions'])
            else:
                action = observation['actions']

            self.demo_memory.add_observation(
                states=state,
                internals=observation['internals'],
                actions=action,
                terminal=observation['terminal'],
                reward=observation['reward']
            )

    def set_demonstrations(self, batch):
        """
        Set all demonstrations from batch data. Expects a dict whose values are arrays containing
        all states, actions, rewards, terminals and internals, respectively.

        Args:
            batch: Dict of demonstration data arrays.
        """
        self.demo_memory.set_memory(
            states=batch['states'],
            internals=batch['internals'],
            actions=batch['actions'],
            terminal=batch['terminal'],
            reward=batch['reward']
        )

    def initialize_model(self, states_spec, actions_spec, config):
        return QDemoModel(
            states_spec=states_spec,
            actions_spec=actions_spec,
            network_spec=self.network_spec,
            config=config
        )

    def pretrain(self, steps):
        """
        Computes pretrain updates.

        Args:
            steps: Number of updates to execute.

        """
        for _ in xrange(steps):
            # Sample from demo memory.
            batch = self.demo_memory.get_batch(batch_size=self.batch_size, next_states=True)

            # Update using both the double Q-learning loss and the supervised large-margin loss.
            self.model.demonstration_update(batch)
Example #18
0
class DQFDAgent(MemoryAgent):
    """
    Deep Q-learning from demonstration (DQFD) agent ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
    This agent uses DQN to pre-train from demonstration data via an additional supervised loss term.
    """

    def __init__(
        self,
        states_spec,
        actions_spec,
        network_spec,
        device=None,
        scope='dqfd',
        saver_spec=None,
        summary_spec=None,
        distributed_spec=None,
        optimizer=None,
        discount=0.99,
        normalize_rewards=False,
        variable_noise=None,
        distributions_spec=None,
        entropy_regularization=None,
        target_sync_frequency=10000,
        target_update_weight=1.0,
        huber_loss=None,
        preprocessing=None,
        exploration=None,
        reward_preprocessing=None,
        batched_observe=1000,
        batch_size=32,
        memory=None,
        first_update=10000,
        update_frequency=4,
        repeat_update=1,
        expert_margin=0.5,
        supervised_weight=0.1,
        demo_memory_capacity=10000,
        demo_sampling_ratio=0.2
    ):
        """
        Deep Q-learning from demonstration (DQFD) agent ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
        This agent uses DQN to pre-train from demonstration data in combination with a supervised loss.

        Args:
            states_spec: Dict containing at least one state definition. In the case of a single state,
               keys `shape` and `type` are necessary. For multiple states, pass a dict of dicts where each state
               is a dict itself with a unique name as its key.
            actions_spec: Dict containing at least one action definition. Actions have types and either `num_actions`
                for discrete actions or a `shape` for continuous actions. Consult documentation and tests for more.
            network_spec: List of layers specifying a neural network via layer types, sizes and optional arguments
                such as activation or regularisation. Full examples are in the examples/configs folder.
            device: Device string specifying model device.
            scope: TensorFlow scope, defaults to agent name (e.g. `dqn`).
            saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints are saved. Use
                either `seconds` or `steps` to specify how often the model should be saved. The `load` flag specifies
                if a model is initially loaded (set to True) from a file `file`.
            summary_spec: Dict specifying summaries for TensorBoard. Requires a 'directory' to store summaries, `steps`
                or `seconds` to specify how often to save summaries, and a list of `labels` to indicate which values
                to export, e.g. `losses`, `variables`. Consult neural network class and model for all available labels.
            distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and `replica_model`
                Boolean flags to indicate workers and parameter servers. Use a `cluster_spec` key to pass a TensorFlow
                cluster spec.
            optimizer: Dict specifying optimizer type and its optional parameters, typically a `learning_rate`.
                Available optimizer types include standard TensorFlow optimizers, `natural_gradient`,
                and `evolutionary`. Consult the optimizer test or example configurations for more.
            discount: Float specifying reward discount factor.
            normalize_rewards: Boolean flag specifying whether to normalize rewards, default False.
            variable_noise: Experimental optional parameter specifying variable noise (NoisyNet).
            distributions_spec: Optional dict specifying action distributions to override default distribution choices.
                Must match action names.
            entropy_regularization: Optional positive float specifying an entropy regularization value.
            target_sync_frequency: Interval between optimization calls synchronizing the target network.
            target_update_weight: Update weight, 1.0 meaning a full assignment to target network from training network.
            huber_loss: Optional float specifying Huber-loss clipping.
            preprocessing: Optional list of preprocessors (e.g. `image_resize`, `grayscale`) to apply to state. Each
                preprocessor is a dict containing a type and optional necessary arguments.
            exploration: Optional dict specifying exploration type (epsilon greedy strategies or Gaussian noise)
                and arguments.
            reward_preprocessing: Optional dict specifying reward preprocessor using same syntax as state preprocessing.
            batched_observe: Optional int specifying how many observe calls are batched into one session run.
                Without batching, throughput will be lower because every `observe` triggers a session invocation to
                update rewards in the graph.
            batch_size: Int specifying batch size used to sample from memory. Should be smaller than memory size.
            memory: Dict describing memory via `type` (e.g. `replay`) and `capacity`.
            first_update: Int describing at which time step the first update is performed. Should be larger
                than batch size.
            update_frequency: Int specifying number of observe steps to perform until an update is executed.
            repeat_update: Int specifying how many update steps are performed per update, where each update step implies
                sampling a batch from the memory and passing it to the model.
            expert_margin: Positive float specifying enforced supervised margin between expert action Q-value and other
                Q-values.
            supervised_weight: Weight of supervised loss term.
            demo_memory_capacity: Int describing capacity of expert demonstration memory.
            demo_sampling_ratio: Runtime sampling ratio of expert data.
        """
        if network_spec is None:
            raise TensorForceError("No network_spec provided.")

        if optimizer is None:
            self.optimizer = dict(
                type='adam',
                learning_rate=1e-3
            )
        else:
            self.optimizer = optimizer
        if memory is None:
            memory = dict(
                type='replay',
                capacity=100000
            )
        self.memory = memory

        self.network_spec = network_spec
        self.device = device
        self.scope = scope
        self.saver_spec = saver_spec
        self.summary_spec = summary_spec
        self.distributed_spec = distributed_spec
        self.discount = discount
        self.normalize_rewards = normalize_rewards
        self.variable_noise = variable_noise
        self.distributions_spec = distributions_spec
        self.entropy_regularization = entropy_regularization
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight
        self.huber_loss = huber_loss

        # DQFD always uses double dqn, which is a required key for a q-model.
        self.double_q_model = True
        self.demo_memory_capacity = demo_memory_capacity
        self.expert_margin = expert_margin
        self.supervised_weight = supervised_weight

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert to online training samples:
        # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))

        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
                                         'demo_batch_size is positive. (Calculated {} based on current' \
                                         ' parameters)'.format(self.demo_batch_size)

        super(DQFDAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            preprocessing=preprocessing,
            exploration=exploration,
            reward_preprocessing=reward_preprocessing,
            batched_observe=batched_observe,
            batch_size=batch_size,
            memory=memory,
            first_update=first_update,
            update_frequency=update_frequency,
            repeat_update=repeat_update
        )

        # This is the demonstration memory that we will fill with observations
        # before starting the main training loop.
        self.demo_memory = Replay(self.states_spec, self.actions_spec, self.demo_memory_capacity)

    def initialize_model(self, states_spec, actions_spec):
        return QDemoModel(
            states_spec=states_spec,
            actions_spec=actions_spec,
            network_spec=self.network_spec,
            device=self.device,
            scope=self.scope,
            saver_spec=self.saver_spec,
            summary_spec=self.summary_spec,
            distributed_spec=self.distributed_spec,
            optimizer=self.optimizer,
            discount=self.discount,
            normalize_rewards=self.normalize_rewards,
            variable_noise=self.variable_noise,
            distributions_spec=self.distributions_spec,
            entropy_regularization=self.entropy_regularization,
            target_sync_frequency=self.target_sync_frequency,
            target_update_weight=self.target_update_weight,
            double_q_model=self.double_q_model,
            huber_loss=self.huber_loss,
            # TEMP: Random sampling fix
            random_sampling_fix=True,
            expert_margin=self.expert_margin,
            supervised_weight=self.supervised_weight
        )

    def observe(self, reward, terminal):
        """
        Adds observations and updates by sampling from memories according to the update rate.
        DQFD samples from the online replay memory and the demo memory with
        the fractions controlled by the hyperparameter p, the 'expert sampling ratio'.

        Args:
            reward: Scalar reward of the current timestep.
            terminal: Boolean indicating whether the current state is terminal.
        """
        super(DQFDAgent, self).observe(reward=reward, terminal=terminal)

        if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0:
            for _ in xrange(self.repeat_update):
                batch = self.demo_memory.get_batch(batch_size=self.demo_batch_size, next_states=True)
                self.model.demonstration_update(
                    states={name: np.stack((batch['states'][name], batch['next_states'][name])) for name in batch['states']},
                    internals=batch['internals'],
                    actions=batch['actions'],
                    terminal=batch['terminal'],
                    reward=batch['reward']
                )

    def import_demonstrations(self, demonstrations):
        """
        Imports demonstrations, i.e. expert observations. Note that for large numbers of observations,
        set_demonstrations is more appropriate; it directly sets memory contents from arrays and expects
        a different layout.

        Args:
            demonstrations: List of observation dicts
        """
        for observation in demonstrations:
            if self.unique_state:
                state = dict(state=observation['states'])
            else:
                state = observation['states']
            if self.unique_action:
                action = dict(action=observation['actions'])
            else:
                action = observation['actions']

            self.demo_memory.add_observation(
                states=state,
                internals=observation['internals'],
                actions=action,
                terminal=observation['terminal'],
                reward=observation['reward']
            )

    def set_demonstrations(self, batch):
        """
        Set all demonstrations from batch data. Expects a dict whose values are arrays containing
        all states, actions, rewards, terminals and internals, respectively.

        Args:
            batch: Dict of demonstration data arrays.
        """
        self.demo_memory.set_memory(
            states=batch['states'],
            internals=batch['internals'],
            actions=batch['actions'],
            terminal=batch['terminal'],
            reward=batch['reward']
        )

    def pretrain(self, steps):
        """
        Computes pretrain updates.

        Args:
            steps: Number of updates to execute.

        """
        for _ in xrange(steps):
            # Sample from demo memory.
            batch = self.demo_memory.get_batch(batch_size=self.batch_size, next_states=True)

            # Update using both the double Q-learning loss and the supervised large-margin loss.
            self.model.demonstration_update(
                states={name: np.stack((batch['states'][name], batch['next_states'][name])) for name in batch['states']},
                internals=batch['internals'],
                actions=batch['actions'],
                terminal=batch['terminal'],
                reward=batch['reward']
            )
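
The expert_margin and supervised_weight parameters correspond to the large-margin supervised term J_E(Q) = max_a [Q(s, a) + l(a_E, a)] - Q(s, a_E) from the DQFD paper. The snippet below is only an illustrative NumPy sketch of that term, not the library's tf_demo_loss implementation:

import numpy as np

def large_margin_loss(q_values, expert_actions, expert_margin=0.5):
    # Margin l(a_E, a): expert_margin for a != a_E, zero for the expert action itself.
    batch_size, num_actions = q_values.shape
    margins = np.full((batch_size, num_actions), expert_margin)
    margins[np.arange(batch_size), expert_actions] = 0.0
    q_expert = q_values[np.arange(batch_size), expert_actions]
    # Zero when the expert action already dominates all others by the margin.
    return np.max(q_values + margins, axis=1) - q_expert

q = np.random.randn(4, 3)        # hypothetical Q-values: 4 samples, 3 actions
a_e = np.array([0, 2, 1, 0])     # expert actions per sample
supervised_term = 0.1 * large_margin_loss(q, a_e).mean()  # scaled by supervised_weight
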
Example #19
0
    def __init__(self,
                 states_spec,
                 actions_spec,
                 network_spec,
                 device=None,
                 session_config=None,
                 scope='dqfd',
                 saver_spec=None,
                 summary_spec=None,
                 distributed_spec=None,
                 optimizer=None,
                 discount=0.99,
                 variable_noise=None,
                 states_preprocessing_spec=None,
                 explorations_spec=None,
                 reward_preprocessing_spec=None,
                 distributions_spec=None,
                 entropy_regularization=None,
                 target_sync_frequency=10000,
                 target_update_weight=1.0,
                 huber_loss=None,
                 batched_observe=1000,
                 batch_size=32,
                 memory=None,
                 first_update=10000,
                 update_frequency=4,
                 repeat_update=1,
                 expert_margin=0.5,
                 supervised_weight=0.1,
                 demo_memory_capacity=10000,
                 demo_sampling_ratio=0.2):
        """
        Deep Q-learning from demonstration (DQFD) agent ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
        This agent uses DQN to pre-train from demonstration data in combination with a supervised loss.

        Args:
            states_spec: Dict containing at least one state definition. In the case of a single state,
               keys `shape` and `type` are necessary. For multiple states, pass a dict of dicts where each state
               is a dict itself with a unique name as its key.
            actions_spec: Dict containing at least one action definition. Actions have types and either `num_actions`
                for discrete actions or a `shape` for continuous actions. Consult documentation and tests for more.
            network_spec: List of layers specifying a neural network via layer types, sizes and optional arguments
                such as activation or regularisation. Full examples are in the examples/configs folder.
            device: Device string specifying model device.
            session_config: optional tf.ConfigProto with additional desired session configurations
            scope: TensorFlow scope, defaults to agent name (e.g. `dqn`).
            saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints are saved. Use
                either `seconds` or `steps` to specify how often the model should be saved. The `load` flag specifies
                if a model is initially loaded (set to True) from a file `file`.
            summary_spec: Dict specifying summaries for TensorBoard. Requires a 'directory' to store summaries, `steps`
                or `seconds` to specify how often to save summaries, and a list of `labels` to indicate which values
                to export, e.g. `losses`, `variables`. Consult neural network class and model for all available labels.
            distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and `replica_model`
                Boolean flags to indicate workers and parameter servers. Use a `cluster_spec` key to pass a TensorFlow
                cluster spec.
            optimizer: Dict specifying optimizer type and its optional parameters, typically a `learning_rate`.
                Available optimizer types include standard TensorFlow optimizers, `natural_gradient`,
                and `evolutionary`. Consult the optimizer test or example configurations for more.
            discount: Float specifying reward discount factor.
            variable_noise: Experimental optional parameter specifying variable noise (NoisyNet).
            states_preprocessing_spec: Optional list of state preprocessors (e.g. `image_resize`,
                `grayscale`) to apply to states.
            explorations_spec: Optional dict specifying action exploration type (epsilon greedy
                strategies or Gaussian noise).
            reward_preprocessing_spec: Optional dict specifying reward preprocessing.
            distributions_spec: Optional dict specifying action distributions to override default distribution choices.
                Must match action names.
            entropy_regularization: Optional positive float specifying an entropy regularization value.
            target_sync_frequency: Interval between optimization calls synchronizing the target network.
            target_update_weight: Update weight, 1.0 meaning a full assignment to target network from training network.
            huber_loss: Optional float specifying Huber-loss clipping.
            batched_observe: Optional int specifying how many observe calls are batched into one session run.
                Without batching, throughput will be lower because every `observe` triggers a session invocation to
                update rewards in the graph.
            batch_size: Int specifying batch size used to sample from memory. Should be smaller than memory size.
            memory: Dict describing memory via `type` (e.g. `replay`) and `capacity`.
            first_update: Int describing at which time step the first update is performed. Should be larger
                than batch size.
            update_frequency: Int specifying number of observe steps to perform until an update is executed.
            repeat_update: Int specifying how many update steps are performed per update, where each update step implies
                sampling a batch from the memory and passing it to the model.
            expert_margin: Positive float specifying enforced supervised margin between expert action Q-value and other
                Q-values.
            supervised_weight: Weight of supervised loss term.
            demo_memory_capacity: Int describing capacity of expert demonstration memory.
            demo_sampling_ratio: Runtime sampling ratio of expert data.
        """
        if network_spec is None:
            raise TensorForceError("No network_spec provided.")

        if optimizer is None:
            self.optimizer = dict(type='adam', learning_rate=1e-3)
        else:
            self.optimizer = optimizer
        if memory is None:
            memory = dict(type='replay', capacity=100000)
        self.memory = memory

        self.network_spec = network_spec
        self.device = device
        self.session_config = session_config
        self.scope = scope
        self.saver_spec = saver_spec
        self.summary_spec = summary_spec
        self.distributed_spec = distributed_spec
        self.discount = discount
        self.variable_noise = variable_noise
        self.states_preprocessing_spec = states_preprocessing_spec
        self.explorations_spec = explorations_spec
        self.reward_preprocessing_spec = reward_preprocessing_spec
        self.distributions_spec = distributions_spec
        self.entropy_regularization = entropy_regularization
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight
        self.huber_loss = huber_loss

        # DQFD always uses double dqn, which is a required key for a q-model.
        self.double_q_model = True
        self.demo_memory_capacity = demo_memory_capacity
        self.expert_margin = expert_margin
        self.supervised_weight = supervised_weight

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert to online training samples:
        # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(demo_sampling_ratio * batch_size /
                                   (1.0 - demo_sampling_ratio))

        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
                                         'demo_batch_size is positive. (Calculated {} based on current' \
                                         ' parameters)'.format(self.demo_batch_size)

        super(DQFDAgent, self).__init__(states_spec=states_spec,
                                        actions_spec=actions_spec,
                                        batched_observe=batched_observe,
                                        batch_size=batch_size,
                                        memory=memory,
                                        first_update=first_update,
                                        update_frequency=update_frequency,
                                        repeat_update=repeat_update)

        # This is the demonstration memory that we will fill with observations
        # before starting the main training loop.
        self.demo_memory = Replay(self.states_spec, self.actions_spec,
                                  self.demo_memory_capacity)
Example #20
0
class DQFDAgent(MemoryAgent):
    """
    Deep Q-learning from demonstration (DQFD) agent ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
    This agent uses DQN to pre-train from demonstration data.

    Configuration:

    Each agent requires the following configuration parameters:

    * `states`: dict containing one or more state definitions.
    * `actions`: dict containing one or more action definitions.
    * `preprocessing`: dict or list containing state preprocessing configuration.
    * `exploration`: dict containing action exploration configuration.

    Each model requires the following configuration parameters:

    * `discount`: float of discount factor (gamma).
    * `learning_rate`: float of learning rate (alpha).
    * `optimizer`: string of optimizer to use (e.g. 'adam').
    * `device`: string of tensorflow device name.
    * `tf_summary`: boolean indicating whether to use tensorflow summary file writer.
    * `log_level`: string containing log level (e.g. 'info').
    * `distributed`: boolean indicating whether to use distributed tensorflow.
    * `global_model`: global model.
    * `session`: session to use.


    The `DQFDAgent` class additionally requires the following parameters:


    * `batch_size`: integer of the batch size.
    * `memory_capacity`: integer of maximum experiences to store.
    * `memory`: string indicating memory type ('replay' or 'prioritized_replay').
    * `min_replay_size`: integer of minimum replay size before the first update.
    * `update_rate`: float of the update rate (e.g. 0.25 = every 4 steps).
    * `target_network_update_rate`: float of target network update rate (e.g. 0.01 = every 100 steps).
    * `use_target_network`: boolean indicating whether to use a target network.
    * `update_repeat`: integer of how many times to repeat an update.
    * `update_target_weight`: float of update target weight (tau parameter).
    * `demo_sampling_ratio`: float, ratio of expert data used at runtime to train from.
    * `supervised_weight`: float, weight of large margin classifier loss.
    * `expert_margin`: float of difference in Q-values between expert action and other actions enforced by the large margin function.
    * `clip_loss`: float; if not 0, uses the Huber loss with clip_loss as the linear bound.


    """

    name = 'DQFDAgent'
    model = DQFDModel

    default_config = dict(target_update_frequency=10000,
                          demo_memory_capacity=1000000,
                          demo_sampling_ratio=0.01)

    def __init__(self, config, model=None):
        config.default(DQFDAgent.default_config)
        super(DQFDAgent, self).__init__(config, model)
        self.target_update_frequency = config.target_update_frequency

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop
        self.demo_memory = Replay(config.demo_memory_capacity, config.states,
                                  config.actions)

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert to online training samples:
        # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(config.demo_sampling_ratio *
                                   config.batch_size /
                                   (1.0 - config.demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to make sure demo_batch_size is positive. (Calculated {} based on current parameters)'.format(
            self.demo_batch_size)

    def observe(self, reward, terminal):
        """Adds observations, updates via sampling from memories according to update rate.
        DQFD samples from the online replay memory and the demo memory with
        the fractions controlled by a hyper parameter p called 'expert sampling ratio.

        Args:
            reward:
            terminal:

        Returns:

        """
        super(DQFDAgent, self).observe(reward=reward, terminal=terminal)

        if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0:
            for _ in xrange(self.repeat_update):
                batch = self.demo_memory.get_batch(self.demo_batch_size)
                self.model.demonstration_update(batch=batch)

        if self.timestep >= self.first_update and self.timestep % self.target_update_frequency == 0:
            self.model.update_target()

    def import_demonstrations(self, demonstrations):
        """Imports demonstrations, i.e. expert observations

        Args:
            demonstrations:

        Returns:

        """
        for observation in demonstrations:
            if self.unique_state:
                state = dict(state=observation['state'])
            else:
                state = observation['state']
            if self.unique_action:
                action = dict(action=observation['action'])
            else:
                action = observation['action']
            self.demo_memory.add_observation(state=state,
                                             action=action,
                                             reward=observation['reward'],
                                             terminal=observation['terminal'],
                                             internal=observation['internal'])

    def pretrain(self, steps):
        """Computes pretrain updates.

        Args:
            steps: Number of updates to execute.

        Returns:

        """
        for _ in xrange(steps):
            # Sample from demo memory
            batch = self.demo_memory.get_batch(batch_size=self.batch_size,
                                               next_states=True)

            # Update using both the double Q-learning loss and the supervised large-margin loss
            self.model.demonstration_update(batch)
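
For this older API, import_demonstrations expects observation dicts with singular keys ('state', 'action', 'reward', 'terminal', 'internal'), unlike the plural keys of the newer examples above. A hypothetical demonstration list might look like this (assuming a configured agent instance):

demonstrations = [
    dict(
        state=[0.1, 0.0, -0.3, 0.2],  # raw state; wrapped into dict(state=...) when unique_state is set
        action=1,
        reward=1.0,
        terminal=False,
        internal=[]                   # no internal RNN state in this hypothetical example
    )
]
agent.import_demonstrations(demonstrations)
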
Example #21
0
class DQFDAgent(MemoryAgent):
    """
    Deep Q-learning from demonstration (DQFD) agent ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
    This agent uses DQN to pre-train from demonstration data.

    Configuration:

    Each agent requires the following configuration parameters:

    * `states`: dict containing one or more state definitions.
    * `actions`: dict containing one or more action definitions.
    * `preprocessing`: dict or list containing state preprocessing configuration.
    * `exploration`: dict containing action exploration configuration.

    Each model requires the following configuration parameters:

    * `discount`: float of discount factor (gamma).
    * `learning_rate`: float of learning rate (alpha).
    * `optimizer`: string of optimizer to use (e.g. 'adam').
    * `optimizer_args`: list of arguments for optimizer.
    * `optimizer_kwargs`: dict of keyword arguments for optimizer.
    * `device`: string of tensorflow device name.
    * `tf_saver`: boolean whether to save model parameters.
    * `tf_summary`: boolean indicating whether to use tensorflow summary file writer.
    * `log_level`: string containing log level (e.g. 'info').
    * `distributed`: boolean indicating whether to use distributed tensorflow.
    * `global_model`: global model.
    * `session`: session to use.


    The `DQFDAgent` class additionally requires the following parameters:


    * `batch_size`: integer of the batch size.
    * `memory_capacity`: integer of maximum experiences to store.
    * `memory`: string indicating memory type ('replay' or 'prioritized_replay').
    * `memory_args`: list of arguments to pass to replay memory constructor.
    * `memory_kwargs`: list of keyword arguments to pass to replay memory constructor.
    * `min_replay_size`: integer of minimum replay size before the first update.
    * `update_rate`: float of the update rate (e.g. 0.25 = every 4 steps).
    * `target_network_update_rate`: float of target network update rate (e.g. 0.01 = every 100 steps).
    * `use_target_network`: boolean indicating whether to use a target network.
    * `update_repeat`: integer of how many times to repeat an update.
    * `update_target_weight`: float of update target weight (tau parameter).
    * `demo_sampling_ratio`: float, ratio of expert data used at runtime to train from.
    * `supervised_weight`: float, weight of large margin classifier loss.
    * `expert_margin`: float of difference in Q-values between expert action and other actions enforced by the large margin function.
    * `clip_gradients`: float of maximum values for gradients before clipping.


    """

    name = 'DQFDAgent'
    model = DQFDModel
    default_config = dict(
        target_update_frequency=10000,
        demo_memory_capacity=1000000,
        demo_sampling_ratio=0.01
    )

    def __init__(self, config):
        config.default(DQFDAgent.default_config)
        super(DQFDAgent, self).__init__(config)
        self.target_update_frequency = config.target_update_frequency

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop
        self.demo_memory = Replay(config.demo_memory_capacity, config.states, config.actions)

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert to online training samples:
        # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(config.demo_sampling_ratio * config.batch_size / (1.0 - config.demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to make sure demo_batch_size is positive.' \
                                         '(Calculated {} based on current parameters)'.format(self.demo_batch_size)

    def observe(self, reward, terminal):
        """Adds observations, updates via sampling from memories according to update rate.
        DQFD samples from the online replay memory and the demo memory with
        the fractions controlled by a hyper parameter p called 'expert sampling ratio.
        
        Args:
            reward: 
            terminal: 

        Returns:

        """
        super(DQFDAgent, self).observe(reward=reward, terminal=terminal)

        if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0:
            for _ in xrange(self.repeat_update):
                batch = self.demo_memory.get_batch(self.demo_batch_size)
                self.model.demonstration_update(batch=batch)

        if self.timestep >= self.first_update and self.timestep % self.target_update_frequency == 0:
            self.model.update_target()

    def import_demonstrations(self, demonstrations):
        """Imports demonstrations, i.e. expert observations

        Args:
            demonstrations: 

        Returns:

        """
        for observation in demonstrations:
            if self.unique_state:
                state = dict(state=observation['state'])
            else:
                state = observation['state']
            if self.unique_action:
                action = dict(action=observation['action'])
            else:
                action = observation['action']
            self.demo_memory.add_observation(
                state=state,
                action=action,
                reward=observation['reward'],
                terminal=observation['terminal'],
                internal=observation['internal']
            )

    def pretrain(self, steps):
        """Computes pretrain updates.
        
        Args:
            steps: Number of updates to execute.

        Returns:

        """
        for _ in xrange(steps):
            # Sample from demo memory
            batch = self.demo_memory.get_batch(self.batch_size)

            # Update using both the double Q-learning loss and the supervised large-margin loss
            self.model.demonstration_update(batch)
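
One practical note on the defaults in the last two examples: with demo_sampling_ratio at its default of 0.01, a small batch size truncates the computed demo_batch_size to zero and trips the assertion, as a quick check shows (a batch size of 32 is an assumed illustration, not a value from the config above):

demo_sampling_ratio = 0.01
batch_size = 32
demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))
print(demo_batch_size)  # 0 -> the assert above fails; increase the ratio or the batch size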