Example #1
    def initialize(self, custom_getter):
        super(QDemoModel, self).initialize(custom_getter=custom_getter)

        self.demo_memory = Replay(states=self.states_spec,
                                  internals=self.internals_spec,
                                  actions=self.actions_spec,
                                  include_next_states=True,
                                  capacity=self.demo_memory_capacity,
                                  scope='demo-replay',
                                  summary_labels=self.summary_labels)

        # Import demonstration experience.
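        # (tf.make_template wraps the given function so that the variables it creates are
        # defined once and shared across all subsequent calls, with the provided custom
        # getter applied when those variables are created.)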
        self.fn_import_demo_experience = tf.make_template(
            name_='import-demo-experience',
            func_=self.tf_import_demo_experience,
            custom_getter_=custom_getter)

        # Demonstration loss.
        self.fn_demo_loss = tf.make_template(name_='demo-loss',
                                             func_=self.tf_demo_loss,
                                             custom_getter_=custom_getter)

        # Combined loss.
        self.fn_combined_loss = tf.make_template(name_='combined-loss',
                                                 func_=self.tf_combined_loss,
                                                 custom_getter_=custom_getter)

        # Demonstration optimization.
        self.fn_demo_optimization = tf.make_template(
            name_='demo-optimization',
            func_=self.tf_demo_optimization,
            custom_getter_=custom_getter)
Example #2
    def __init__(self, states_spec, actions_spec, network_spec, config):
        self.network_spec = network_spec
        config = config.copy()
        config.default(DQFDAgent.default_config)

        # DQFD always uses double dqn, which is a required key for a q-model.
        config.obligatory(double_dqn=True)
        self.target_update_frequency = config.target_update_frequency
        self.demo_memory_capacity = config.demo_memory_capacity

        # The demo_sampling_ratio, called p in paper, controls ratio of expert vs online training samples
        # p = n_demo / (n_demo + n_replay) => n_demo  = p * n_replay / (1 - p)
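        #   e.g. with demo_sampling_ratio p = 0.2 and batch_size = 32 (the defaults used in the
        #   later agent constructors), demo_batch_size = int(0.2 * 32 / 0.8) = 8.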
        self.demo_batch_size = int(config.demo_sampling_ratio * config.batch_size / (1.0 - config.demo_sampling_ratio))

        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
                                         'demo_batch_size is positive. (Calculated {} based on current' \
                                         ' parameters)'.format(self.demo_batch_size)

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop.
        self.demo_memory = Replay(self.demo_memory_capacity, states_spec, actions_spec)

        super(DQFDAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            config=config
        )
Example #3
    def __init__(self, config):
        config.default(PPOModel.default_config)
        super(PPOModel, self).__init__(config)
        self.epochs = config.epochs
        self.optimizer_batch_size = config.optimizer_batch_size
        # Use replay memory so memory logic can be used to sample batches
        self.memory = Replay(config.batch_size, config.states, config.actions,
                             config.random_sampling)
Example #4
    def __init__(self, config):
        config.default(PPOModel.default_config)
        super(PPOModel, self).__init__(config)
        self.optimizer_batch_size = config.optimizer_batch_size
        self.batch_size = config.batch_size
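        # One optimizer update per (batch_size / optimizer_batch_size) minibatch slice,
        # repeated over `epochs` passes; e.g. batch_size=4096, optimizer_batch_size=512,
        # epochs=10 (illustrative values) gives 8 * 10 = 80 updates.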
        self.updates = int(
            config.batch_size / self.optimizer_batch_size) * config.epochs
        if self.batch_size % self.optimizer_batch_size != 0:
            raise TensorForceError(
                'batch_size must be a multiple of optimizer_batch_size')
        # Use replay memory as a cache so it can be used to sample minibatches
        self.memory = Replay(config.batch_size, config.states, config.actions,
                             config.random_sampling)
Example #5
    def __init__(self, config, model=None):
        config.default(DQFDAgent.default_config)
        super(DQFDAgent, self).__init__(config, model)
        self.target_update_frequency = config.target_update_frequency

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop
        self.demo_memory = Replay(config.demo_memory_capacity, config.states, config.actions)

        # The demo_sampling_ratio, called p in paper, controls ratio of expert vs online training samples
        # p = n_demo / (n_demo + n_replay) => n_demo  = p * n_replay / (1 - p)
        self.demo_batch_size = int(config.demo_sampling_ratio * config.batch_size / (1.0 - config.demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to make sure demo_batch_size is positive. (Calculated {} based on current parameters)'.format(self.demo_batch_size)
Example #6
    def __init__(self, config):
        config.default(PPOModel.default_config)
        super(PPOModel, self).__init__(config)
        self.optimizer_batch_size = config.optimizer_batch_size
        if self.optimizer_batch_size > config.batch_size:
            raise Exception(
                "optimizer_batch_size > batch_size ({}, {})".format(
                    self.optimizer_batch_size, config.batch_size))
        self.updates = int(
            config.batch_size / self.optimizer_batch_size) * config.epochs

        # Use replay memory so memory logic can be used to sample batches
        self.memory = Replay(config.batch_size, config.states, config.actions,
                             config.random_sampling)
Example #7
    def __init__(self, config):
        config.default(DQFDAgent.default_config)
        super(DQFDAgent, self).__init__(config)
        self.target_update_frequency = config.target_update_frequency

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop
        self.demo_memory = Replay(config.demo_memory_capacity, config.states,
                                  config.actions)

        # The demo_sampling_ratio, called p in paper, controls ratio of expert vs online training samples
        # p = n_demo / (n_demo + n_replay) => n_demo  = p * n_replay / (1 - p)
        self.demo_batch_size = int(config.demo_sampling_ratio *
                                   config.batch_size /
                                   (1.0 - config.demo_sampling_ratio))
Example #8
    def setup_components_and_tf_funcs(self, custom_getter=None):
        """
        Constructs the extra Replay memory.
        """
        custom_getter = super(QDemoModel, self).setup_components_and_tf_funcs(custom_getter)
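        # The parent call builds the base model's components and returns the custom getter,
        # which is reused below so the demonstration-specific templates create their
        # variables in the same way.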

        self.demo_memory = Replay(
            states=self.states_spec,
            internals=self.internals_spec,
            actions=self.actions_spec,
            include_next_states=True,
            capacity=self.demo_memory_capacity,
            scope='demo-replay',
            summary_labels=self.summary_labels
        )

        # Import demonstration experience.
        self.fn_import_demo_experience = tf.make_template(
            name_='import-demo-experience',
            func_=self.tf_import_demo_experience,
            custom_getter_=custom_getter
        )

        # Demonstration loss.
        self.fn_demo_loss = tf.make_template(
            name_='demo-loss',
            func_=self.tf_demo_loss,
            custom_getter_=custom_getter
        )

        # Combined loss.
        self.fn_combined_loss = tf.make_template(
            name_='combined-loss',
            func_=self.tf_combined_loss,
            custom_getter_=custom_getter
        )

        # Demonstration optimization.
        self.fn_demo_optimization = tf.make_template(
            name_='demo-optimization',
            func_=self.tf_demo_optimization,
            custom_getter_=custom_getter
        )

        return custom_getter
Example #9
    def __init__(self,
                 states_spec,
                 actions_spec,
                 network_spec,
                 device=None,
                 scope='dqfd',
                 saver_spec=None,
                 summary_spec=None,
                 distributed_spec=None,
                 optimizer=None,
                 discount=0.99,
                 normalize_rewards=False,
                 variable_noise=None,
                 distributions_spec=None,
                 entropy_regularization=None,
                 target_sync_frequency=10000,
                 target_update_weight=1.0,
                 huber_loss=None,
                 preprocessing=None,
                 exploration=None,
                 reward_preprocessing=None,
                 batched_observe=1000,
                 batch_size=32,
                 memory=None,
                 first_update=10000,
                 update_frequency=4,
                 repeat_update=1,
                 expert_margin=0.5,
                 supervised_weight=0.1,
                 demo_memory_capacity=10000,
                 demo_sampling_ratio=0.2):
        """
        Deep Q-learning from demonstration (DQFD) agent ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
        This agent uses DQN to pre-train from demonstration data in combination with a supervised loss.

        Args:
            states_spec:
            actions_spec:
            network_spec:
            device:
            scope:
            saver_spec:
            summary_spec:
            distributed_spec:
            optimizer:
            discount:
            normalize_rewards:
            variable_noise:
            distributions_spec:
            entropy_regularization:
            target_sync_frequency:
            target_update_weight:
            huber_loss:
            preprocessing:
            exploration:
            reward_preprocessing:
            batched_observe:
            batch_size:
            memory:
            first_update:
            update_frequency:
            repeat_update:
            expert_margin:
            supervised_weight:
            demo_memory_capacity:
            demo_sampling_ratio:
        """
        if network_spec is None:
            raise TensorForceError("No network_spec provided.")

        if optimizer is None:
            self.optimizer = dict(type='adam', learning_rate=1e-3)
        else:
            self.optimizer = optimizer
        if memory is None:
            memory = dict(type='replay', capacity=100000)
        self.memory = memory

        self.network_spec = network_spec
        self.device = device
        self.scope = scope
        self.saver_spec = saver_spec
        self.summary_spec = summary_spec
        self.distributed_spec = distributed_spec
        self.discount = discount
        self.normalize_rewards = normalize_rewards
        self.variable_noise = variable_noise
        self.distributions_spec = distributions_spec
        self.entropy_regularization = entropy_regularization
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight
        self.huber_loss = huber_loss

        # DQFD always uses double dqn, which is a required key for a q-model.
        self.double_q_model = True
        self.demo_memory_capacity = demo_memory_capacity
        self.expert_margin = expert_margin
        self.supervised_weight = supervised_weight

        # The demo_sampling_ratio, called p in paper, controls ratio of expert vs online training samples
        # p = n_demo / (n_demo + n_replay) => n_demo  = p * n_replay / (1 - p)
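        #   With the constructor defaults demo_sampling_ratio = 0.2 and batch_size = 32 above,
        #   this gives demo_batch_size = int(0.2 * 32 / 0.8) = 8.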
        self.demo_batch_size = int(demo_sampling_ratio * batch_size /
                                   (1.0 - demo_sampling_ratio))

        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
                                         'demo_batch_size is positive. (Calculated {} based on current' \
                                         ' parameters)'.format(self.demo_batch_size)

        super(DQFDAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            preprocessing=preprocessing,
            exploration=exploration,
            reward_preprocessing=reward_preprocessing,
            batched_observe=batched_observe,
            batch_size=batch_size,
            memory=memory,
            first_update=first_update,
            update_frequency=update_frequency,
            repeat_update=repeat_update
        )

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop.
        self.demo_memory = Replay(self.states_spec, self.actions_spec,
                                  self.demo_memory_capacity)
Example #10
    def __init__(self,
                 states_spec,
                 actions_spec,
                 network_spec,
                 device=None,
                 session_config=None,
                 scope='dqfd',
                 saver_spec=None,
                 summary_spec=None,
                 distributed_spec=None,
                 optimizer=None,
                 discount=0.99,
                 variable_noise=None,
                 states_preprocessing_spec=None,
                 explorations_spec=None,
                 reward_preprocessing_spec=None,
                 distributions_spec=None,
                 entropy_regularization=None,
                 target_sync_frequency=10000,
                 target_update_weight=1.0,
                 huber_loss=None,
                 batched_observe=1000,
                 batch_size=32,
                 memory=None,
                 first_update=10000,
                 update_frequency=4,
                 repeat_update=1,
                 expert_margin=0.5,
                 supervised_weight=0.1,
                 demo_memory_capacity=10000,
                 demo_sampling_ratio=0.2):
        """
        Deep Q-learning from demonstration (DQFD) agent ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
        This agent uses DQN to pre-train from demonstration data in combination with a supervised loss.

        Args:
            states_spec: Dict containing at least one state definition. In the case of a single state,
               keys `shape` and `type` are necessary. For multiple states, pass a dict of dicts where each state
               is a dict itself with a unique name as its key.
            actions_spec: Dict containing at least one action definition. Actions have types and either `num_actions`
                for discrete actions or a `shape` for continuous actions. Consult documentation and tests for more.
            network_spec: List of layers specifying a neural network via layer types, sizes and optional arguments
                such as activation or regularisation. Full examples are in the examples/configs folder.
            device: Device string specifying model device.
            session_config: optional tf.ConfigProto with additional desired session configurations
            scope: TensorFlow scope, defaults to agent name (e.g. `dqn`).
            saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints are saved. Use
                either `seconds` or `steps` to specify how often the model should be saved. The `load` flag specifies
                if a model is initially loaded (set to True) from a file `file`.
            summary_spec: Dict specifying summaries for TensorBoard. Requires a 'directory' to store summaries, `steps`
                or `seconds` to specify how often to save summaries, and a list of `labels` to indicate which values
                to export, e.g. `losses`, `variables`. Consult neural network class and model for all available labels.
            distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and `replica_model`
                Boolean flags to indicate workers and parameter servers. Use a `cluster_spec` key to pass a TensorFlow
                cluster spec.
            optimizer: Dict specifying optimizer type and its optional parameters, typically a `learning_rate`.
                Available optimizer types include standard TensorFlow optimizers, `natural_gradient`,
                and `evolutionary`. Consult the optimizer test or example configurations for more.
            discount: Float specifying reward discount factor.
            variable_noise: Experimental optional parameter specifying variable noise (NoisyNet).
            states_preprocessing_spec: Optional list of state preprocessors to apply to states
                (e.g. `image_resize`, `grayscale`).
            explorations_spec: Optional dict specifying action exploration type (epsilon greedy
                or Gaussian noise).
            reward_preprocessing_spec: Optional dict specifying reward preprocessing.
            distributions_spec: Optional dict specifying action distributions to override default distribution choices.
                Must match action names.
            entropy_regularization: Optional positive float specifying an entropy regularization value.
            target_sync_frequency: Interval between optimization calls synchronizing the target network.
            target_update_weight: Update weight, 1.0 meaning a full assignment to target network from training network.
            huber_loss: Optional float specifying Huber-loss clipping.
            batched_observe: Optional int specifying how many observe calls are batched into one session run.
                Without batching, throughput will be lower because every `observe` triggers a session invocation to
                update rewards in the graph.
            batch_size: Int specifying batch size used to sample from memory. Should be smaller than memory size.
            memory: Dict describing memory via `type` (e.g. `replay`) and `capacity`.
            first_update: Int describing at which time step the first update is performed. Should be larger
                than batch size.
            update_frequency: Int specifying number of observe steps to perform until an update is executed.
            repeat_update: Int specifying how many update steps are performed per update, where each update step implies
                sampling a batch from the memory and passing it to the model.
            expert_margin: Positive float specifying enforced supervised margin between expert action Q-value and other
                Q-values.
            supervised_weight: Weight of supervised loss term.
            demo_memory_capacity: Int describing capacity of expert demonstration memory.
            demo_sampling_ratio: Runtime sampling ratio of expert data.
        """
        if network_spec is None:
            raise TensorForceError("No network_spec provided.")

        if optimizer is None:
            self.optimizer = dict(type='adam', learning_rate=1e-3)
        else:
            self.optimizer = optimizer
        if memory is None:
            memory = dict(type='replay', capacity=100000)
        self.memory = memory

        self.network_spec = network_spec
        self.device = device
        self.session_config = session_config
        self.scope = scope
        self.saver_spec = saver_spec
        self.summary_spec = summary_spec
        self.distributed_spec = distributed_spec
        self.discount = discount
        self.variable_noise = variable_noise
        self.states_preprocessing_spec = states_preprocessing_spec
        self.explorations_spec = explorations_spec
        self.reward_preprocessing_spec = reward_preprocessing_spec
        self.distributions_spec = distributions_spec
        self.entropy_regularization = entropy_regularization
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight
        self.huber_loss = huber_loss

        # DQFD always uses double dqn, which is a required key for a q-model.
        self.double_q_model = True
        self.demo_memory_capacity = demo_memory_capacity
        self.expert_margin = expert_margin
        self.supervised_weight = supervised_weight

        # The demo_sampling_ratio, called p in paper, controls ratio of expert vs online training samples
        # p = n_demo / (n_demo + n_replay) => n_demo  = p * n_replay / (1 - p)
        self.demo_batch_size = int(demo_sampling_ratio * batch_size /
                                   (1.0 - demo_sampling_ratio))

        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
                                         'demo_batch_size is positive. (Calculated {} based on current' \
                                         ' parameters)'.format(self.demo_batch_size)

        super(DQFDAgent, self).__init__(states_spec=states_spec,
                                        actions_spec=actions_spec,
                                        batched_observe=batched_observe,
                                        batch_size=batch_size,
                                        memory=memory,
                                        first_update=first_update,
                                        update_frequency=update_frequency,
                                        repeat_update=repeat_update)

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop.
        self.demo_memory = Replay(self.states_spec, self.actions_spec,
                                  self.demo_memory_capacity)
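
For reference, a minimal usage sketch of the constructor shown in Example #10. The spec values and the import path (tensorforce.agents) are assumptions for illustration, not taken from these snippets; only keyword arguments that appear in the signature above are used:

    from tensorforce.agents import DQFDAgent

    # Hypothetical single-state, discrete-action setup following the docstring's spec format.
    agent = DQFDAgent(
        states_spec=dict(shape=(4,), type='float'),
        actions_spec=dict(type='int', num_actions=2),
        network_spec=[dict(type='dense', size=32), dict(type='dense', size=32)],
        batch_size=32,
        demo_memory_capacity=10000,
        demo_sampling_ratio=0.2  # => demo_batch_size = int(0.2 * 32 / 0.8) = 8
    )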
Example #11
    def __init__(
            self,
            states_spec,
            actions_spec,
            batched_observe=1000,
            scope='dqfd',
            # parameters specific to LearningAgents
            summary_spec=None,
            network_spec=None,
            device=None,
            session_config=None,
            saver_spec=None,
            distributed_spec=None,
            optimizer=None,
            discount=0.99,
            variable_noise=None,
            states_preprocessing_spec=None,
            explorations_spec=None,
            reward_preprocessing_spec=None,
            distributions_spec=None,
            entropy_regularization=None,
            # parameters specific to MemoryAgents
            batch_size=32,
            memory=None,
            first_update=10000,
            update_frequency=4,
            repeat_update=1,
            # parameters specific to DQFD agents
            target_sync_frequency=10000,
            target_update_weight=1.0,
            huber_loss=None,
            expert_margin=0.5,
            supervised_weight=0.1,
            demo_memory_capacity=10000,
            demo_sampling_ratio=0.2):
        """
        Deep Q-learning from demonstration (DQFD) agent ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
        This agent uses DQN to pre-train from demonstration data in combination with a supervised loss.

        Args:
            target_sync_frequency: Interval between optimization calls synchronizing the target network.
            target_update_weight: Update weight, 1.0 meaning a full assignment to target network from training network.
            huber_loss: Optional float specifying Huber-loss clipping.
            expert_margin: Positive float specifying enforced supervised margin between expert action Q-value and other
                Q-values.
            supervised_weight: Weight of supervised loss term.
            demo_memory_capacity: Int describing capacity of expert demonstration memory.
            demo_sampling_ratio: Runtime sampling ratio of expert data.
        """
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight
        self.huber_loss = huber_loss

        self.expert_margin = expert_margin
        self.supervised_weight = supervised_weight

        super(DQFDAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            batched_observe=batched_observe,
            scope=scope,
            # parameters specific to LearningAgent
            summary_spec=summary_spec,
            network_spec=network_spec,
            discount=discount,
            device=device,
            session_config=session_config,
            saver_spec=saver_spec,
            distributed_spec=distributed_spec,
            optimizer=optimizer,
            variable_noise=variable_noise,
            states_preprocessing_spec=states_preprocessing_spec,
            explorations_spec=explorations_spec,
            reward_preprocessing_spec=reward_preprocessing_spec,
            distributions_spec=distributions_spec,
            entropy_regularization=entropy_regularization,
            # parameters specific to MemoryAgents
            batch_size=batch_size,
            memory=memory,
            first_update=first_update,
            update_frequency=update_frequency,
            repeat_update=repeat_update)

        # The demo_sampling_ratio, called p in paper, controls ratio of expert vs online training samples
        # p = n_demo / (n_demo + n_replay) => n_demo  = p * n_replay / (1 - p)
        self.demo_memory_capacity = demo_memory_capacity
        self.demo_batch_size = int(demo_sampling_ratio * batch_size /
                                   (1.0 - demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
                                         'demo_batch_size is positive. (Calculated {} based on current' \
                                         ' parameters)'.format(self.demo_batch_size)

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop
        self.demo_memory = Replay(self.states_spec, self.actions_spec,
                                  self.demo_memory_capacity)