def __init__(self, config):
    config.default(PPOModel.default_config)
    super(PPOModel, self).__init__(config)
    self.epochs = config.epochs
    self.optimizer_batch_size = config.optimizer_batch_size

    # Use replay memory so memory logic can be used to sample batches
    self.memory = Replay(config.batch_size, config.states, config.actions, config.random_sampling)
def __init__(self, config, model=None):
    config.default(DQFDAgent.default_config)
    super(DQFDAgent, self).__init__(config, model)
    self.target_update_frequency = config.target_update_frequency

    # This is the demonstration memory that we will fill with observations before starting
    # the main training loop.
    self.demo_memory = Replay(config.demo_memory_capacity, config.states, config.actions)

    # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs online training samples:
    # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
    self.demo_batch_size = int(config.demo_sampling_ratio * config.batch_size / (1.0 - config.demo_sampling_ratio))
    assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to make sure demo_batch_size is positive. ' \
        '(Calculated {} based on current parameters)'.format(self.demo_batch_size)
def __init__(self, config):
    config.default(PPOModel.default_config)
    super(PPOModel, self).__init__(config)
    self.optimizer_batch_size = config.optimizer_batch_size
    self.batch_size = config.batch_size
    self.updates = int(config.batch_size / self.optimizer_batch_size) * config.epochs
    if self.batch_size % self.optimizer_batch_size != 0:
        raise TensorForceError('batch_size must be a multiple of optimizer_batch_size')

    # Use replay memory as a cache so it can be used to sample minibatches
    self.memory = Replay(config.batch_size, config.states, config.actions, config.random_sampling)
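# Quick illustration of the update count computed above (hypothetical values, not from the source):
batch_size = 2560            # on-policy batch collected per update() call
optimizer_batch_size = 128   # minibatch size for each SGD step
epochs = 10                  # passes over the on-policy batch

assert batch_size % optimizer_batch_size == 0
updates = int(batch_size / optimizer_batch_size) * epochs
print(updates)  # 20 minibatches per epoch * 10 epochs = 200 SGD steps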
def setup_components_and_tf_funcs(self, custom_getter=None):
    """
    Constructs the extra Replay memory.
    """
    custom_getter = super(QDemoModel, self).setup_components_and_tf_funcs(custom_getter)

    self.demo_memory = Replay(
        states=self.states_spec,
        internals=self.internals_spec,
        actions=self.actions_spec,
        include_next_states=True,
        capacity=self.demo_memory_capacity,
        scope='demo-replay',
        summary_labels=self.summary_labels
    )

    # Import demonstration experience.
    self.fn_import_demo_experience = tf.make_template(
        name_='import-demo-experience',
        func_=self.tf_import_demo_experience,
        custom_getter_=custom_getter
    )

    # Demonstration loss.
    self.fn_demo_loss = tf.make_template(
        name_='demo-loss',
        func_=self.tf_demo_loss,
        custom_getter_=custom_getter
    )

    # Combined loss.
    self.fn_combined_loss = tf.make_template(
        name_='combined-loss',
        func_=self.tf_combined_loss,
        custom_getter_=custom_getter
    )

    # Demonstration optimization.
    self.fn_demo_optimization = tf.make_template(
        name_='demo-optimization',
        func_=self.tf_demo_optimization,
        custom_getter_=custom_getter
    )

    return custom_getter
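# Minimal sketch of the tf.make_template pattern used above (TensorFlow 1.x assumed). A template wraps
# a Python graph-building function so that variables it creates are created once and then shared on
# later calls; the `scale` function here is purely illustrative and not part of the model.
import tensorflow as tf

def scale(x):
    w = tf.get_variable('w', shape=(), initializer=tf.ones_initializer())  # created once, then reused
    return w * x

scale_fn = tf.make_template(name_='scale', func_=scale)
a = scale_fn(tf.constant(2.0))  # creates variable scale/w
b = scale_fn(tf.constant(3.0))  # reuses scale/w instead of creating a new variable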
def __init__(self, config):
    config.default(DQFDAgent.default_config)
    super(DQFDAgent, self).__init__(config)
    self.target_update_frequency = config.target_update_frequency

    # This is the demonstration memory that we will fill with observations before starting
    # the main training loop.
    self.demo_memory = Replay(config.demo_memory_capacity, config.states, config.actions)

    # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs online training samples:
    # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
    self.demo_batch_size = int(config.demo_sampling_ratio * config.batch_size / (1.0 - config.demo_sampling_ratio))
    assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to make sure demo_batch_size is positive. ' \
        '(Calculated {} based on current parameters)'.format(self.demo_batch_size)
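# Worked example of the sampling-ratio formula above (illustrative numbers, not from the source):
# with p = demo_sampling_ratio and n_replay = batch_size, n_demo = p * n_replay / (1 - p).
demo_sampling_ratio = 0.2   # p
batch_size = 32             # online samples per update (n_replay)
demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))
print(demo_batch_size)                                        # 8
print(demo_batch_size / float(demo_batch_size + batch_size))  # 0.2, i.e. p is recovered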
class PPOModel(PolicyGradientModel):
    allows_discrete_actions = True
    allows_continuous_actions = True

    default_config = dict(
        entropy_penalty=0.01,
        loss_clipping=0.1,          # Trust region clipping
        epochs=10,                  # Number of training epochs for SGD
        optimizer_batch_size=128,   # Batch size for optimizer
        random_sampling=True        # Sampling strategy for replay memory
    )

    def __init__(self, config):
        config.default(PPOModel.default_config)
        super(PPOModel, self).__init__(config)
        self.optimizer_batch_size = config.optimizer_batch_size
        # Use replay memory so memory logic can be used to sample batches
        self.updates = int(config.batch_size / self.optimizer_batch_size) * config.epochs
        self.memory = Replay(config.batch_size, config.states, config.actions, config.random_sampling)

    def create_tf_operations(self, config):
        """
        Creates PPO training operations, i.e. the SGD update based on the trust region loss.

        :return:
        """
        super(PPOModel, self).create_tf_operations(config)

        with tf.variable_scope('update'):
            prob_ratios = list()
            entropy_penalties = list()
            kl_divergences = list()
            entropies = list()

            for name, action in self.action.items():
                distribution = self.distribution[name]
                prev_distribution = tuple(
                    tf.placeholder(dtype=tf.float32, shape=util.shape(x, unknown=None))
                    for x in distribution
                )
                self.internal_inputs.extend(prev_distribution)
                self.internal_outputs.extend(distribution)
                self.internal_inits.extend(np.zeros(shape=util.shape(x)[1:]) for x in distribution)
                prev_distribution = self.distribution[name].__class__.from_tensors(
                    parameters=prev_distribution,
                    deterministic=self.deterministic
                )

                shape_size = util.prod(config.actions[name].shape)

                # Standard policy gradient log likelihood computation
                log_prob = distribution.log_probability(action=action)
                prev_log_prob = prev_distribution.log_probability(action=action)
                log_prob_diff = tf.minimum(x=(log_prob - prev_log_prob), y=10.0)
                prob_ratio = tf.exp(x=log_prob_diff)
                prob_ratio = tf.reshape(tensor=prob_ratio, shape=(-1, shape_size))
                prob_ratios.append(prob_ratio)

                entropy = distribution.entropy()
                entropy_penalty = -config.entropy_penalty * entropy
                entropy_penalty = tf.reshape(tensor=entropy_penalty, shape=(-1, shape_size))
                entropy_penalties.append(entropy_penalty)

                entropy = tf.reshape(tensor=entropy, shape=(-1, shape_size))
                entropies.append(entropy)

                kl_divergence = distribution.kl_divergence(prev_distribution)
                kl_divergence = tf.reshape(tensor=kl_divergence, shape=(-1, shape_size))
                kl_divergences.append(kl_divergence)

            # The surrogate loss in PPO is the minimum of the clipped loss and
            # target advantage * prob_ratio, which is the CPO loss.
            # Presentation on conservative policy iteration:
            # https://www.cs.cmu.edu/~jcl/presentation/RL/RL.ps
            prob_ratio = tf.reduce_mean(input_tensor=tf.concat(values=prob_ratios, axis=1), axis=1)
            prob_ratio = tf.clip_by_value(prob_ratio, 1.0 - config.loss_clipping, 1.0 + config.loss_clipping)
            self.loss_per_instance = -prob_ratio * self.reward
            self.surrogate_loss = tf.reduce_mean(input_tensor=self.loss_per_instance, axis=0)
            tf.losses.add_loss(self.surrogate_loss)

            # Mean over actions, mean over batch
            entropy_penalty = tf.reduce_mean(input_tensor=tf.concat(values=entropy_penalties, axis=1), axis=1)
            self.entropy_penalty = tf.reduce_mean(input_tensor=entropy_penalty, axis=0)
            tf.losses.add_loss(self.entropy_penalty)

            entropy = tf.reduce_mean(input_tensor=tf.concat(values=entropies, axis=1), axis=1)
            self.entropy = tf.reduce_mean(input_tensor=entropy, axis=0)

            kl_divergence = tf.reduce_mean(input_tensor=tf.concat(values=kl_divergences, axis=1), axis=1)
            self.kl_divergence = tf.reduce_mean(input_tensor=kl_divergence, axis=0)

    def update(self, batch):
        """
        Computes the update for one batch of experiences using generalized advantage estimation
        and the trust region update based on SGD on the clipped loss.

        :param batch: On-policy batch of experiences.
        :return:
        """
        # Compute GAE.
        self.advantage_estimation(batch)

        if self.baseline:
            self.baseline.update(states=batch['states'], returns=batch['returns'])

        # Set memory contents to batch contents
        self.memory.set_memory(
            states=batch['states'],
            actions=batch['actions'],
            rewards=batch['rewards'],
            terminals=batch['terminals'],
            internals=batch['internals']
        )

        # PPO takes multiple passes over the on-policy batch.
        # We use a memory that samples random ranges (as opposed to keeping track of indices
        # and e.g. first taking elements 0-15, then 16-31, etc.).
        for i in xrange(self.updates):
            self.logger.debug('Optimising PPO, update = {}'.format(i))
            batch = self.memory.get_batch(self.optimizer_batch_size)

            fetches = [self.optimize, self.loss, self.loss_per_instance, self.kl_divergence, self.entropy]

            feed_dict = {state: batch['states'][name] for name, state in self.state.items()}
            feed_dict.update({action: batch['actions'][name] for name, action in self.action.items()})
            feed_dict[self.reward] = batch['rewards']
            feed_dict[self.terminal] = batch['terminals']
            feed_dict.update({
                internal: batch['internals'][n]
                for n, internal in enumerate(self.internal_inputs)
            })

            loss, loss_per_instance, kl_divergence, entropy = self.session.run(
                fetches=fetches, feed_dict=feed_dict)[1:5]

            self.logger.debug('Loss = {}'.format(loss))
            self.logger.debug('KL divergence = {}'.format(kl_divergence))
            self.logger.debug('Entropy = {}'.format(entropy))

        return loss, loss_per_instance
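# NumPy sketch (illustrative, outside the library) of the multi-pass minibatch scheme used in update():
# the on-policy batch is kept fixed in the replay cache while random minibatches are drawn from it
# for several epochs of SGD.
import numpy as np

batch_size, optimizer_batch_size, epochs = 256, 64, 10
data = np.arange(batch_size)   # stands in for the collected on-policy transitions
updates = (batch_size // optimizer_batch_size) * epochs

for _ in range(updates):
    # Random sampling, mirroring Replay(..., random_sampling=True): minibatches are drawn
    # independently rather than by slicing the batch into fixed index ranges.
    indices = np.random.randint(0, batch_size, size=optimizer_batch_size)
    minibatch = data[indices]
    # ... one SGD step on `minibatch` would happen here ...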
class QDemoModel(QModel):
    """
    Model for deep Q-learning from demonstration. Principal structure similar to double
    deep Q-networks but uses additional loss terms for demo data.
    """

    def __init__(
        self, states, actions, scope, device, saver, summarizer, execution, batching_capacity,
        variable_noise, states_preprocessing, actions_exploration, reward_preprocessing, update_mode,
        memory, optimizer, discount, network, distributions, entropy_regularization,
        target_sync_frequency, target_update_weight, double_q_model, huber_loss,
        expert_margin, supervised_weight, demo_memory_capacity, demo_batch_size
    ):
        if any(action['type'] not in ('bool', 'int') for action in actions.values()):
            raise TensorForceError("Invalid action type, only 'bool' and 'int' are valid!")

        self.expert_margin = expert_margin
        self.supervised_weight = supervised_weight
        self.demo_memory_capacity = demo_memory_capacity
        self.demo_batch_size = demo_batch_size

        super(QDemoModel, self).__init__(
            states=states, actions=actions, scope=scope, device=device, saver=saver,
            summarizer=summarizer, execution=execution, batching_capacity=batching_capacity,
            variable_noise=variable_noise, states_preprocessing=states_preprocessing,
            actions_exploration=actions_exploration, reward_preprocessing=reward_preprocessing,
            update_mode=update_mode, memory=memory, optimizer=optimizer, discount=discount,
            network=network, distributions=distributions, entropy_regularization=entropy_regularization,
            target_sync_frequency=target_sync_frequency, target_update_weight=target_update_weight,
            double_q_model=double_q_model, huber_loss=huber_loss
        )

    def initialize(self, custom_getter):
        super(QDemoModel, self).initialize(custom_getter=custom_getter)

        self.demo_memory = Replay(
            states=self.states_spec,
            internals=self.internals_spec,
            actions=self.actions_spec,
            include_next_states=True,
            capacity=self.demo_memory_capacity,
            scope='demo-replay',
            summary_labels=self.summary_labels
        )

        # Import demonstration experience.
        self.fn_import_demo_experience = tf.make_template(
            name_='import-demo-experience',
            func_=self.tf_import_demo_experience,
            custom_getter_=custom_getter
        )

        # Demonstration loss.
        self.fn_demo_loss = tf.make_template(
            name_='demo-loss',
            func_=self.tf_demo_loss,
            custom_getter_=custom_getter
        )

        # Combined loss.
        self.fn_combined_loss = tf.make_template(
            name_='combined-loss',
            func_=self.tf_combined_loss,
            custom_getter_=custom_getter
        )

        # Demonstration optimization.
        self.fn_demo_optimization = tf.make_template(
            name_='demo-optimization',
            func_=self.tf_demo_optimization,
            custom_getter_=custom_getter
        )

    def tf_initialize(self):
        super(QDemoModel, self).tf_initialize()
        self.demo_memory.initialize()

    def tf_import_demo_experience(self, states, internals, actions, terminal, reward):
        """
        Imports a single experience to the demo memory.
        """
        return self.demo_memory.store(
            states=states,
            internals=internals,
            actions=actions,
            terminal=terminal,
            reward=reward
        )

    def tf_demo_loss(self, states, actions, terminal, reward, internals, update, reference=None):
        """
        Extends the Q-model loss via the DQFD large-margin loss.
        """
        embedding = self.network.apply(x=states, internals=internals, update=update)
        deltas = list()

        for name, action in actions.items():
            distr_params = self.distributions[name].parameterize(x=embedding)
            state_action_value = self.distributions[name].state_action_value(
                distr_params=distr_params, action=action)

            # Create the supervised margin loss.
            # Zero for the action taken, one for all other actions, then multiply by the expert margin.
            if self.actions_spec[name]['type'] == 'bool':
                num_actions = 2
                action = tf.cast(x=action, dtype=util.tf_dtype('int'))
            else:
                num_actions = self.actions_spec[name]['num_actions']

            one_hot = tf.one_hot(indices=action, depth=num_actions)
            ones = tf.ones_like(tensor=one_hot, dtype=tf.float32)
            inverted_one_hot = ones - one_hot

            # max_a [Q(s,a) + l(s,a_E,a)], where l(s,a_E,a) is 0 for the expert action and the margin value otherwise.
            state_action_values = self.distributions[name].state_action_value(distr_params=distr_params)
            state_action_values = state_action_values + inverted_one_hot * self.expert_margin
            supervised_selector = tf.reduce_max(input_tensor=state_action_values, axis=-1)

            # J_E(Q) = max_a [Q(s,a) + l(s,a_E,a)] - Q(s,a_E)
            delta = supervised_selector - state_action_value

            action_size = util.prod(self.actions_spec[name]['shape'])
            delta = tf.reshape(tensor=delta, shape=(-1, action_size))
            deltas.append(delta)

        loss_per_instance = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1)
        loss_per_instance = tf.square(x=loss_per_instance)

        return tf.reduce_mean(input_tensor=loss_per_instance, axis=0)

    def tf_combined_loss(self, states, internals, actions, terminal, reward, next_states, next_internals, update, reference=None):
        """
        Combines the Q-loss and the demo loss.
        """
        q_model_loss = self.fn_loss(
            states=states, internals=internals, actions=actions, terminal=terminal, reward=reward,
            next_states=next_states, next_internals=next_internals, update=update, reference=reference
        )

        demo_loss = self.fn_demo_loss(
            states=states, internals=internals, actions=actions, terminal=terminal, reward=reward,
            update=update, reference=reference
        )

        return q_model_loss + self.supervised_weight * demo_loss

    def tf_demo_optimization(self, states, internals, actions, terminal, reward, next_states, next_internals):
        arguments = dict(
            time=self.global_timestep,
            variables=self.get_variables(),
            arguments=dict(
                states=states, internals=internals, actions=actions, terminal=terminal, reward=reward,
                next_states=next_states, next_internals=next_internals, update=tf.constant(value=True)
            ),
            fn_loss=self.fn_combined_loss
        )
        demo_optimization = self.optimizer.minimize(**arguments)

        arguments = self.target_optimizer_arguments()
        target_optimization = self.target_optimizer.minimize(**arguments)

        return tf.group(demo_optimization, target_optimization)

    def tf_optimization(self, states, internals, actions, terminal, reward, next_states=None, next_internals=None):
        optimization = super(QDemoModel, self).tf_optimization(
            states=states, internals=internals, actions=actions, reward=reward, terminal=terminal,
            next_states=next_states, next_internals=next_internals
        )

        demo_batch = self.demo_memory.retrieve_timesteps(n=self.demo_batch_size)
        demo_optimization = self.fn_demo_optimization(**demo_batch)

        return tf.group(optimization, demo_optimization)

    def create_operations(self, states, internals, actions, terminal, reward, deterministic, independent):
        # Import demo experience operation.
        self.import_demo_experience_output = self.fn_import_demo_experience(
            states=states, internals=internals, actions=actions, terminal=terminal, reward=reward
        )

        super(QDemoModel, self).create_operations(
            states=states, internals=internals, actions=actions, terminal=terminal, reward=reward,
            deterministic=deterministic, independent=independent
        )

        # Demo optimization operation.
        demo_batch = self.demo_memory.retrieve_timesteps(n=self.demo_batch_size)
        self.demo_optimization_output = self.fn_demo_optimization(**demo_batch)

    def get_variables(self, include_submodules=False, include_nontrainable=False):
        """
        Returns the TensorFlow variables used by the model.

        Returns:
            List of variables.
        """
        model_variables = super(QDemoModel, self).get_variables(
            include_submodules=include_submodules,
            include_nontrainable=include_nontrainable
        )

        if include_nontrainable:
            demo_memory_variables = self.demo_memory.get_variables()
            model_variables += demo_memory_variables

        return model_variables

    def get_summaries(self):
        model_summaries = super(QDemoModel, self).get_summaries()
        demo_memory_summaries = self.demo_memory.get_summaries()
        return model_summaries + demo_memory_summaries

    def import_demo_experience(self, states, internals, actions, terminal, reward):
        """
        Stores demonstrations in the demo memory.
        """
        fetches = self.import_demo_experience_output
        feed_dict = self.get_feed_dict(
            states=states, internals=internals, actions=actions, terminal=terminal, reward=reward
        )
        self.monitored_session.run(fetches=fetches, feed_dict=feed_dict)

    def demo_update(self):
        """
        Performs a demonstration update by calling the demo optimization operation.
        Note that the batch data does not have to be fetched from the demo memory here,
        as this is now part of the TensorFlow operations of the demo update.
        """
        fetches = self.demo_optimization_output
        self.monitored_session.run(fetches=fetches)
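# NumPy illustration (not library code) of the large-margin term computed in tf_demo_loss above:
# J_E(Q) = max_a [Q(s,a) + l(a_E, a)] - Q(s, a_E), where l is 0 for the expert action and the margin
# otherwise. In the model this per-action delta is averaged, squared, and weighted by supervised_weight.
import numpy as np

def large_margin_term(q_values, expert_action, margin=0.5):
    penalties = np.full_like(q_values, margin)
    penalties[expert_action] = 0.0
    return np.max(q_values + penalties) - q_values[expert_action]

q = np.array([0.5, 1.2, 0.4])
print(large_margin_term(q, expert_action=1))  # 0.0: expert Q-value already exceeds all others by >= margin
print(large_margin_term(q, expert_action=2))  # 1.3 = (1.2 + 0.5) - 0.4: non-dominant expert action is penalized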
class PPOModel(PolicyGradientModel):
    allows_discrete_actions = True
    allows_continuous_actions = True

    default_config = dict(
        entropy_penalty=0.01,
        loss_clipping=0.2,          # Trust region clipping
        epochs=10,                  # Number of training epochs for SGD
        optimizer_batch_size=128,   # Batch size for optimizer
        random_sampling=True        # Sampling strategy for replay memory
    )

    def __init__(self, config):
        config.default(PPOModel.default_config)
        super(PPOModel, self).__init__(config)
        self.optimizer_batch_size = config.optimizer_batch_size
        # Use replay memory so memory logic can be used to sample batches
        if self.optimizer_batch_size > config.batch_size:
            raise Exception("optimizer_batch_size > batch_size ({}, {})".format(
                self.optimizer_batch_size, config.batch_size))
        self.updates = int(config.batch_size / self.optimizer_batch_size) * config.epochs
        self.memory = Replay(config.batch_size, config.states, config.actions, config.random_sampling)

    def create_tf_operations(self, config):
        """
        Creates PPO training operations, i.e. the SGD update based on the trust region loss.

        :return:
        """
        super(PPOModel, self).create_tf_operations(config)

        with tf.variable_scope('update'):
            prob_ratios = list()
            entropy_penalties = list()
            # For diagnostics
            kl_divergences = list()
            entropies = list()
            self.distribution_tensors = dict()
            self.prev_distribution_tensors = dict()

            for name, action in self.action.items():
                shape_size = util.prod(config.actions[name].shape)
                distribution = self.distribution[name]
                fixed_distribution = distribution.__class__.from_tensors(
                    tensors=[tf.stop_gradient(x) for x in distribution.get_tensors()],
                    deterministic=self.deterministic
                )

                # Standard policy gradient log likelihood computation
                log_prob = distribution.log_probability(action=action)
                fixed_log_prob = fixed_distribution.log_probability(action=action)
                log_prob_diff = log_prob - fixed_log_prob
                prob_ratio = tf.exp(x=log_prob_diff)
                prob_ratio = tf.reshape(tensor=prob_ratio, shape=(-1, shape_size))
                prob_ratios.append(prob_ratio)

                entropy = distribution.entropy()
                entropy_penalty = -config.entropy_penalty * entropy
                entropy_penalty = tf.reshape(tensor=entropy_penalty, shape=(-1, shape_size))
                entropy_penalties.append(entropy_penalty)

                self.distribution_tensors[name] = list(distribution.get_tensors())
                prev_distribution = list(
                    tf.placeholder(dtype=tf.float32, shape=util.shape(tensor, unknown=None))
                    for tensor in distribution.get_tensors()
                )
                self.prev_distribution_tensors[name] = prev_distribution
                prev_distribution = distribution.from_tensors(
                    tensors=prev_distribution,
                    deterministic=self.deterministic
                )

                kl_divergence = prev_distribution.kl_divergence(other=distribution)
                kl_divergence = tf.reshape(tensor=kl_divergence, shape=(-1, shape_size))
                kl_divergences.append(kl_divergence)

                entropy = tf.reshape(tensor=entropy, shape=(-1, shape_size))
                entropies.append(entropy)

            # The surrogate loss in PPO is the minimum of the clipped loss and
            # target advantage * prob_ratio, which is the CPO loss.
            # Presentation on conservative policy iteration:
            # https://www.cs.cmu.edu/~jcl/presentation/RL/RL.ps
            prob_ratio = tf.reduce_mean(input_tensor=tf.concat(values=prob_ratios, axis=1), axis=1)
            tf.summary.histogram('prob_ratio', prob_ratio)
            tf.summary.scalar('mean_prob_ratio', tf.reduce_mean(input_tensor=prob_ratio, axis=0))

            clipped_prob_ratio = tf.clip_by_value(prob_ratio, 1.0 - config.loss_clipping, 1.0 + config.loss_clipping)
            self.loss_per_instance = -tf.minimum(
                x=(prob_ratio * self.reward),
                y=(clipped_prob_ratio * self.reward)
            )
            self.surrogate_loss = tf.reduce_mean(
                input_tensor=self.loss_per_instance,
                axis=0,
                name='surrogate_loss'
            )
            tf.losses.add_loss(self.surrogate_loss)

            # Mean over actions, mean over batch
            entropy_penalty = tf.reduce_mean(input_tensor=tf.concat(values=entropy_penalties, axis=1), axis=1)
            self.entropy_penalty = tf.reduce_mean(input_tensor=entropy_penalty, axis=0, name='entropy_penalty')
            tf.losses.add_loss(self.entropy_penalty)

            kl_divergence = tf.reduce_mean(input_tensor=tf.concat(values=kl_divergences, axis=1), axis=1)
            self.kl_divergence = tf.reduce_mean(input_tensor=kl_divergence, axis=0)
            tf.summary.scalar('kl_divergence', self.kl_divergence)

            entropy = tf.reduce_mean(input_tensor=tf.concat(values=entropies, axis=1), axis=1)
            self.entropy = tf.reduce_mean(input_tensor=entropy, axis=0)
            tf.summary.scalar('entropy', self.entropy)

    def update(self, batch):
        """
        Computes the update for one batch of experiences using generalized advantage estimation
        and the trust region update based on SGD on the clipped loss.

        :param batch: On-policy batch of experiences.
        :return:
        """
        batch['rewards'], discounted_rewards = self.reward_estimation(
            states=batch['states'],
            rewards=batch['rewards'],
            terminals=batch['terminals']
        )
        if self.baseline:
            self.baseline.update(states=batch['states'], returns=discounted_rewards)

        # Set memory contents to batch contents
        self.memory.set_memory(
            states=batch['states'],
            actions=batch['actions'],
            rewards=batch['rewards'],
            terminals=batch['terminals'],
            internals=batch['internals']
        )

        # PPO takes multiple passes over the on-policy batch.
        # We use a memory that samples random ranges (as opposed to keeping track of indices
        # and e.g. first taking elements 0-15, then 16-31, etc.).
        for i in xrange(self.updates):
            self.logger.debug('Optimising PPO, update = {}'.format(i))
            batch = self.memory.get_batch(self.optimizer_batch_size)

            feed_dict = {state: batch['states'][name] for name, state in self.state.items()}
            feed_dict.update({action: batch['actions'][name] for name, action in self.action.items()})
            feed_dict[self.reward] = batch['rewards']
            feed_dict[self.terminal] = batch['terminals']
            feed_dict.update({
                internal: batch['internals'][n]
                for n, internal in enumerate(self.internal_inputs)
            })

            if i == 0:
                # First update: also fetch the previous distribution tensors.
                assert self.updates >= 2
                assert 'optimize' not in self.distribution_tensors
                fetches = dict(optimize=self.optimize)
                fetches.update(self.distribution_tensors)
                prev_distribution_tensors = self.session.run(fetches=fetches, feed_dict=feed_dict)
                prev_distribution_tensors.pop('optimize')

            elif i == self.updates - 1:
                # Last update: fetch return and diagnostics values.
                fetches = [self.optimize, self.loss, self.loss_per_instance, self.kl_divergence, self.entropy]

                prev_distribution_tensors = {
                    placeholder: tensor
                    for name, placeholders in self.prev_distribution_tensors.items()
                    for placeholder, tensor in zip(placeholders, prev_distribution_tensors[name])
                }
                feed_dict.update(prev_distribution_tensors)

                with SummarySessionWrapper(self, fetches, feed_dict) as session:
                    _, loss, loss_per_instance, kl_divergence, entropy = session.run()

            else:
                # Otherwise just optimize.
                self.session.run(fetches=self.optimize, feed_dict=feed_dict)

        return loss, loss_per_instance
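# NumPy illustration (not library code) of the clipped surrogate objective built in create_tf_operations
# above; at that point the reward placeholder is fed with the processed reward/advantage estimates.
import numpy as np

def clipped_surrogate_loss(prob_ratio, advantage, epsilon=0.2):
    # Per-instance PPO loss: -min(r * A, clip(r, 1 - eps, 1 + eps) * A)
    clipped_ratio = np.clip(prob_ratio, 1.0 - epsilon, 1.0 + epsilon)
    return -np.minimum(prob_ratio * advantage, clipped_ratio * advantage)

ratios = np.array([0.5, 1.0, 1.5])   # pi_new(a|s) / pi_old(a|s)
advantages = np.ones(3)              # positive advantages
print(clipped_surrogate_loss(ratios, advantages))  # [-0.5 -1.  -1.2]: the gain from raising the ratio is capped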
class DQFDAgent(MemoryAgent):
    """
    Deep Q-learning from demonstration (DQFD) agent
    ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
    This agent uses DQN to pre-train from demonstration data.

    Configuration:

    Each agent requires the following configuration parameters:

    * `states`: dict containing one or more state definitions.
    * `actions`: dict containing one or more action definitions.
    * `preprocessing`: dict or list containing state preprocessing configuration.
    * `exploration`: dict containing action exploration configuration.

    Each model requires the following configuration parameters:

    * `discount`: float of discount factor (gamma).
    * `learning_rate`: float of learning rate (alpha).
    * `optimizer`: string of optimizer to use (e.g. 'adam').
    * `device`: string of TensorFlow device name.
    * `tf_summary`: string directory to write TensorFlow summaries. Default None.
    * `tf_summary_level`: int indicating which TensorFlow summaries to create.
    * `tf_summary_interval`: int number of calls to get_action until writing TensorFlow summaries on update.
    * `log_level`: string containing log level (e.g. 'info').
    * `distributed`: boolean indicating whether to use distributed TensorFlow.
    * `global_model`: global model.
    * `session`: session to use.

    The `DQFDAgent` class additionally requires the following parameters:

    * `batch_size`: integer of the batch size.
    * `memory_capacity`: integer of maximum experiences to store.
    * `memory`: string indicating memory type ('replay' or 'prioritized_replay').
    * `min_replay_size`: integer of minimum replay size before the first update.
    * `update_rate`: float of the update rate (e.g. 0.25 = every 4 steps).
    * `target_network_update_rate`: float of target network update rate (e.g. 0.01 = every 100 steps).
    * `use_target_network`: boolean indicating whether to use a target network.
    * `update_repeat`: integer of how many times to repeat an update.
    * `update_target_weight`: float of update target weight (tau parameter).
    * `demo_sampling_ratio`: float, ratio of expert data used at runtime to train from.
    * `supervised_weight`: float, weight of large margin classifier loss.
    * `expert_margin`: float of difference in Q-values between expert action and other actions enforced
      by the large margin function.
    * `clip_loss`: float, if not 0, uses the Huber loss with clip_loss as the linear bound.
    """

    def __init__(
        self, states_spec, actions_spec, network_spec, device=None, scope='dqfd', saver_spec=None,
        summary_spec=None, distributed_spec=None, optimizer=None, discount=0.99, normalize_rewards=False,
        variable_noise=None, distributions_spec=None, entropy_regularization=None,
        target_sync_frequency=10000, target_update_weight=1.0, huber_loss=None, preprocessing=None,
        exploration=None, reward_preprocessing=None, batched_observe=1000, batch_size=32, memory=None,
        first_update=10000, update_frequency=4, repeat_update=1, expert_margin=0.5, supervised_weight=0.1,
        demo_memory_capacity=10000, demo_sampling_ratio=0.2
    ):
        """
        Deep Q-learning from demonstration (DQFD) agent
        ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
        This agent uses DQN to pre-train from demonstration data in combination with a supervised loss.

        Args:
            states_spec:
            actions_spec:
            network_spec:
            device:
            scope:
            saver_spec:
            summary_spec:
            distributed_spec:
            optimizer:
            discount:
            normalize_rewards:
            variable_noise:
            distributions_spec:
            entropy_regularization:
            target_sync_frequency:
            target_update_weight:
            huber_loss:
            preprocessing:
            exploration:
            reward_preprocessing:
            batched_observe:
            batch_size:
            memory:
            first_update:
            update_frequency:
            repeat_update:
            expert_margin:
            supervised_weight:
            demo_memory_capacity:
            demo_sampling_ratio:
        """
        if network_spec is None:
            raise TensorForceError("No network_spec provided.")

        if optimizer is None:
            self.optimizer = dict(type='adam', learning_rate=1e-3)
        else:
            self.optimizer = optimizer

        if memory is None:
            memory = dict(type='replay', capacity=100000)
        else:
            self.memory = memory

        self.network_spec = network_spec
        self.device = device
        self.scope = scope
        self.saver_spec = saver_spec
        self.summary_spec = summary_spec
        self.distributed_spec = distributed_spec
        self.discount = discount
        self.normalize_rewards = normalize_rewards
        self.variable_noise = variable_noise
        self.distributions_spec = distributions_spec
        self.entropy_regularization = entropy_regularization
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight
        self.huber_loss = huber_loss

        # DQFD always uses double DQN, which is a required key for a q-model.
        self.double_q_model = True
        self.demo_memory_capacity = demo_memory_capacity
        self.expert_margin = expert_margin
        self.supervised_weight = supervised_weight

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs online training samples:
        # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
            'demo_batch_size is positive. (Calculated {} based on current' \
            ' parameters)'.format(self.demo_batch_size)

        super(DQFDAgent, self).__init__(
            states_spec=states_spec, actions_spec=actions_spec, preprocessing=preprocessing,
            exploration=exploration, reward_preprocessing=reward_preprocessing,
            batched_observe=batched_observe, batch_size=batch_size, memory=memory,
            first_update=first_update, update_frequency=update_frequency, repeat_update=repeat_update
        )

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop.
        self.demo_memory = Replay(self.states_spec, self.actions_spec, self.demo_memory_capacity)

    def initialize_model(self, states_spec, actions_spec):
        return QDemoModel(
            states_spec=states_spec,
            actions_spec=actions_spec,
            network_spec=self.network_spec,
            device=self.device,
            scope=self.scope,
            saver_spec=self.saver_spec,
            summary_spec=self.summary_spec,
            distributed_spec=self.distributed_spec,
            optimizer=self.optimizer,
            discount=self.discount,
            normalize_rewards=self.normalize_rewards,
            variable_noise=self.variable_noise,
            distributions_spec=self.distributions_spec,
            entropy_regularization=self.entropy_regularization,
            target_sync_frequency=self.target_sync_frequency,
            target_update_weight=self.target_update_weight,
            double_q_model=self.double_q_model,
            huber_loss=self.huber_loss,
            # TEMP: Random sampling fix
            random_sampling_fix=True,
            expert_margin=self.expert_margin,
            supervised_weight=self.supervised_weight
        )

    def observe(self, reward, terminal):
        """
        Adds observations, updates via sampling from memories according to update rate.
        DQFD samples from the online replay memory and the demo memory with the fractions
        controlled by a hyperparameter p called the 'expert sampling ratio'.

        Args:
            reward:
            terminal:
        """
        super(DQFDAgent, self).observe(reward=reward, terminal=terminal)

        if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0:
            for _ in xrange(self.repeat_update):
                batch = self.demo_memory.get_batch(self.demo_batch_size)
                self.model.demonstration_update(batch=batch)

    def import_demonstrations(self, demonstrations):
        """
        Imports demonstrations, i.e. expert observations. Note that for large numbers of observations,
        set_demonstrations is more appropriate, which directly sets memory contents to an array
        and expects a different layout.

        Args:
            demonstrations: List of observation dicts
        """
        for observation in demonstrations:
            if self.unique_state:
                state = dict(state=observation['states'])
            else:
                state = observation['states']
            if self.unique_action:
                action = dict(action=observation['actions'])
            else:
                action = observation['actions']

            self.demo_memory.add_observation(
                states=state,
                internals=observation['internals'],
                actions=action,
                terminal=observation['terminal'],
                reward=observation['reward']
            )

    def set_demonstrations(self, batch):
        """
        Sets all demonstrations from batch data. Expects a dict wherein each value contains an array
        of all states, actions, rewards, terminals and internals respectively.

        Args:
            batch:
        """
        self.demo_memory.set_memory(
            states=batch['states'],
            internals=batch['internals'],
            actions=batch['actions'],
            terminal=batch['terminal'],
            reward=batch['reward']
        )

    def pretrain(self, steps):
        """
        Computes pretrain updates.

        Args:
            steps: Number of updates to execute.
        """
        for _ in xrange(steps):
            # Sample from demo memory.
            batch = self.demo_memory.get_batch(batch_size=self.batch_size, next_states=True)

            # Update using both double Q-learning and the supervised large-margin loss.
            self.model.demonstration_update(batch)
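# Hypothetical usage sketch of the pre-training workflow above; `expert_trajectory` and `agent` are
# assumed to exist, and the observation layout mirrors what import_demonstrations expects.
demonstrations = [
    dict(states=state, internals=[], actions=action, terminal=False, reward=1.0)
    for state, action in expert_trajectory
]
agent.import_demonstrations(demonstrations)  # fill the demonstration memory
agent.pretrain(steps=10000)                  # supervised + Q-learning updates before any environment interaction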
class DQFDAgent(MemoryAgent):
    """
    Deep Q-learning from demonstration (DQFD) agent
    ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
    This agent uses DQN to pre-train from demonstration data via an additional supervised loss term.
    """

    def __init__(
        self,
        states_spec,
        actions_spec,
        batched_observe=1000,
        scope='dqfd',
        # parameters specific to LearningAgents
        summary_spec=None, network_spec=None, device=None, session_config=None, saver_spec=None,
        distributed_spec=None, optimizer=None, discount=0.99, variable_noise=None,
        states_preprocessing_spec=None, explorations_spec=None, reward_preprocessing_spec=None,
        distributions_spec=None, entropy_regularization=None,
        # parameters specific to MemoryAgents
        batch_size=32, memory=None, first_update=10000, update_frequency=4, repeat_update=1,
        # parameters specific to DQFD agents
        target_sync_frequency=10000, target_update_weight=1.0, huber_loss=None, expert_margin=0.5,
        supervised_weight=0.1, demo_memory_capacity=10000, demo_sampling_ratio=0.2
    ):
        """
        Deep Q-learning from demonstration (DQFD) agent
        ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
        This agent uses DQN to pre-train from demonstration data in combination with a supervised loss.

        Args:
            target_sync_frequency: Interval between optimization calls synchronizing the target network.
            target_update_weight: Update weight, 1.0 meaning a full assignment to the target network
                from the training network.
            huber_loss: Optional float specifying Huber-loss clipping.
            expert_margin: Positive float specifying the enforced supervised margin between the expert
                action's Q-value and other Q-values.
            supervised_weight: Weight of the supervised loss term.
            demo_memory_capacity: Int describing the capacity of the expert demonstration memory.
            demo_sampling_ratio: Runtime sampling ratio of expert data.
        """
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight
        self.huber_loss = huber_loss
        self.expert_margin = expert_margin
        self.supervised_weight = supervised_weight

        super(DQFDAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            batched_observe=batched_observe,
            scope=scope,
            # parameters specific to LearningAgent
            summary_spec=summary_spec, network_spec=network_spec, discount=discount, device=device,
            session_config=session_config, saver_spec=saver_spec, distributed_spec=distributed_spec,
            optimizer=optimizer, variable_noise=variable_noise,
            states_preprocessing_spec=states_preprocessing_spec, explorations_spec=explorations_spec,
            reward_preprocessing_spec=reward_preprocessing_spec, distributions_spec=distributions_spec,
            entropy_regularization=entropy_regularization,
            # parameters specific to MemoryAgents
            batch_size=batch_size, memory=memory, first_update=first_update,
            update_frequency=update_frequency, repeat_update=repeat_update
        )

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs online training samples:
        # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
        self.demo_memory_capacity = demo_memory_capacity
        self.demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
            'demo_batch_size is positive. (Calculated {} based on current' \
            ' parameters)'.format(self.demo_batch_size)

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop.
        self.demo_memory = Replay(self.states_spec, self.actions_spec, self.demo_memory_capacity)

    def initialize_model(self):
        return QDemoModel(
            states_spec=self.states_spec,
            actions_spec=self.actions_spec,
            network_spec=self.network_spec,
            device=self.device,
            session_config=self.session_config,
            scope=self.scope,
            saver_spec=self.saver_spec,
            summary_spec=self.summary_spec,
            distributed_spec=self.distributed_spec,
            optimizer=self.optimizer,
            discount=self.discount,
            variable_noise=self.variable_noise,
            states_preprocessing_spec=self.states_preprocessing_spec,
            explorations_spec=self.explorations_spec,
            reward_preprocessing_spec=self.reward_preprocessing_spec,
            distributions_spec=self.distributions_spec,
            entropy_regularization=self.entropy_regularization,
            target_sync_frequency=self.target_sync_frequency,
            target_update_weight=self.target_update_weight,
            # DQFD always uses double DQN, which is a required key for a q-model.
            double_q_model=True,
            huber_loss=self.huber_loss,
            # TEMP: Random sampling fix
            random_sampling_fix=True,
            expert_margin=self.expert_margin,
            supervised_weight=self.supervised_weight
        )

    def observe(self, reward, terminal):
        """
        Adds observations, updates via sampling from memories according to update rate.
        DQFD samples from the online replay memory and the demo memory with the fractions
        controlled by a hyperparameter p called the 'expert sampling ratio'.
        """
        super(DQFDAgent, self).observe(reward=reward, terminal=terminal)

        if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0:
            for _ in xrange(self.repeat_update):
                batch = self.demo_memory.get_batch(batch_size=self.demo_batch_size, next_states=True)
                self.model.demonstration_update(
                    states={
                        name: np.stack((batch['states'][name], batch['next_states'][name]))
                        for name in batch['states']
                    },
                    internals=batch['internals'],
                    actions=batch['actions'],
                    terminal=batch['terminal'],
                    reward=batch['reward']
                )

    def import_demonstrations(self, demonstrations):
        """
        Imports demonstrations, i.e. expert observations. Note that for large numbers of observations,
        set_demonstrations is more appropriate, which directly sets memory contents to an array
        and expects a different layout.

        Args:
            demonstrations: List of observation dicts
        """
        for observation in demonstrations:
            if self.unique_state:
                state = dict(state=observation['states'])
            else:
                state = observation['states']
            if self.unique_action:
                action = dict(action=observation['actions'])
            else:
                action = observation['actions']

            self.demo_memory.add_observation(
                states=state,
                internals=observation['internals'],
                actions=action,
                terminal=observation['terminal'],
                reward=observation['reward']
            )

    def set_demonstrations(self, batch):
        """
        Sets all demonstrations from batch data. Expects a dict wherein each value contains an array
        of all states, actions, rewards, terminals and internals respectively.

        Args:
            batch:
        """
        self.demo_memory.set_memory(
            states=batch['states'],
            internals=batch['internals'],
            actions=batch['actions'],
            terminal=batch['terminal'],
            reward=batch['reward']
        )

    def pretrain(self, steps):
        """
        Computes pre-train updates.

        Args:
            steps: Number of updates to execute.
        """
        for _ in xrange(steps):
            # Sample from demo memory.
            batch = self.demo_memory.get_batch(batch_size=self.batch_size, next_states=True)

            # Update using both double Q-learning and the supervised large-margin loss.
            self.model.demonstration_update(
                states={
                    name: np.stack((batch['states'][name], batch['next_states'][name]))
                    for name in batch['states']
                },
                internals=batch['internals'],
                actions=batch['actions'],
                terminal=batch['terminal'],
                reward=batch['reward']
            )
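# Illustration of the np.stack layout used above (array shapes are invented): stacking current and
# next states adds a leading axis of size 2 that the model splits again internally.
import numpy as np

batch_states = np.zeros((8, 4))       # 8 sampled demo transitions with a 4-dimensional state
batch_next_states = np.ones((8, 4))
stacked = np.stack((batch_states, batch_next_states))
print(stacked.shape)  # (2, 8, 4): index 0 holds states, index 1 holds next states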
class DQFDAgent(MemoryAgent):
    """
    Deep Q-learning from demonstration (DQFD) agent
    ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)).
    This agent uses DQN to pre-train from demonstration data.

    Configuration:

    Each agent requires the following configuration parameters:

    * `states`: dict containing one or more state definitions.
    * `actions`: dict containing one or more action definitions.
    * `preprocessing`: dict or list containing state preprocessing configuration.
    * `exploration`: dict containing action exploration configuration.

    Each model requires the following configuration parameters:

    * `discount`: float of discount factor (gamma).
    * `learning_rate`: float of learning rate (alpha).
    * `optimizer`: string of optimizer to use (e.g. 'adam').
    * `device`: string of TensorFlow device name.
    * `tf_summary`: string directory to write TensorFlow summaries. Default None.
    * `tf_summary_level`: int indicating which TensorFlow summaries to create.
    * `tf_summary_interval`: int number of calls to get_action until writing TensorFlow summaries on update.
    * `log_level`: string containing log level (e.g. 'info').
    * `distributed`: boolean indicating whether to use distributed TensorFlow.
    * `global_model`: global model.
    * `session`: session to use.

    The `DQFDAgent` class additionally requires the following parameters:

    * `batch_size`: integer of the batch size.
    * `memory_capacity`: integer of maximum experiences to store.
    * `memory`: string indicating memory type ('replay' or 'prioritized_replay').
    * `min_replay_size`: integer of minimum replay size before the first update.
    * `update_rate`: float of the update rate (e.g. 0.25 = every 4 steps).
    * `target_network_update_rate`: float of target network update rate (e.g. 0.01 = every 100 steps).
    * `use_target_network`: boolean indicating whether to use a target network.
    * `update_repeat`: integer of how many times to repeat an update.
    * `update_target_weight`: float of update target weight (tau parameter).
    * `demo_sampling_ratio`: float, ratio of expert data used at runtime to train from.
    * `supervised_weight`: float, weight of large margin classifier loss.
    * `expert_margin`: float of difference in Q-values between expert action and other actions enforced
      by the large margin function.
    * `clip_loss`: float, if not 0, uses the Huber loss with clip_loss as the linear bound.
    """

    default_config = dict(
        # Agent
        preprocessing=None,
        exploration=None,
        reward_preprocessing=None,
        # Model
        optimizer=dict(
            type='adam',
            learning_rate=1e-3
        ),
        discount=0.99,
        normalize_rewards=False,
        variable_noise=None,  # not documented!!!
        # DistributionModel
        distributions=None,  # not documented!!!
        entropy_regularization=None,
        # QModel
        target_sync_frequency=10000,  # not documented!!!
        target_update_weight=1.0,  # not documented!!!
        huber_loss=0.0,  # not documented!!!
        # Logging
        log_level='info',
        model_directory=None,
        save_frequency=600,  # TensorFlow default
        summary_labels=['total-loss'],
        summary_frequency=120,  # TensorFlow default
        # TensorFlow distributed configuration
        cluster_spec=None,
        parameter_server=False,
        task_index=0,
        device=None,
        local_model=False,
        replica_model=False,
        scope='dqfd'
    )

    def __init__(self, states_spec, actions_spec, network_spec, config):
        self.network_spec = network_spec
        config = config.copy()
        config.default(DQFDAgent.default_config)

        # DQFD always uses double DQN, which is a required key for a q-model.
        config.obligatory(double_dqn=True)

        self.target_update_frequency = config.target_update_frequency
        self.demo_memory_capacity = config.demo_memory_capacity

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs online training samples:
        # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(config.demo_sampling_ratio * config.batch_size / (1.0 - config.demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
            'demo_batch_size is positive. (Calculated {} based on current' \
            ' parameters)'.format(self.demo_batch_size)

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop.
        self.demo_memory = Replay(self.demo_memory_capacity, self.states_spec, self.actions_spec)

        super(DQFDAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            config=config
        )

    def observe(self, reward, terminal):
        """
        Adds observations, updates via sampling from memories according to update rate.
        DQFD samples from the online replay memory and the demo memory with the fractions
        controlled by a hyperparameter p called the 'expert sampling ratio'.

        Args:
            reward:
            terminal:
        """
        super(DQFDAgent, self).observe(reward=reward, terminal=terminal)

        if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0:
            for _ in xrange(self.repeat_update):
                batch = self.demo_memory.get_batch(self.demo_batch_size)
                self.model.demonstration_update(batch=batch)

    def import_demonstrations(self, demonstrations):
        """
        Imports demonstrations, i.e. expert observations. Note that for large numbers of observations,
        set_demonstrations is more appropriate, which directly sets memory contents to an array
        and expects a different layout.

        Args:
            demonstrations: List of observation dicts
        """
        for observation in demonstrations:
            if self.unique_state:
                state = dict(state=observation['states'])
            else:
                state = observation['states']
            if self.unique_action:
                action = dict(action=observation['actions'])
            else:
                action = observation['actions']

            self.demo_memory.add_observation(
                states=state,
                internals=observation['internal'],
                actions=action,
                terminal=observation['terminal'],
                reward=observation['reward']
            )

    def set_demonstrations(self, batch):
        """
        Sets all demonstrations from batch data. Expects a dict wherein each value contains an array
        of all states, actions, rewards, terminals and internals respectively.

        Args:
            batch:
        """
        self.demo_memory.set_memory(
            states=batch['states'],
            internals=batch['internals'],
            actions=batch['actions'],
            terminal=batch['terminal'],
            reward=batch['reward']
        )

    def initialize_model(self, states_spec, actions_spec, config):
        return QDemoModel(
            states_spec=states_spec,
            actions_spec=actions_spec,
            network_spec=self.network_spec,
            config=config
        )

    def pretrain(self, steps):
        """
        Computes pretrain updates.

        Args:
            steps: Number of updates to execute.
        """
        for _ in xrange(steps):
            # Sample from demo memory.
            batch = self.demo_memory.get_batch(batch_size=self.batch_size, next_states=True)

            # Update using both double Q-learning and the supervised large-margin loss.
            self.model.demonstration_update(batch)
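# Hypothetical configuration sketch for this config-driven agent version. The Configuration wrapper
# and the chosen values are illustrative only; the key names follow default_config above plus the
# DQFD-specific sampling keys read in __init__.
config = Configuration(
    batch_size=32,
    first_update=5000,
    update_frequency=4,
    target_update_frequency=10000,
    demo_memory_capacity=10000,
    demo_sampling_ratio=0.2,   # p: fraction of expert samples per update
    expert_margin=0.5,
    supervised_weight=0.1
)
agent = DQFDAgent(states_spec=states_spec, actions_spec=actions_spec, network_spec=network_spec, config=config)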
class DQFDAgent(MemoryAgent):
    """
    Deep Q-learning from demonstration (DQFD) agent
    ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). This agent uses DQN to pre-train
    from demonstration data via an additional supervised loss term.
    """

    def __init__(
        self,
        states_spec,
        actions_spec,
        network_spec,
        device=None,
        scope='dqfd',
        saver_spec=None,
        summary_spec=None,
        distributed_spec=None,
        optimizer=None,
        discount=0.99,
        normalize_rewards=False,
        variable_noise=None,
        distributions_spec=None,
        entropy_regularization=None,
        target_sync_frequency=10000,
        target_update_weight=1.0,
        huber_loss=None,
        preprocessing=None,
        exploration=None,
        reward_preprocessing=None,
        batched_observe=1000,
        batch_size=32,
        memory=None,
        first_update=10000,
        update_frequency=4,
        repeat_update=1,
        expert_margin=0.5,
        supervised_weight=0.1,
        demo_memory_capacity=10000,
        demo_sampling_ratio=0.2
    ):
        """
        Deep Q-learning from demonstration (DQFD) agent
        ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). This agent uses DQN to
        pre-train from demonstration data in combination with a supervised loss.

        Args:
            states_spec: Dict containing at least one state definition. In the case of a single state,
                keys `shape` and `type` are necessary. For multiple states, pass a dict of dicts where
                each state is a dict itself with a unique name as its key.
            actions_spec: Dict containing at least one action definition. Actions have types and either
                `num_actions` for discrete actions or a `shape` for continuous actions. Consult documentation
                and tests for more.
            network_spec: List of layers specifying a neural network via layer types, sizes and optional
                arguments such as activation or regularisation. Full examples are in the examples/configs folder.
            device: Device string specifying model device.
            scope: TensorFlow scope, defaults to agent name (e.g. `dqfd`).
            saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints are saved.
                Use either `seconds` or `steps` to specify how often the model should be saved. The `load` flag
                specifies if a model is initially loaded (set to True) from a file `file`.
            summary_spec: Dict specifying summaries for TensorBoard. Requires a `directory` to store summaries,
                `steps` or `seconds` to specify how often to save summaries, and a list of `labels` to indicate
                which values to export, e.g. `losses`, `variables`. Consult neural network class and model for
                all available labels.
            distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and `replica_model`
                Boolean flags to indicate workers and parameter servers. Use a `cluster_spec` key to pass a
                TensorFlow cluster spec.
            optimizer: Dict specifying optimizer type and its optional parameters, typically a `learning_rate`.
                Available optimizer types include standard TensorFlow optimizers, `natural_gradient`, and
                `evolutionary`. Consult the optimizer test or example configurations for more.
            discount: Float specifying reward discount factor.
            normalize_rewards: Boolean flag specifying whether to normalize rewards, default False.
            variable_noise: Experimental optional parameter specifying variable noise (NoisyNet).
            distributions_spec: Optional dict specifying action distributions to override default distribution
                choices. Must match action names.
            entropy_regularization: Optional positive float specifying an entropy regularization value.
            target_sync_frequency: Interval between optimization calls synchronizing the target network.
            target_update_weight: Update weight, 1.0 meaning a full assignment to target network from training network.
            huber_loss: Optional float specifying Huber loss clipping.
            preprocessing: Optional list of preprocessors (e.g. `image_resize`, `grayscale`) to apply to state.
                Each preprocessor is a dict containing a type and optional necessary arguments.
            exploration: Optional dict specifying exploration type (epsilon greedy strategies or Gaussian noise)
                and arguments.
            reward_preprocessing: Optional dict specifying reward preprocessor using same syntax as state preprocessing.
            batched_observe: Optional int specifying how many observe calls are batched into one session run.
                Without batching, throughput will be lower because every `observe` triggers a session invocation
                to update rewards in the graph.
            batch_size: Int specifying batch size used to sample from memory. Should be smaller than memory size.
            memory: Dict describing memory via `type` (e.g. `replay`) and `capacity`.
            first_update: Int describing at which time step the first update is performed. Should be larger
                than batch size.
            update_frequency: Int specifying number of observe steps to perform until an update is executed.
            repeat_update: Int specifying how many update steps are performed per update, where each update step
                implies sampling a batch from the memory and passing it to the model.
            expert_margin: Positive float specifying enforced supervised margin between expert action Q-value
                and other Q-values.
            supervised_weight: Weight of supervised loss term.
            demo_memory_capacity: Int describing capacity of expert demonstration memory.
            demo_sampling_ratio: Runtime sampling ratio of expert data.
        """
        if network_spec is None:
            raise TensorForceError("No network_spec provided.")

        if optimizer is None:
            self.optimizer = dict(
                type='adam',
                learning_rate=1e-3
            )
        else:
            self.optimizer = optimizer

        if memory is None:
            memory = dict(
                type='replay',
                capacity=100000
            )
        else:
            self.memory = memory

        self.network_spec = network_spec
        self.device = device
        self.scope = scope
        self.saver_spec = saver_spec
        self.summary_spec = summary_spec
        self.distributed_spec = distributed_spec
        self.discount = discount
        self.normalize_rewards = normalize_rewards
        self.variable_noise = variable_noise
        self.distributions_spec = distributions_spec
        self.entropy_regularization = entropy_regularization
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight
        self.huber_loss = huber_loss

        # DQFD always uses double dqn, which is a required key for a q-model.
        self.double_q_model = True

        self.demo_memory_capacity = demo_memory_capacity
        self.expert_margin = expert_margin
        self.supervised_weight = supervised_weight

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs online training samples:
        # p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
            'demo_batch_size is positive. (Calculated {} based on current parameters)'.format(self.demo_batch_size)

        super(DQFDAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            preprocessing=preprocessing,
            exploration=exploration,
            reward_preprocessing=reward_preprocessing,
            batched_observe=batched_observe,
            batch_size=batch_size,
            memory=memory,
            first_update=first_update,
            update_frequency=update_frequency,
            repeat_update=repeat_update
        )

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop.
        self.demo_memory = Replay(self.states_spec, self.actions_spec, self.demo_memory_capacity)

    def initialize_model(self, states_spec, actions_spec):
        return QDemoModel(
            states_spec=states_spec,
            actions_spec=actions_spec,
            network_spec=self.network_spec,
            device=self.device,
            scope=self.scope,
            saver_spec=self.saver_spec,
            summary_spec=self.summary_spec,
            distributed_spec=self.distributed_spec,
            optimizer=self.optimizer,
            discount=self.discount,
            normalize_rewards=self.normalize_rewards,
            variable_noise=self.variable_noise,
            distributions_spec=self.distributions_spec,
            entropy_regularization=self.entropy_regularization,
            target_sync_frequency=self.target_sync_frequency,
            target_update_weight=self.target_update_weight,
            double_q_model=self.double_q_model,
            huber_loss=self.huber_loss,
            # TEMP: random sampling fix.
            random_sampling_fix=True,
            expert_margin=self.expert_margin,
            supervised_weight=self.supervised_weight
        )

    def observe(self, reward, terminal):
        """
        Adds observations and updates via sampling from memories according to the update rate.
        DQFD samples from the online replay memory and the demo memory, with the fractions
        controlled by the hyperparameter p, called the 'expert sampling ratio'.

        Args:
            reward: Reward value.
            terminal: Terminal flag.
        """
        super(DQFDAgent, self).observe(reward=reward, terminal=terminal)

        if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0:
            for _ in xrange(self.repeat_update):
                batch = self.demo_memory.get_batch(batch_size=self.demo_batch_size, next_states=True)
                self.model.demonstration_update(
                    states={name: np.stack((batch['states'][name], batch['next_states'][name])) for name in batch['states']},
                    internals=batch['internals'],
                    actions=batch['actions'],
                    terminal=batch['terminal'],
                    reward=batch['reward']
                )

    def import_demonstrations(self, demonstrations):
        """
        Imports demonstrations, i.e. expert observations. Note that for large numbers of
        observations, set_demonstrations is more appropriate; it directly sets memory contents
        from an array and expects a different layout.

        Args:
            demonstrations: List of observation dicts.
        """
        for observation in demonstrations:
            if self.unique_state:
                state = dict(state=observation['states'])
            else:
                state = observation['states']
            if self.unique_action:
                action = dict(action=observation['actions'])
            else:
                action = observation['actions']

            self.demo_memory.add_observation(
                states=state,
                internals=observation['internals'],
                actions=action,
                terminal=observation['terminal'],
                reward=observation['reward']
            )

    def set_demonstrations(self, batch):
        """
        Sets all demonstrations from batch data. Expects a dict wherein each value contains an
        array of all states, actions, rewards, terminals and internals respectively.

        Args:
            batch: Dict of demonstration arrays.
        """
        self.demo_memory.set_memory(
            states=batch['states'],
            internals=batch['internals'],
            actions=batch['actions'],
            terminal=batch['terminal'],
            reward=batch['reward']
        )

    def pretrain(self, steps):
        """
        Computes pretrain updates.

        Args:
            steps: Number of updates to execute.
        """
        for _ in xrange(steps):
            # Sample from demo memory.
            batch = self.demo_memory.get_batch(batch_size=self.batch_size, next_states=True)

            # Update using both the double Q-learning loss and the supervised loss.
            self.model.demonstration_update(
                states={name: np.stack((batch['states'][name], batch['next_states'][name])) for name in batch['states']},
                internals=batch['internals'],
                actions=batch['actions'],
                terminal=batch['terminal'],
                reward=batch['reward']
            )
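# Construction sketch (not part of the original source): instantiating this version of the agent
# via its keyword arguments. The spec values below are illustrative placeholders that follow the
# formats described in the docstring above; only documented parameters are used.
def build_dqfd_agent():
    states_spec = dict(shape=(8,), type='float')
    actions_spec = dict(type='int', num_actions=4)
    network_spec = [
        dict(type='dense', size=64),
        dict(type='dense', size=64)
    ]
    agent = DQFDAgent(
        states_spec=states_spec,
        actions_spec=actions_spec,
        network_spec=network_spec,
        batch_size=32,
        demo_memory_capacity=10000,
        demo_sampling_ratio=0.2,  # yields demo_batch_size = int(0.2 * 32 / 0.8) = 8
        expert_margin=0.5,
        supervised_weight=0.1
    )
    return agent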
def __init__(
    self,
    states_spec,
    actions_spec,
    network_spec,
    device=None,
    session_config=None,
    scope='dqfd',
    saver_spec=None,
    summary_spec=None,
    distributed_spec=None,
    optimizer=None,
    discount=0.99,
    variable_noise=None,
    states_preprocessing_spec=None,
    explorations_spec=None,
    reward_preprocessing_spec=None,
    distributions_spec=None,
    entropy_regularization=None,
    target_sync_frequency=10000,
    target_update_weight=1.0,
    huber_loss=None,
    batched_observe=1000,
    batch_size=32,
    memory=None,
    first_update=10000,
    update_frequency=4,
    repeat_update=1,
    expert_margin=0.5,
    supervised_weight=0.1,
    demo_memory_capacity=10000,
    demo_sampling_ratio=0.2
):
    """
    Deep Q-learning from demonstration (DQFD) agent
    ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). This agent uses DQN to pre-train
    from demonstration data in combination with a supervised loss.

    Args:
        states_spec: Dict containing at least one state definition. In the case of a single state,
            keys `shape` and `type` are necessary. For multiple states, pass a dict of dicts where
            each state is a dict itself with a unique name as its key.
        actions_spec: Dict containing at least one action definition. Actions have types and either
            `num_actions` for discrete actions or a `shape` for continuous actions. Consult documentation
            and tests for more.
        network_spec: List of layers specifying a neural network via layer types, sizes and optional
            arguments such as activation or regularisation. Full examples are in the examples/configs folder.
        device: Device string specifying model device.
        session_config: Optional tf.ConfigProto with additional desired session configurations.
        scope: TensorFlow scope, defaults to agent name (e.g. `dqfd`).
        saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints are saved.
            Use either `seconds` or `steps` to specify how often the model should be saved. The `load` flag
            specifies if a model is initially loaded (set to True) from a file `file`.
        summary_spec: Dict specifying summaries for TensorBoard. Requires a `directory` to store summaries,
            `steps` or `seconds` to specify how often to save summaries, and a list of `labels` to indicate
            which values to export, e.g. `losses`, `variables`. Consult neural network class and model for
            all available labels.
        distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and `replica_model`
            Boolean flags to indicate workers and parameter servers. Use a `cluster_spec` key to pass a
            TensorFlow cluster spec.
        optimizer: Dict specifying optimizer type and its optional parameters, typically a `learning_rate`.
            Available optimizer types include standard TensorFlow optimizers, `natural_gradient`, and
            `evolutionary`. Consult the optimizer test or example configurations for more.
        discount: Float specifying reward discount factor.
        variable_noise: Experimental optional parameter specifying variable noise (NoisyNet).
        states_preprocessing_spec: Optional list of states preprocessors to apply to state
            (e.g. `image_resize`, `grayscale`).
        explorations_spec: Optional dict specifying action exploration type (epsilon greedy or Gaussian noise).
        reward_preprocessing_spec: Optional dict specifying reward preprocessing.
        distributions_spec: Optional dict specifying action distributions to override default distribution
            choices. Must match action names.
        entropy_regularization: Optional positive float specifying an entropy regularization value.
        target_sync_frequency: Interval between optimization calls synchronizing the target network.
        target_update_weight: Update weight, 1.0 meaning a full assignment to target network from training network.
        huber_loss: Optional float specifying Huber loss clipping.
        batched_observe: Optional int specifying how many observe calls are batched into one session run.
            Without batching, throughput will be lower because every `observe` triggers a session invocation
            to update rewards in the graph.
        batch_size: Int specifying batch size used to sample from memory. Should be smaller than memory size.
        memory: Dict describing memory via `type` (e.g. `replay`) and `capacity`.
        first_update: Int describing at which time step the first update is performed. Should be larger
            than batch size.
        update_frequency: Int specifying number of observe steps to perform until an update is executed.
        repeat_update: Int specifying how many update steps are performed per update, where each update step
            implies sampling a batch from the memory and passing it to the model.
        expert_margin: Positive float specifying enforced supervised margin between expert action Q-value
            and other Q-values.
        supervised_weight: Weight of supervised loss term.
        demo_memory_capacity: Int describing capacity of expert demonstration memory.
        demo_sampling_ratio: Runtime sampling ratio of expert data.
    """
    if network_spec is None:
        raise TensorForceError("No network_spec provided.")

    if optimizer is None:
        self.optimizer = dict(type='adam', learning_rate=1e-3)
    else:
        self.optimizer = optimizer

    if memory is None:
        memory = dict(type='replay', capacity=100000)
    else:
        self.memory = memory

    self.network_spec = network_spec
    self.device = device
    self.session_config = session_config
    self.scope = scope
    self.saver_spec = saver_spec
    self.summary_spec = summary_spec
    self.distributed_spec = distributed_spec
    self.discount = discount
    self.variable_noise = variable_noise
    self.states_preprocessing_spec = states_preprocessing_spec
    self.explorations_spec = explorations_spec
    self.reward_preprocessing_spec = reward_preprocessing_spec
    self.distributions_spec = distributions_spec
    self.entropy_regularization = entropy_regularization
    self.target_sync_frequency = target_sync_frequency
    self.target_update_weight = target_update_weight
    self.huber_loss = huber_loss

    # DQFD always uses double dqn, which is a required key for a q-model.
    self.double_q_model = True

    self.demo_memory_capacity = demo_memory_capacity
    self.expert_margin = expert_margin
    self.supervised_weight = supervised_weight

    # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs online training samples:
    # p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
    self.demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))
    assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
        'demo_batch_size is positive. (Calculated {} based on current parameters)'.format(self.demo_batch_size)

    super(DQFDAgent, self).__init__(
        states_spec=states_spec,
        actions_spec=actions_spec,
        batched_observe=batched_observe,
        batch_size=batch_size,
        memory=memory,
        first_update=first_update,
        update_frequency=update_frequency,
        repeat_update=repeat_update
    )

    # This is the demonstration memory that we will fill with observations before starting
    # the main training loop.
    self.demo_memory = Replay(self.states_spec, self.actions_spec, self.demo_memory_capacity)
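# Worked example (not part of the original source) for the demo_batch_size formula above:
# with p = demo_sampling_ratio, n_demo = p * batch_size / (1 - p).
def compute_demo_batch_size(batch_size, demo_sampling_ratio):
    return int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))

# E.g. batch_size=32 with demo_sampling_ratio=0.2 gives int(0.2 * 32 / 0.8) = 8 demo samples per
# update, whereas a very small ratio such as 0.01 truncates to 0 and triggers the assertion above.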
class DQFDAgent(MemoryAgent):
    """
    Deep Q-learning from demonstration (DQFD) agent
    ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). This agent uses DQN to pre-train
    from demonstration data.

    Configuration:

    Each agent requires the following configuration parameters:

    * `states`: dict containing one or more state definitions.
    * `actions`: dict containing one or more action definitions.
    * `preprocessing`: dict or list containing state preprocessing configuration.
    * `exploration`: dict containing action exploration configuration.

    Each model requires the following configuration parameters:

    * `discount`: float of discount factor (gamma).
    * `learning_rate`: float of learning rate (alpha).
    * `optimizer`: string of optimizer to use (e.g. 'adam').
    * `device`: string of tensorflow device name.
    * `tf_summary`: boolean indicating whether to use tensorflow summary file writer.
    * `log_level`: string containing log level (e.g. 'info').
    * `distributed`: boolean indicating whether to use distributed tensorflow.
    * `global_model`: global model.
    * `session`: session to use.

    The `DQFDAgent` class additionally requires the following parameters:

    * `batch_size`: integer of the batch size.
    * `memory_capacity`: integer of maximum experiences to store.
    * `memory`: string indicating memory type ('replay' or 'prioritized_replay').
    * `min_replay_size`: integer of minimum replay size before the first update.
    * `update_rate`: float of the update rate (e.g. 0.25 = every 4 steps).
    * `target_network_update_rate`: float of target network update rate (e.g. 0.01 = every 100 steps).
    * `use_target_network`: boolean indicating whether to use a target network.
    * `update_repeat`: integer of how many times to repeat an update.
    * `update_target_weight`: float of update target weight (tau parameter).
    * `demo_sampling_ratio`: float, ratio of expert data used at runtime to train from.
    * `supervised_weight`: float, weight of large margin classifier loss.
    * `expert_margin`: float of difference in Q-values between expert action and other actions enforced
      by the large margin function.
    * `clip_loss`: float; if not 0, uses the Huber loss with clip_loss as the linear bound.
    """

    name = 'DQFDAgent'
    model = DQFDModel

    default_config = dict(
        target_update_frequency=10000,
        demo_memory_capacity=1000000,
        demo_sampling_ratio=0.01
    )

    def __init__(self, config, model=None):
        config.default(DQFDAgent.default_config)
        super(DQFDAgent, self).__init__(config, model)
        self.target_update_frequency = config.target_update_frequency

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop.
        self.demo_memory = Replay(config.demo_memory_capacity, config.states, config.actions)

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs online training samples:
        # p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(config.demo_sampling_ratio * config.batch_size / (1.0 - config.demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to make sure demo_batch_size is positive. ' \
            '(Calculated {} based on current parameters)'.format(self.demo_batch_size)

    def observe(self, reward, terminal):
        """
        Adds observations and updates via sampling from memories according to the update rate.
        DQFD samples from the online replay memory and the demo memory, with the fractions
        controlled by the hyperparameter p, called the 'expert sampling ratio'.

        Args:
            reward: Reward value.
            terminal: Terminal flag.
        """
        super(DQFDAgent, self).observe(reward=reward, terminal=terminal)

        if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0:
            for _ in xrange(self.repeat_update):
                batch = self.demo_memory.get_batch(self.demo_batch_size)
                self.model.demonstration_update(batch=batch)

        if self.timestep >= self.first_update and self.timestep % self.target_update_frequency == 0:
            self.model.update_target()

    def import_demonstrations(self, demonstrations):
        """
        Imports demonstrations, i.e. expert observations.

        Args:
            demonstrations: List of observation dicts.
        """
        for observation in demonstrations:
            if self.unique_state:
                state = dict(state=observation['state'])
            else:
                state = observation['state']
            if self.unique_action:
                action = dict(action=observation['action'])
            else:
                action = observation['action']

            self.demo_memory.add_observation(
                state=state,
                action=action,
                reward=observation['reward'],
                terminal=observation['terminal'],
                internal=observation['internal']
            )

    def pretrain(self, steps):
        """
        Computes pretrain updates.

        Args:
            steps: Number of updates to execute.
        """
        for _ in xrange(steps):
            # Sample from demo memory.
            batch = self.demo_memory.get_batch(batch_size=self.batch_size, next_states=True)

            # Update using both the double Q-learning loss and the supervised loss.
            self.model.demonstration_update(batch)
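# Illustrative sketch (not part of the original source): the observation layout this older version
# of import_demonstrations reads, using singular keys ('state', 'action', 'reward', 'terminal',
# 'internal'). The numeric values are placeholders.
example_demonstrations = [
    dict(
        state=[0.1, 0.0, -0.3, 0.5],
        action=1,
        reward=1.0,
        terminal=False,
        internal=[]
    )
]
# agent.import_demonstrations(example_demonstrations), followed by agent.pretrain(steps=...).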
class DQFDAgent(MemoryAgent):
    """
    Deep Q-learning from demonstration (DQFD) agent
    ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). This agent uses DQN to pre-train
    from demonstration data.

    Configuration:

    Each agent requires the following configuration parameters:

    * `states`: dict containing one or more state definitions.
    * `actions`: dict containing one or more action definitions.
    * `preprocessing`: dict or list containing state preprocessing configuration.
    * `exploration`: dict containing action exploration configuration.

    Each model requires the following configuration parameters:

    * `discount`: float of discount factor (gamma).
    * `learning_rate`: float of learning rate (alpha).
    * `optimizer`: string of optimizer to use (e.g. 'adam').
    * `optimizer_args`: list of arguments for optimizer.
    * `optimizer_kwargs`: dict of keyword arguments for optimizer.
    * `device`: string of tensorflow device name.
    * `tf_saver`: boolean whether to save model parameters.
    * `tf_summary`: boolean indicating whether to use tensorflow summary file writer.
    * `log_level`: string containing log level (e.g. 'info').
    * `distributed`: boolean indicating whether to use distributed tensorflow.
    * `global_model`: global model.
    * `session`: session to use.

    The `DQFDAgent` class additionally requires the following parameters:

    * `batch_size`: integer of the batch size.
    * `memory_capacity`: integer of maximum experiences to store.
    * `memory`: string indicating memory type ('replay' or 'prioritized_replay').
    * `memory_args`: list of arguments to pass to replay memory constructor.
    * `memory_kwargs`: dict of keyword arguments to pass to replay memory constructor.
    * `min_replay_size`: integer of minimum replay size before the first update.
    * `update_rate`: float of the update rate (e.g. 0.25 = every 4 steps).
    * `target_network_update_rate`: float of target network update rate (e.g. 0.01 = every 100 steps).
    * `use_target_network`: boolean indicating whether to use a target network.
    * `update_repeat`: integer of how many times to repeat an update.
    * `update_target_weight`: float of update target weight (tau parameter).
    * `demo_sampling_ratio`: float, ratio of expert data used at runtime to train from.
    * `supervised_weight`: float, weight of large margin classifier loss.
    * `expert_margin`: float of difference in Q-values between expert action and other actions enforced
      by the large margin function.
    * `clip_gradients`: float of maximum values for gradients before clipping.
    """

    name = 'DQFDAgent'
    model = DQFDModel

    default_config = dict(
        target_update_frequency=10000,
        demo_memory_capacity=1000000,
        demo_sampling_ratio=0.01
    )

    def __init__(self, config):
        config.default(DQFDAgent.default_config)
        super(DQFDAgent, self).__init__(config)
        self.target_update_frequency = config.target_update_frequency

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop.
        self.demo_memory = Replay(config.demo_memory_capacity, config.states, config.actions)

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs online training samples:
        # p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(config.demo_sampling_ratio * config.batch_size / (1.0 - config.demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to make sure demo_batch_size is positive. ' \
            '(Calculated {} based on current parameters)'.format(self.demo_batch_size)

    def observe(self, reward, terminal):
        """
        Adds observations and updates via sampling from memories according to the update rate.
        DQFD samples from the online replay memory and the demo memory, with the fractions
        controlled by the hyperparameter p, called the 'expert sampling ratio'.

        Args:
            reward: Reward value.
            terminal: Terminal flag.
        """
        super(DQFDAgent, self).observe(reward=reward, terminal=terminal)

        if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0:
            for _ in xrange(self.repeat_update):
                batch = self.demo_memory.get_batch(self.demo_batch_size)
                self.model.demonstration_update(batch=batch)

        if self.timestep >= self.first_update and self.timestep % self.target_update_frequency == 0:
            self.model.update_target()

    def import_demonstrations(self, demonstrations):
        """
        Imports demonstrations, i.e. expert observations.

        Args:
            demonstrations: List of observation dicts.
        """
        for observation in demonstrations:
            if self.unique_state:
                state = dict(state=observation['state'])
            else:
                state = observation['state']
            if self.unique_action:
                action = dict(action=observation['action'])
            else:
                action = observation['action']

            self.demo_memory.add_observation(
                state=state,
                action=action,
                reward=observation['reward'],
                terminal=observation['terminal'],
                internal=observation['internal']
            )

    def pretrain(self, steps):
        """
        Computes pretrain updates.

        Args:
            steps: Number of updates to execute.
        """
        for _ in xrange(steps):
            # Sample from demo memory.
            batch = self.demo_memory.get_batch(self.batch_size)

            # Update using both the double Q-learning loss and the supervised loss.
            self.model.demonstration_update(batch)
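# Configuration sketch (not part of the original source): the parameter layout this config-based
# constructor documents above, shown as a plain dict. How the dict is wrapped into the config
# object passed to DQFDAgent depends on the surrounding framework version and is not shown here;
# the concrete values and the spec formats for states/actions are placeholders.
dqfd_config_sketch = dict(
    states=dict(shape=(4,), type='float'),
    actions=dict(continuous=False, num_actions=2),
    batch_size=32,
    memory_capacity=100000,
    memory='replay',
    min_replay_size=1000,
    update_rate=0.25,                 # one update every 4 steps
    target_network_update_rate=0.01,  # one target sync every 100 steps
    use_target_network=True,
    update_repeat=1,
    update_target_weight=1.0,
    demo_memory_capacity=10000,
    demo_sampling_ratio=0.2,          # int(0.2 * 32 / 0.8) = 8 demo samples per update
    supervised_weight=0.1,
    expert_margin=0.5,
    target_update_frequency=10000
)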