def __init__(self, config, scope='memory_agent', network_builder=None):
    """
    Set up a replay-memory based agent (vanilla DQN, see
    http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html).

    :param config: Configuration parameters for agent
    :param scope: TensorFlow scope
    :param network_builder: Passed through unchanged to the model constructor
    """
    self.config = create_config(config, default=self.default_config)
    cfg = self.config

    self.model = None
    # The replay memory receives the whole config as keyword arguments.
    self.memory = ReplayMemory(**cfg)
    self.step_count = 0

    self.update_repeat = cfg.update_repeat
    self.batch_size = cfg.batch_size

    # A rate is stored as its reciprocal: the number of steps between updates.
    self.update_steps = int(round(1 / cfg.update_rate))

    self.use_target_network = cfg.use_target_network
    if self.use_target_network:
        target_rate = cfg.target_network_update_rate
        self.target_update_steps = int(round(1 / target_rate))

    self.min_replay_size = cfg.min_replay_size

    # Concrete agents provide a model class through the `model` class attribute.
    agent_cls = self.__class__
    if agent_cls.model:
        self.model = agent_cls.model(cfg, scope, network_builder=network_builder)
def __init__(self, config, scope='dqfd_agent', network_builder=None):
    """
    Set up an agent that keeps an online replay memory and a separate
    demonstration memory.

    :param config: Configuration parameters for agent
    :param scope: Scope handed to the model constructor
    :param network_builder: Passed through unchanged to the model constructor
    """
    self.config = create_config(config, default=self.default_config)
    cfg = self.config

    # Online memory, filled while the agent acts.
    self.replay_memory = ReplayMemory(**cfg)

    # Demonstration memory, filled with observations before the main
    # training loop starts.
    # TODO we might want different sizes for these memories -> add config param
    self.demo_memory = ReplayMemory(**cfg)

    self.step_count = 0

    # Called p in the paper: ratio of expert vs online training samples.
    self.expert_sampling_ratio = cfg.expert_sampling_ratio
    self.update_repeat = cfg.update_repeat
    self.batch_size = cfg.batch_size

    # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
    ratio = self.expert_sampling_ratio
    self.demo_batch_size = int(ratio * self.batch_size / (1.0 - ratio))

    # A rate is stored as its reciprocal: the number of steps between updates.
    self.update_steps = int(round(1 / cfg.update_rate))

    self.use_target_network = cfg.use_target_network
    if self.use_target_network:
        target_rate = cfg.target_network_update_rate
        self.target_update_steps = int(round(1 / target_rate))

    self.min_replay_size = cfg.min_replay_size

    # Concrete agents provide a model class through the `model` class attribute.
    agent_cls = self.__class__
    if agent_cls.model:
        self.model = agent_cls.model(cfg, scope, network_builder=network_builder)
class MemoryAgent(RLAgent):
    """
    Agent that records experiences in a replay memory and trains its model
    on batches sampled from that memory.
    """
    name = 'MemoryAgent'

    default_config = {
        'batch_size': 32,
        'update_rate': 0.25,
        'target_network_update_rate': 0.0001,
        'min_replay_size': 5e4,
        'deterministic_mode': False,
        'use_target_network': False,
        'update_repeat': 1
    }

    # Subclasses set this to the model class to instantiate.
    model = None

    def __init__(self, config, scope='memory_agent', network_builder=None):
        """
        Set up a replay-memory based agent (vanilla DQN, see
        http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html).

        :param config: Configuration parameters for agent
        :param scope: TensorFlow scope
        :param network_builder: Passed through unchanged to the model constructor
        """
        self.config = create_config(config, default=self.default_config)
        cfg = self.config

        self.model = None
        # The replay memory receives the whole config as keyword arguments.
        self.memory = ReplayMemory(**cfg)
        self.step_count = 0

        self.update_repeat = cfg.update_repeat
        self.batch_size = cfg.batch_size

        # A rate is stored as its reciprocal: the number of steps between updates.
        self.update_steps = int(round(1 / cfg.update_rate))

        self.use_target_network = cfg.use_target_network
        if self.use_target_network:
            target_rate = cfg.target_network_update_rate
            self.target_update_steps = int(round(1 / target_rate))

        self.min_replay_size = cfg.min_replay_size

        agent_cls = self.__class__
        if agent_cls.model:
            self.model = agent_cls.model(cfg, scope, network_builder=network_builder)

    def setup(self):
        """
        Initialize the model so the agent is ready to run.

        :return:
        """
        self.model.initialize()

    def update(self, batch):
        """
        Run a model update on an explicitly provided batch of experiences.

        :param batch: Batch handed to the model's update
        :return:
        """
        self.model.update(batch)

    def get_action(self, *args, **kwargs):
        """
        Ask the model for an action; all arguments are forwarded to it.

        :return: Which action to take
        """
        return self.model.get_action(*args, **kwargs)

    def add_observation(self, state, action, reward, terminal):
        """
        Store an observation and, depending on the step count, train on it.

        Once `min_replay_size` steps have been observed, the model is updated
        every `update_steps` steps and, if a target network is used, the
        target network is updated every `target_update_steps` steps.

        :param state: State observed
        :param action: Action taken in state
        :param reward: Reward observed
        :param terminal: Indicates terminal state
        """
        self.memory.add_experience(state, action, reward, terminal)
        self.step_count += 1

        # No training at all before enough steps have been observed.
        if self.step_count < self.min_replay_size:
            return

        if self.step_count % self.update_steps == 0:
            for _ in xrange(self.update_repeat):
                self.model.update(self.memory.sample_batch(self.batch_size))

        if self.use_target_network and self.step_count % self.target_update_steps == 0:
            self.model.update_target_network()

    def get_variables(self):
        """Return the model's variables."""
        return self.model.get_variables()

    def assign_variables(self, values):
        """Hand the given values to the model's variable assignment."""
        self.model.assign_variables(values)

    def get_gradients(self):
        """Return the model's gradients."""
        return self.model.get_gradients()

    def apply_gradients(self, grads_and_vars):
        """Hand the given gradients and variables to the model."""
        self.model.apply_gradients(grads_and_vars)

    def save_model(self, path):
        """Save the model to `path`."""
        self.model.save_model(path)

    def load_model(self, path):
        """Load the model from `path`."""
        self.model.load_model(path)
class DQFDAgent(RLAgent):
    """
    Agent that trains from two replay memories: an online memory filled while
    acting and a demonstration memory filled beforehand. The fraction of
    demonstration samples per update is controlled by the
    'expert_sampling_ratio' config value.
    """
    model = None

    default_config = {
        'batch_size': 32,
        'update_rate': 0.25,
        'target_network_update_rate': 0.0001,
        'min_replay_size': 5e4,
        'deterministic_mode': False,
        'use_target_network': False,
        'update_repeat': 1
    }

    def __init__(self, config, scope='dqfd_agent', network_builder=None):
        """
        Create both memories and read the training schedule from the config.

        :param config: Configuration parameters for agent
        :param scope: Scope handed to the model constructor
        :param network_builder: Optional; forwarded to the model constructor
            only when given
        """
        self.config = create_config(config, default=self.default_config)
        self.model = None

        # This is the online memory
        self.replay_memory = ReplayMemory(**self.config)

        # This is the demonstration memory that we will fill with observations
        # before starting the main training loop
        # TODO we might want different sizes for these memories -> add config param
        self.demo_memory = ReplayMemory(**self.config)

        self.step_count = 0

        # Called p in paper, controls ratio of expert vs online training samples
        self.expert_sampling_ratio = self.config.expert_sampling_ratio

        self.update_repeat = self.config.update_repeat
        # batch_size must be assigned before demo_batch_size is derived from it.
        self.batch_size = self.config.batch_size

        # p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
        # Converted to int because it is used as a number of samples.
        self.demo_batch_size = int(self.expert_sampling_ratio * self.batch_size /
                                   (1.0 - self.expert_sampling_ratio))

        self.update_steps = int(round(1 / self.config.update_rate))
        self.use_target_network = self.config.use_target_network

        if self.use_target_network:
            self.target_update_steps = int(
                round(1 / self.config.target_network_update_rate))

        self.min_replay_size = self.config.min_replay_size

        if self.__class__.model:
            # Forward network_builder only when supplied, so model classes
            # whose constructor takes just (config, scope) keep working.
            model_kwargs = {}
            if network_builder is not None:
                model_kwargs['network_builder'] = network_builder
            self.model = self.__class__.model(self.config, scope, **model_kwargs)

    def add_demo_observation(self, state, action, reward, terminal):
        """
        Adds an observation to the demonstration memory.

        :param state: State observed
        :param action: Action taken in state
        :param reward: Reward observed
        :param terminal: Indicates terminal state
        """
        self.demo_memory.add_experience(state, action, reward, terminal)

    def pretrain(self, steps=1):
        """
        Performs pre-training updates using demonstration data only.

        :param steps: Number of pre-train updates to perform.
        """
        for _ in xrange(steps):
            # Sample from demo memory
            batch = self.demo_memory.sample_batch(self.batch_size)

            # Update using both double Q-learning and supervised double_q_loss
            self.model.pretrain_update(batch)

    def add_observation(self, state, action, reward, terminal):
        """
        Adds an observation to the online memory and updates via sampling
        from both memories according to the update rate.

        In the DQFD case, we sample from the online replay memory and the demo
        memory with the fractions controlled by a hyperparameter p called
        'expert sampling ratio'.

        :param state: State observed
        :param action: Action taken in state
        :param reward: Reward observed
        :param terminal: Indicates terminal state
        :return:
        """
        self.replay_memory.add_experience(state, action, reward, terminal)
        self.step_count += 1

        if self.step_count >= self.min_replay_size and self.step_count % self.update_steps == 0:
            for _ in xrange(self.update_repeat):
                # Sample batches according to expert sampling ratio
                # In the paper, p is given as p = n_demo / (n_replay + n_demo)
                demo_batch = self.demo_memory.sample_batch(self.demo_batch_size)
                # The online batch comes from the online memory, not the demo memory.
                online_batch = self.replay_memory.sample_batch(self.batch_size)

                self.model.update(demo_batch, online_batch)

        if self.step_count >= self.min_replay_size and self.use_target_network \
                and self.step_count % self.target_update_steps == 0:
            self.model.update_target_network()

    def get_action(self, *args, **kwargs):
        """
        Get action from model, as in DQN. All arguments are forwarded to the
        model.

        :return: Which action to take
        """
        action = self.model.get_action(*args, **kwargs)

        return action

    def save_model(self, path):
        """Save the model to `path`."""
        self.model.save_model(path)

    def load_model(self, path):
        """Load the model from `path`."""
        self.model.load_model(path)
def test_replay_memory():
    """
    Testing replay memory: insertion, wrap-around and batch sampling.

    The memory is filled to capacity and then overwritten once more, so that
    it holds exactly the last `capacity` observations in insertion order.
    Random batches are then drawn and checked: the newest state must never be
    returned as a state (it has no next state) and the oldest state still
    stored must never be returned as a next state (it has no previous state).
    """
    capacity = np.random.randint(5, 8)
    # At least one sample per batch, otherwise the batch checks are vacuous.
    batch_size = np.random.randint(1, capacity)
    state_shape = tuple(np.random.randint(1, 4, size=2))
    action_shape = (4, )

    memory = ReplayMemory(capacity, state_shape, action_shape)

    states = []
    actions = []
    rewards = []
    terminals = []

    def sample_observation():
        """Draw a random observation with a not-yet-seen state and store it."""
        while True:
            state = np.random.randint(0, 255, size=state_shape)
            # avoid duplicate states, so states can be identified by value
            if not any(np.array_equal(state, seen) for seen in states):
                break

        action = np.random.randint(4)
        reward = np.random.choice(2, 1, p=[0.7, 0.3])
        terminal = np.random.choice(2, 1, p=[0.9, 0.1])

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        terminals.append(terminal)

        memory.add_experience(state, action, reward, terminal)

        return state, action, reward, terminal

    # Fill the memory exactly to capacity.
    for _ in xrange(capacity):
        sample_observation()

    assert not np.any(np.array(memory.states) - np.array(states))

    # One more observation must overwrite the first slot.
    state, action, reward, terminal = sample_observation()
    assert not np.any(np.array(memory.states[0]) - np.array(state))

    # Overwrite the remaining slots; memory now holds the last `capacity` states.
    for _ in xrange(capacity - 1):
        sample_observation()

    assert not np.any(np.array(memory.states) - np.array(states[-capacity:]))

    # states[0] has been overwritten by now; the oldest state still in memory
    # is states[-capacity].
    oldest_state = states[-capacity]
    newest_state = states[-1]

    # Warning: since we're testing random batches, a faulty memory could still
    # pass these assertions by coincidence.
    # In this test, states are unique, so we can just compare state tensors
    # with each other.
    for _ in xrange(100):
        # Draw a fresh batch each round; a zip object would be exhausted after
        # the first pass on Python 3.
        batch = memory.sample_batch(batch_size)
        experiences = zip(list(batch['states']), batch['actions'], batch['rewards'],
                          batch['terminals'], batch['next_states'])

        for (state, action, reward, terminal, next_state) in experiences:
            # last state must not be in experiences, as it has no next state
            assert np.any(state - newest_state)

            # oldest state must not be in next_states, as it has no previous state
            assert np.any(next_state - oldest_state)