Example #1
    def __init__(self, config, scope='memory_agent', network_builder=None):
        """
        Initialize a vanilla DQN agent as described in
        http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html.

        :param config: Configuration parameters for agent
        :param scope: TensorFlow scope
        """
        self.config = create_config(config, default=self.default_config)
        self.model = None

        self.memory = ReplayMemory(**self.config)
        self.step_count = 0
        self.update_repeat = self.config.update_repeat
        self.batch_size = self.config.batch_size
        self.update_steps = int(round(1 / self.config.update_rate))
        self.use_target_network = self.config.use_target_network

        if self.use_target_network:
            self.target_update_steps = int(
                round(1 / self.config.target_network_update_rate))

        self.min_replay_size = self.config.min_replay_size

        if self.__class__.model:
            self.model = self.__class__.model(self.config,
                                              scope,
                                              network_builder=network_builder)
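
The update cadence in this constructor comes from inverting the configured rates: update_steps is how many environment steps pass between model updates, and target_update_steps how many pass between target network syncs. A quick check of that arithmetic, using the default_config values shown in Example #3 below (0.25 and 0.0001 are the only assumed inputs):

# Reproduces the interval arithmetic from __init__ with the default
# config values (update_rate=0.25, target_network_update_rate=0.0001).
update_rate = 0.25
target_network_update_rate = 0.0001

update_steps = int(round(1 / update_rate))                        # one update every 4 steps
target_update_steps = int(round(1 / target_network_update_rate))  # one sync every 10000 steps

assert update_steps == 4
assert target_update_steps == 10000
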
Example #2
    def __init__(self, config, scope='dqfd_agent', network_builder=None):
        """
        
        :param config: 
        :param scope: 
        """
        self.config = create_config(config, default=self.default_config)

        # This is the online memory
        self.replay_memory = ReplayMemory(**self.config)

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop
        # TODO we might want different sizes for these memories -> add config param
        self.demo_memory = ReplayMemory(**self.config)

        self.step_count = 0

        # Called p in paper, controls ratio of expert vs online training samples
        self.expert_sampling_ratio = self.config.expert_sampling_ratio

        self.update_repeat = self.config.update_repeat
        self.batch_size = self.config.batch_size

        # p = n_demo / (n_demo + n_replay) => n_demo  = p * n_replay / (1 - p)
        self.demo_batch_size = int(self.expert_sampling_ratio * self.batch_size
                                   / (1.0 - self.expert_sampling_ratio))
        self.update_steps = int(round(1 / self.config.update_rate))
        self.use_target_network = self.config.use_target_network

        if self.use_target_network:
            self.target_update_steps = int(
                round(1 / self.config.target_network_update_rate))

        self.min_replay_size = self.config.min_replay_size

        if self.__class__.model:
            self.model = self.__class__.model(self.config,
                                              scope,
                                              network_builder=network_builder)
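
The demo_batch_size line implements the relation stated in the comment: if p is the desired fraction of demonstration samples in a combined update and batch_size is the number of online samples, then n_demo = p * n_replay / (1 - p). A worked example of that arithmetic, with an illustrative expert_sampling_ratio of 0.2 (the config above does not define a default for it):

# Illustrative values only; expert_sampling_ratio (p) has no default here.
expert_sampling_ratio = 0.2   # p
batch_size = 32               # n_replay, online samples per update

# n_demo = p * n_replay / (1 - p)
demo_batch_size = int(expert_sampling_ratio * batch_size
                      / (1.0 - expert_sampling_ratio))

assert demo_batch_size == 8
# Demonstrations then make up 8 / (8 + 32) = 0.2 of each combined update,
# recovering p.
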
Example #3
class MemoryAgent(RLAgent):

    name = 'MemoryAgent'

    default_config = {
        'batch_size': 32,
        'update_rate': 0.25,
        'target_network_update_rate': 0.0001,
        'min_replay_size': 5e4,
        'deterministic_mode': False,
        'use_target_network': False,
        'update_repeat': 1
    }

    model = None

    def __init__(self, config, scope='memory_agent', network_builder=None):
        """
        Initialize a vanilla DQN agent as described in
        http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html.

        :param config: Configuration parameters for agent
        :param scope: TensorFlow scope
        """
        self.config = create_config(config, default=self.default_config)
        self.model = None

        self.memory = ReplayMemory(**self.config)
        self.step_count = 0
        self.update_repeat = self.config.update_repeat
        self.batch_size = self.config.batch_size
        self.update_steps = int(round(1 / self.config.update_rate))
        self.use_target_network = self.config.use_target_network

        if self.use_target_network:
            self.target_update_steps = int(
                round(1 / self.config.target_network_update_rate))

        self.min_replay_size = self.config.min_replay_size

        if self.__class__.model:
            self.model = self.__class__.model(self.config,
                                              scope,
                                              network_builder=network_builder)

    def setup(self):
        """
        Prepares the agent to run

        :return:
        """
        self.model.initialize()

    def update(self, batch):
        """
        Explicitly calls update using the provided batch of experiences.

        :param batch:
        :return:
        """
        self.model.update(batch)

    def get_action(self, *args, **kwargs):
        """
        Executes one reinforcement learning step.

        :return: Which action to take
        """
        action = self.model.get_action(*args, **kwargs)

        return action

    def add_observation(self, state, action, reward, terminal):
        """
        Adds an observation for training purposes. Implicitly computes updates
        according to the update frequency.

        :param state: State observed
        :param action: Action taken in state
        :param reward: Reward observed
        :param terminal: Indicates terminal state
        """
        self.memory.add_experience(state, action, reward, terminal)

        self.step_count += 1

        if self.step_count >= self.min_replay_size and self.step_count % self.update_steps == 0:
            for _ in xrange(self.update_repeat):
                batch = self.memory.sample_batch(self.batch_size)
                self.model.update(batch)

        if self.step_count >= self.min_replay_size and self.use_target_network \
                and self.step_count % self.target_update_steps == 0:
            self.model.update_target_network()

    def get_variables(self):
        return self.model.get_variables()

    def assign_variables(self, values):
        self.model.assign_variables(values)

    def get_gradients(self):
        return self.model.get_gradients()

    def apply_gradients(self, grads_and_vars):
        self.model.apply_gradients(grads_and_vars)

    def save_model(self, path):
        self.model.save_model(path)

    def load_model(self, path):
        self.model.load_model(path)
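
MemoryAgent itself is a base class: the class attribute model is None, and the `if self.__class__.model:` check only builds a model when a concrete subclass provides one. A rough sketch of the interaction loop implied by the public methods, assuming a hypothetical subclass SomeMemoryAgent and a hypothetical env with reset()/execute(action) (none of these are defined in the examples above):

# Hypothetical driver loop; SomeMemoryAgent and env are placeholders.
agent = SomeMemoryAgent(config={'batch_size': 32, 'update_rate': 0.25,
                                'min_replay_size': 5e4})
agent.setup()  # initializes the underlying model

state = env.reset()
for _ in xrange(1000000):
    action = agent.get_action(state)
    next_state, reward, terminal = env.execute(action)

    # Stores the experience; after min_replay_size steps it also triggers
    # a model update every update_steps steps (and target network syncs
    # if use_target_network is enabled).
    agent.add_observation(state, action, reward, terminal)

    state = env.reset() if terminal else next_state
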
Example #4
class DQFDAgent(RLAgent):
    model = None

    default_config = {
        'batch_size': 32,
        'update_rate': 0.25,
        'target_network_update_rate': 0.0001,
        'min_replay_size': 5e4,
        'deterministic_mode': False,
        'use_target_network': False,
        'update_repeat': 1
    }

    def __init__(self, config, scope='dqfd_agent'):
        """
        
        :param config: 
        :param scope: 
        """
        self.config = create_config(config, default=self.default_config)
        self.model = None

        # This is the online memory
        self.replay_memory = ReplayMemory(**self.config)

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop
        # TODO we might want different sizes for these memories -> add config param
        self.demo_memory = ReplayMemory(**self.config)

        self.step_count = 0

        # Called p in paper, controls ratio of expert vs online training samples
        self.expert_sampling_ratio = self.config.expert_sampling_ratio

        self.update_repeat = self.config.update_repeat
        self.batch_size = self.config.batch_size

        # p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(self.expert_sampling_ratio * self.batch_size
                                   / (1.0 - self.expert_sampling_ratio))
        self.update_steps = int(round(1 / self.config.update_rate))
        self.use_target_network = self.config.use_target_network

        if self.use_target_network:
            self.target_update_steps = int(
                round(1 / self.config.target_network_update_rate))

        self.min_replay_size = self.config.min_replay_size

        if self.__class__.model:
            self.model = self.__class__.model(self.config, scope)

    def add_demo_observation(self, state, action, reward, terminal):
        """
        Adds observations to demo memory. 

        """
        self.demo_memory.add_experience(state, action, reward, terminal)

    def pretrain(self, steps=1):
        """
        
        :param steps: Number of pre-train updates to perform.
        
        """
        for _ in xrange(steps):
            # Sample from demo memory
            batch = self.demo_memory.sample_batch(self.batch_size)

            # Update using both the double Q-learning loss and the supervised loss
            self.model.pretrain_update(batch)

    def add_observation(self, state, action, reward, terminal):
        """
        Adds observations, updates via sampling from memories according to update rate.
        In the DQFD case, we sample from the online replay memory and the demo memory with
        the fractions controlled by a hyperparameter p called 'expert sampling ratio.
        
        :param state: 
        :param action: 
        :param reward: 
        :param terminal: 
        :return: 
        """
        self.replay_memory.add_experience(state, action, reward, terminal)

        self.step_count += 1

        if self.step_count >= self.min_replay_size and self.step_count % self.update_steps == 0:
            for _ in xrange(self.update_repeat):
                # Sample batches according to expert sampling ratio
                # In the paper, p is given as p = n_demo / (n_replay + n_demo)
                demo_batch = self.demo_memory.sample_batch(
                    self.demo_batch_size)
                online_batch = self.replay_memory.sample_batch(self.batch_size)

                self.model.update(demo_batch, online_batch)

        if self.step_count >= self.min_replay_size and self.use_target_network \
                and self.step_count % self.target_update_steps == 0:
            self.model.update_target_network()

    def get_action(self, *args, **kwargs):
        """
        Get action from model, as in DQN.
        
        :param state: 
        """

        action = self.model.get_action(*args, **kwargs)

        return action

    def save_model(self, path):
        self.model.save_model(path)

    def load_model(self, path):
        self.model.load_model(path)
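
The DQFD workflow implied by these methods is: fill the demonstration memory first, run a pre-training phase on it, then start the online loop during which add_observation mixes demo and online batches. A rough sketch, where SomeDQFDAgent (a concrete subclass with a model), demonstrations (a list of recorded transitions) and env are all placeholders, not part of the code above:

# Hypothetical workflow; SomeDQFDAgent, demonstrations and env are placeholders.
agent = SomeDQFDAgent(config={'batch_size': 32, 'expert_sampling_ratio': 0.2,
                              'update_rate': 0.25, 'min_replay_size': 5e4})

# 1) Fill the demonstration memory before the main training loop.
for (state, action, reward, terminal) in demonstrations:
    agent.add_demo_observation(state, action, reward, terminal)

# 2) Pre-train on demonstration batches only.
agent.pretrain(steps=10000)

# 3) Online phase: each update samples demo_batch_size demo experiences and
#    batch_size online experiences, as controlled by expert_sampling_ratio.
state = env.reset()
while True:
    action = agent.get_action(state)
    next_state, reward, terminal = env.execute(action)
    agent.add_observation(state, action, reward, terminal)
    state = env.reset() if terminal else next_state
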
Example #5
def test_replay_memory():
    """
    Testing replay memory.
    """
    capacity = np.random.randint(5, 8)
    batch_size = np.random.randint(1, capacity)

    state_shape = tuple(np.random.randint(1, 4, size=2))
    action_shape = (4, )

    memory = ReplayMemory(capacity, state_shape, action_shape)

    states = []
    actions = []
    rewards = []
    terminals = []

    def sample_observation():
        while True:
            state = np.random.randint(0, 255, size=state_shape)
            if len(states) > 0:
                if not np.all(
                        np.any(np.array(states) - np.array(state), axis=1)):
                    # avoid duplicate states
                    continue
            break

        action = np.random.randint(4)
        reward = np.random.choice(2, 1, p=[0.7, 0.3])
        terminal = np.random.choice(2, 1, p=[0.9, 0.1])

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        terminals.append(terminal)

        memory.add_experience(state, action, reward, terminal)

        return state, action, reward, terminal

    for i in xrange(capacity):
        state, action, reward, terminal = sample_observation()

    assert not np.any(np.array(memory.states) - np.array(states))

    state, action, reward, terminal = sample_observation()

    assert not np.any(np.array(memory.states[0]) - np.array(state))

    for i in xrange(capacity - 1):
        state, action, reward, terminal = sample_observation()

    assert not np.any(np.array(memory.states) - np.array(states[-capacity:]))

    batch = memory.sample_batch(batch_size)
    exp = zip(list(batch['states']), batch['actions'], batch['rewards'],
              batch['terminals'], batch['next_states'])

    # Warning: since we're testing a random batch, some of the following assertions could be True by coincidence
    # In this test, states are unique, so we can just compare state tensors with each other

    for i in xrange(100):
        first_state = states[0]
        last_state = states[-1]
        for (state, action, reward, terminal, next_state) in exp:
            # last state must not be in experiences, as it has no next state
            assert np.all(np.any(state - last_state, axis=1))

            # first state must not be in next_states, as it has no previous state
            assert np.all(np.any(next_state - first_state, axis=1))
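
The test also documents the ReplayMemory interface these agents rely on: construction from capacity, state shape and action shape, one add_experience call per step, and sample_batch returning a dict of arrays. A condensed usage sketch with arbitrarily chosen shapes (ReplayMemory itself is imported from the same package as the examples above):

import numpy as np

# Shapes and capacity are arbitrary; the calls mirror the test above.
state_shape = (3, 2)
memory = ReplayMemory(8, state_shape, (4,))

for _ in xrange(8):
    state = np.random.randint(0, 255, size=state_shape)
    action = np.random.randint(4)
    reward = np.random.choice(2, 1, p=[0.7, 0.3])
    terminal = np.random.choice(2, 1, p=[0.9, 0.1])
    memory.add_experience(state, action, reward, terminal)

batch = memory.sample_batch(4)
# sample_batch returns a dict keyed by 'states', 'actions', 'rewards',
# 'terminals' and 'next_states'.
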