Example #1
    def __init__(self,
                 capacity=100000,
                 priority_fraction=0.0,
                 discount_gamma_game_reward=1.0,
                 discount_gamma_graph_reward=1.0,
                 discount_gamma_count_reward=1.0,
                 accumulate_reward_from_final=False):
        # prioritized replay memory
        self._storage = []
        self.capacity = capacity
        self._next_idx = 0

        assert priority_fraction >= 0
        self._alpha = priority_fraction

        it_capacity = 1
        while it_capacity < capacity:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
        self.discount_gamma_game_reward = discount_gamma_game_reward
        self.discount_gamma_graph_reward = discount_gamma_graph_reward
        self.discount_gamma_count_reward = discount_gamma_count_reward
        self.accumulate_reward_from_final = accumulate_reward_from_final
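The while loop above rounds the requested capacity up to the next power of two, which the sum and min segment trees require. A minimal equivalent sketch (plain Python; the helper name is ours):

def next_power_of_two(n):
    # smallest power of two that is >= n, for positive n
    capacity = 1
    while capacity < n:
        capacity *= 2
    return capacity

# the same value can be computed directly from the bit length
assert next_power_of_two(100000) == 1 << (100000 - 1).bit_length() == 131072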
Example #2
    def __init__(self, memory_size=1000000, alpha=0.5, seed=None):
        '''
        Prioritized replay buffer from https://arxiv.org/pdf/1511.05952.pdf
        This implementation is based on the OpenAI sumtree implementation, which can be found here
        https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py
        
        memory_size: int
            maximum number of experiences to store
            
        alpha: float, [0.0, 1.0]
            hyperparameter that controls the amount of prioritization, with 0.0 being 
            no prioritization (the uniform case)
            
        seed: None or int
            random seed for the replay buffer
        '''
        super().__init__(memory_size=memory_size, seed=seed)
        self.alpha = alpha

        it_capacity = 1
        while it_capacity < self._memory_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
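As the docstring notes, alpha controls how much prioritization is applied: each stored priority is raised to the power alpha, and a transition is drawn with probability proportional to the result. A tree-free sketch of that relationship (illustrative names, not part of the class):

import numpy as np

def sampling_probabilities(priorities, alpha):
    # P(i) = p_i**alpha / sum_k p_k**alpha
    scaled = np.asarray(priorities, dtype=np.float64) ** alpha
    return scaled / scaled.sum()

print(sampling_probabilities([2.0, 1.0, 0.5, 0.5], alpha=0.5))  # skewed toward index 0
print(sampling_probabilities([2.0, 1.0, 0.5, 0.5], alpha=0.0))  # uniform: alpha=0 disables prioritization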
Example #3
    def __init__(self, action_size, buffer_size, batch_size, seed, alpha=0.6, beta=0.5, device="cpu"):
        """Initialize a PrioritizedReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
            alpha (float): how much prioritization is used (0 - no prioritization, 1 - full prioritization)
            beta (float): To what degree to use importance weights (0 - no corrections, 1 - full correction)
        """
        super(PrioritizedReplayBuffer, self).__init__(action_size, buffer_size, batch_size, seed, device=device)

        self.alpha = alpha
        self.beta = beta
        self._eps = 0.00000001

        it_capacity = 1
        while it_capacity < buffer_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
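The beta parameter documented above corrects the bias introduced by prioritized sampling through importance-sampling weights, w_i = (N * P(i))**(-beta), normalized by the largest weight so that updates are only ever scaled down. A small stand-alone sketch of that formula (names are illustrative):

import numpy as np

def importance_weights(probs, beta, n):
    # w_i = (n * P(i))**(-beta), normalized by the maximum weight
    probs = np.asarray(probs, dtype=np.float64)
    weights = (n * probs) ** (-beta)
    return weights / weights.max()

# beta=0 applies no correction; beta=1 fully compensates for the non-uniform sampling
print(importance_weights([0.5, 0.25, 0.125, 0.125], beta=0.5, n=4))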
Example #4
    def __init__(
        self,
        obs_dim: list,
        size: int,
        device: str,
        batch_size: int = 32,
        alpha: float = 0.6,
        n_step: int = 1,
        gamma: float = 0.99,
    ):
        """Initialization."""
        assert alpha >= 0

        super(PrioritizedReplayBuffer,
              self).__init__(obs_dim, size, device, batch_size, n_step, gamma)
        self.max_priority, self.tree_ptr = 1.0, 0
        self.alpha = alpha

        # capacity must be positive and a power of 2.
        tree_capacity = 1
        while tree_capacity < self.max_size:
            tree_capacity *= 2

        self.sum_tree = SumSegmentTree(tree_capacity)
        self.min_tree = MinSegmentTree(tree_capacity)
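The n_step and gamma arguments above are forwarded to the base buffer, which presumably folds rewards into an n-step return before a transition is stored. A minimal sketch of that computation, offered as an assumption about the base class rather than code from it:

def n_step_return(rewards, gamma=0.99):
    # discounted sum r_0 + gamma * r_1 + ... + gamma**(n-1) * r_{n-1},
    # accumulated from the last reward backwards
    ret = 0.0
    for reward in reversed(rewards):
        ret = reward + gamma * ret
    return ret

assert n_step_return([1.0, 1.0, 1.0], gamma=0.99) == 1.0 + 0.99 * (1.0 + 0.99 * 1.0)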
Example #5
    def __init__(self, action_size, buffer_size, batch_size, alpha):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            alpha (float): alpha PER value 
        """
        self.max_priority = 1.0
        self.alpha = alpha

        # capacity must be positive and a power of 2.
        self.tree_capacity = 1
        while self.tree_capacity < buffer_size:
            self.tree_capacity *= 2

        self.sum_tree = SumSegmentTree(self.tree_capacity)
        self.min_tree = MinSegmentTree(self.tree_capacity)

        self.action_size = action_size
        self.memory = []
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
Example #6
    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha >= 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
Example #7
    def __init__(self, learner_config, env_config, session_config):
        """
        Create Prioritized Replay buffer.
        :param size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        :param alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)
        """
        super(PrioritizedReplayBuffer,
              self).__init__(learner_config=learner_config,
                             env_config=env_config,
                             session_config=session_config)

        self._alpha = self.replay_config.alpha
        assert self._alpha > 0

        self._memory = []
        self.memory_size = self.replay_config.memory_size
        it_capacity = 1
        while it_capacity < self.memory_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
Example #8
    def __init__(self, size, alpha=0.6):
        """Create Prioritized Replay buffer.
        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)
        See Also
        --------
        ReplayBuffer.__init__
        """
        #super(PrioritizedReplayBuffer, self).__init__(size)

        self._storage = []
        self._maxsize = size
        self._next_idx = 0

        assert alpha >= 0
        self._alpha = alpha

        self.it_capacity = 1
        # Use double the soft capacity of the PER for the segment trees to allow for
        # any overflow over the soft capacity limit before samples are removed.
        while self.it_capacity < size * 2:
            self.it_capacity *= 2

        self._it_sum = SumSegmentTree(self.it_capacity)
        self._it_min = MinSegmentTree(self.it_capacity)
        self._max_priority = 1.0
Example #9
 def __init__(self, capacity, gamma=0.99, n_steps=2, alpha=0.5):
     super(PriorityBuffer, self).__init__(capacity, gamma, n_steps)
     self.buffer = []
     self.position = 0
     self.alpha = alpha
     it_cap = 1
     while it_cap < capacity:
         it_cap *= 2
     self._it_sum = SumSegmentTree(it_cap)
     self._it_min = MinSegmentTree(it_cap)
     self._max_priority = 1.0
Example #10
 def __init__(self, action_size, buffer_size, batch_size, seed, alpha=0.6):
     super(PrioritizedReplayBuffer, self).__init__(action_size, buffer_size, batch_size, seed)
     
     #capacity must be positive and a power of 2
     tree_capacity = 1
     while tree_capacity < self.buffer_size:
         tree_capacity *= 2
     self.sum_tree = SumSegmentTree(tree_capacity)
     self.min_tree = MinSegmentTree(tree_capacity)
     self.max_priority, self.tree_ptr = 1.0, 0
     self.alpha = alpha
Example #11
File: memory.py Project: DataScenic/DQN
    def __init__(self, size, alpha):
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha >= 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
Example #12
    def __init__(self, size, alpha):
        super().__init__(size, N_Step_Transition)
        assert alpha >= 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
Example #13
    def __init__(self, obs_space, action_space, capacity, exponent, device, optimize_memory_usage=False):
        super().__init__(obs_space, action_space, capacity, device,
                         optimize_memory_usage=optimize_memory_usage)
        assert exponent >= 0
        self.exponent = exponent

        it_capacity = 1
        while it_capacity < self.capacity:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
Example #14
    def __init__(self, size, alpha):
        super(ProportionalReplay, self).__init__(size)
        assert alpha >= 0
        self.alpha = alpha

        self.tree_size = 1
        while self.tree_size < self.maxsize:
            self.tree_size *= 2

        self.min_tree = MinSegmentTree(
            self.tree_size)  # for calculating maximum IS weight
        self.sum_tree = SumSegmentTree(
            self.tree_size)  # for proportional sampling
        self.max_priority = 1.0  # maximum priority we've seen so far. will be updated
Example #15
    def __init__(self, replay_size, alpha=0.6):
        self.replay_size = replay_size
        self.cnt = 0
        self._alpha = alpha
        it_capacity = 1
        while it_capacity < replay_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
        self._storage = []
        self._maxsize = replay_size
        self._next_idx = 0
Example #16
    def __init__(self, buffer_size, input_dim, batch_size, alpha):

        super(PrioritizedReplayBuffer, self).__init__(buffer_size, input_dim,
                                                      batch_size)

        # For PER. Parameter settings.
        self.max_priority, self.tree_ptr = 1.0, 0
        self.alpha = alpha

        tree_capacity = 1
        while tree_capacity < self.buffer_size:
            tree_capacity *= 2

        self.sum_tree = SumSegmentTree(tree_capacity)
        self.min_tree = MinSegmentTree(tree_capacity)
Example #17
    def __init__(self, env, name, s_size, a_size, trainer, model_path, global_episodes, lock):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_losses = []
        self.episode_mean_values = []
        self.lock = lock
        it_capacity = 1
        while it_capacity < max_memory:
            it_capacity *= 2
        self._it_sum = [SumSegmentTree(it_capacity)]
        self._it_min = [MinSegmentTree(it_capacity)]
        self.pre_t_m_loss = 1e5
        self.unpermit = True

        # self.replaymemory = ReplayMemory(max_memory)
        global worker_num
        # self.local_AC = AC_Network(sess, s_size, a_size, self.name, None)
        self.local_AC = AC_Network(sess, s_size, a_size, self.name, self.trainer, self._it_sum, self._it_min)
        worker_num += 1
        self.update_local_ops = update_target_graph(self.local_AC.target_scope, self.name)
        self.update_to_global_ops = update_target_graph(self.name, "worker_" + str(num_workers))

        self.update_ops = [[update_target_graph('worker_' + str(i), 'worker_' + str(j)) for j in range(num_workers + 1)]
                           for i in range(num_workers + 1)]
        self.update_part_ops = [
            [update_target_graph_part('worker_' + str(i), 'worker_' + str(j)) for j in range(num_workers + 1)] for i in
            range(num_workers + 1)]
        self.env = env
Example #18
    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha > 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
Example #19
    def __init__(self, size, alpha):
        """
        Prioritized Experience Replay
      
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha > 0
        self._alpha = alpha

        # round the capacity up to the next power of two, as the segment trees require
        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
Example #20
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha >= 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        idx = self._next_idx
        super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        p_total = self._it_sum.sum(0, len(self._storage) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha

            self._max_priority = max(self._max_priority, priority)
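A hypothetical usage sketch for the class above. The base ReplayBuffer is not shown here, so the add() arguments are an assumption (the OpenAI baselines base stores (obs, action, reward, next_obs, done)); the small epsilon keeps every priority strictly positive, as update_priorities asserts:

import numpy as np

buffer = PrioritizedReplayBuffer(size=100000, alpha=0.6)

obs, next_obs = np.zeros(4), np.ones(4)
buffer.add(obs, 0, 1.0, next_obs, False)   # assumed base-class signature

# sample returns the encoded batch followed by importance weights and indices
*batch, weights, idxes = buffer.sample(batch_size=32, beta=0.4)

td_errors = np.zeros(len(idxes))           # placeholder for real TD errors
buffer.update_priorities(idxes, np.abs(td_errors) + 1e-6)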
Example #21
 def __init__(self,
              state_shape,
              action_shape,
              size,
              alpha=0.6,
              beta=0.4,
              beta_delta=0.001,
              epsilon=0.01):
     self.memory = self.Memory(state_shape, action_shape, size)
     self.counter = 0
     self.size = self.memory.size
     # Segment trees
     self.sum_tree = SumSegmentTree(self.size)
     self.min_tree = MinSegmentTree(self.size)
     # P.E.R. hyperparameters
     self.alpha = alpha
     self.beta = beta
     self.beta_delta = beta_delta
     self.epsilon = epsilon
     self.max_priority = 1.0
Example #22
    def __init__(self,
                 size,
                 state_shape,
                 alpha,
                 n_batch_trajectories,
                 n_trajectory_steps,
                 n_emus=1):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        dsize: int
            Max number of demonstration transitions. These are retained in the 
            buffer permanently.
            https://arxiv.org/abs/1704.03732
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size,
                                                      state_shape,
                                                      n_batch_trajectories,
                                                      n_trajectory_steps,
                                                      n_emus=n_emus)
        assert alpha > 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < self._size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
Example #23
    def __init__(self,
                 obs_dim=obs_dim,
                 size=size,
                 batch_size=batch_size,
                 alpha=alpha,
                 n_step=n_step,
                 gamma=gamma):
        """Initialization."""
        assert alpha >= 0

        super(PrioritizedReplayBuffer,
              self).__init__(obs_dim, size, batch_size, n_step, gamma)
        self.max_priority, self.tree_ptr = 1.0, 0
        self.alpha = alpha

        # capacity must be positive and a power of 2.
        tree_capacity = 1
        while tree_capacity < self.max_size:
            tree_capacity *= 2

        self.sum_tree = SumSegmentTree(tree_capacity)
        self.min_tree = MinSegmentTree(tree_capacity)
Example #24
    def __init__(self, replay_size, alpha=0.6):
        self.replay_size = replay_size
        self.cnt = 0
        self._alpha = alpha
        it_capacity = 1
        while it_capacity < replay_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
        self._storage = []
        self._maxsize = replay_size
        self._next_idx = 0
Example #25
    def __init__(self, size, alpha=0.6, beta_start=0.4, beta_frames=100000):
        super(PrioritizedReplayMemory, self).__init__()
        self._storage = []
        self._maxsize = size
        self._next_idx = 0

        assert alpha >= 0
        self._alpha = alpha

        self.beta_start = beta_start
        self.beta_frames = beta_frames
        self.frame = 1

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
Example #26
class PrioritizedReplayBuffer(ReplayBuffer):
    """Prioritized Replay buffer.
    
    Attributes:
        max_priority (float): max priority
        tree_ptr (int): next index of tree
        alpha (float): alpha parameter for prioritized replay buffer
        sum_tree (SumSegmentTree): sum tree for prior
        min_tree (MinSegmentTree): min tree for min prior to get max weight
        
    """
    def __init__(
        self,
        obs_dim: int,
        size: int,
        batch_size: int = 32,
        alpha: float = 0.6,
        n_step: int = 1,
        gamma: float = 0.99,
    ):
        """Initialization."""
        assert alpha >= 0

        super(PrioritizedReplayBuffer,
              self).__init__(obs_dim, size, batch_size, n_step, gamma)
        self.max_priority, self.tree_ptr = 1.0, 0
        self.alpha = alpha

        # capacity must be positive and a power of 2.
        tree_capacity = 1
        while tree_capacity < self.max_size:
            tree_capacity *= 2

        self.sum_tree = SumSegmentTree(tree_capacity)
        self.min_tree = MinSegmentTree(tree_capacity)

    def store(
        self,
        obs: np.ndarray,
        act: int,
        rew: float,
        next_obs: np.ndarray,
        done: bool,
    ) -> Tuple[np.ndarray, np.ndarray, float, np.ndarray, bool]:
        """Store experience and priority."""
        transition = super().store(obs, act, rew, next_obs, done)

        if transition:
            self.sum_tree[self.tree_ptr] = self.max_priority**self.alpha
            self.min_tree[self.tree_ptr] = self.max_priority**self.alpha
            self.tree_ptr = (self.tree_ptr + 1) % self.max_size

        return transition

    def sample_batch(self, beta: float = 0.4) -> Dict[str, np.ndarray]:
        """Sample a batch of experiences."""
        assert len(self) >= self.batch_size
        assert beta > 0

        indices = self._sample_proportional()

        obs = self.obs_buf[indices]
        next_obs = self.next_obs_buf[indices]
        acts = self.acts_buf[indices]
        rews = self.rews_buf[indices]
        done = self.done_buf[indices]
        weights = np.array([self._calculate_weight(i, beta) for i in indices])

        return dict(
            obs=obs,
            next_obs=next_obs,
            acts=acts,
            rews=rews,
            done=done,
            weights=weights,
            indices=indices,
        )

    def update_priorities(self, indices: List[int], priorities: np.ndarray):
        """Update priorities of sampled transitions."""
        assert len(indices) == len(priorities)

        for idx, priority in zip(indices, priorities):
            assert priority > 0
            assert 0 <= idx < len(self)

            self.sum_tree[idx] = priority**self.alpha
            self.min_tree[idx] = priority**self.alpha

            self.max_priority = max(self.max_priority, priority)

    def _sample_proportional(self) -> List[int]:
        """Sample indices based on proportions."""
        indices = []
        p_total = self.sum_tree.sum(0, len(self) - 1)
        segment = p_total / self.batch_size

        for i in range(self.batch_size):
            a = segment * i
            b = segment * (i + 1)
            upperbound = random.uniform(a, b)
            idx = self.sum_tree.retrieve(upperbound)
            indices.append(idx)

        return indices

    def _calculate_weight(self, idx: int, beta: float):
        """Calculate the weight of the experience at idx."""
        # get max weight
        p_min = self.min_tree.min() / self.sum_tree.sum()
        max_weight = (p_min * len(self))**(-beta)

        # calculate weights
        p_sample = self.sum_tree[idx] / self.sum_tree.sum()
        weight = (p_sample * len(self))**(-beta)
        weight = weight / max_weight

        return weight
Example #27
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        """
        Prioritized Experience Replay
      
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha > 0
        self._alpha = alpha

        # round the capacity up to the next power of two, as the segment trees require
        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        idx = self._idx
        super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0, len(self._buffer) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._buffer))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._buffer))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """
        set priority of transition at index idxes[i] in buffer to priorities[i]
        """

        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._buffer)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha

            self._max_priority = max(self._max_priority, priority)
Example #28
class ReplayMemory:
    def __init__(self, replay_size, alpha=0.6):
        self.replay_size = replay_size
        self.cnt = 0
        self._alpha = alpha
        it_capacity = 1
        while it_capacity < replay_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
        self._storage = []
        self._maxsize = replay_size
        self._next_idx = 0

    def add(self, data):
        # store the new transition at the write cursor and give it the current
        # maximum priority so that it is sampled at least once
        idx = self._next_idx
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize
        self._it_sum[idx] = self._max_priority ** self._alpha
        self._it_min[idx] = self._max_priority ** self._alpha


    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta=0.4):
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage)) ** (-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage)) ** (-beta)
            weights.append(weight / max_weight)
        #print self._it_min.min(), weights
        weights = np.array(weights)
        weights /= np.sum(weights)
        ret = []
        for i in range(batch_size):
            ret.append(self._storage[idxes[i]])
        return (ret, idxes, weights)

    def update_priorities(self, idxes, priorities):
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            #print priority
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority ** self._alpha
            self._it_min[idx] = priority ** self._alpha

            self._max_priority = max(self._max_priority, priority)
Example #29
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha >= 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """See ReplayBuffer.store_effect"""
        idx = self._next_idx
        super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        p_total = self._it_sum.sum(0, len(self._storage) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        """Sample a batch of experiences.

        compared to ReplayBuffer.sample
        it also returns importance weights and idxes
        of sampled experiences.


        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32
            denoting importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32
            indexes in buffer of sampled experiences
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        sets priority of transition at index idxes[i] in buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled idxes denoted by
            variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha

            self._max_priority = max(self._max_priority, priority)
Example #30
class ProportionalReplay(ExperienceReplay):
    def __init__(self, size, alpha):
        super(ProportionalReplay, self).__init__(size)
        assert alpha >= 0
        self.alpha = alpha

        self.tree_size = 1
        while self.tree_size < self.maxsize:
            self.tree_size *= 2

        self.min_tree = MinSegmentTree(
            self.tree_size)  # for calculating maximum IS weight
        self.sum_tree = SumSegmentTree(
            self.tree_size)  # for proportional sampling
        self.max_priority = 1.0  # maximum priority we've seen so far. will be updated

    def add(self, experience):
        idx = self.next_idx  # save idx before it's changed in super call
        super().add(
            experience)  # put experience data (s,a,r,s',done) in buffer

        # give new experience max priority to ensure it's replayed at least once
        self.min_tree[idx] = self.max_priority**self.alpha
        self.sum_tree[idx] = self.max_priority**self.alpha

    # To sample a minibatch of size k, the range [0, p_total] is divided equally into k ranges.
    # Next, a value is uniformly sampled from each range.
    def sample_proportional(self, batch_size):
        idxs = []
        p_total = self.sum_tree.sum(
            0,
            len(self.buffer) -
            1)  # sum of the priorities of all experience in the buffer
        every_range_len = p_total / batch_size  # length of every range over [0,p_total] (batch_size = k)
        for i in range(batch_size):  # for each range
            mass = self.np_random.uniform(
            ) * every_range_len + i * every_range_len  # uniformly sampling a probability mass from this range
            idx = self.sum_tree.find_prefixsum_idx(
                mass
            )  # get smallest experience index s.t. cumulative dist F(idx) >= mass
            idxs.append(idx)
        return idxs

    # sample batch of experiences along with their weights and indices
    def sample(self, batch_size, beta):
        assert beta > 0
        idxs = self.sample_proportional(
            batch_size)  # sampled experience indices

        weights = []
        p_min = self.min_tree.min() / self.sum_tree.sum(
        )  # minimum possible priority for a transition
        max_weight = (p_min * len(self.buffer))**(
            -beta)  # (p_uniform/p_min)^beta is maximum possible IS weight

        # get IS weights for sampled experience
        for idx in idxs:
            p_sample = self.sum_tree[idx] / self.sum_tree.sum(
            )  # normalize sampled priority
            weight = (p_sample * len(self.buffer))**(
                -beta)  # (p_uniform/p_sample)^beta. IS weight
            weights.append(
                weight / max_weight
            )  # weights normalized by max so that they only scale the update downwards
        weights = np.array(weights)

        encoded_sample = self.encode_samples(
            idxs)  # collect experience at given indices
        return tuple(list(encoded_sample) + [weights, idxs])

    # set the priorities of experiences at given indices
    def update_priorities(self, idxs, priorities):
        assert len(idxs) == len(priorities)
        for idx, priority in zip(idxs, priorities):
            assert priority > 0
            assert 0 <= idx < len(self.buffer)
            self.sum_tree[idx] = priority**self.alpha
            self.min_tree[idx] = priority**self.alpha

            self.max_priority = max(self.max_priority, priority)
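The comments in sample_proportional above describe the stratified scheme from the PER paper: [0, p_total] is split into batch_size equal segments and one mass value is drawn per segment. The same idea can be sketched without a segment tree by searching a cumulative sum (O(n) per batch instead of O(log n); names are illustrative):

import numpy as np

def stratified_proportional_sample(priorities, batch_size, alpha, rng=np.random):
    # one index per equal-width segment of the total priority mass
    scaled = np.asarray(priorities, dtype=np.float64) ** alpha
    cumulative = np.cumsum(scaled)
    segment = cumulative[-1] / batch_size
    indices = []
    for i in range(batch_size):
        mass = rng.uniform(segment * i, segment * (i + 1))
        # smallest index whose cumulative priority reaches the sampled mass
        indices.append(int(np.searchsorted(cumulative, mass)))
    return indices

print(stratified_proportional_sample([2.0, 1.0, 0.5, 0.5], batch_size=2, alpha=0.6))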
Example #31
class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""
    def __init__(self, action_size, buffer_size, batch_size, alpha):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            alpha (float): alpha PER value 
        """
        self.max_priority = 1.0
        self.alpha = alpha

        # capacity must be positive and a power of 2.
        self.tree_capacity = 1
        while self.tree_capacity < buffer_size:
            self.tree_capacity *= 2

        self.sum_tree = SumSegmentTree(self.tree_capacity)
        self.min_tree = MinSegmentTree(self.tree_capacity)

        self.action_size = action_size
        self.memory = []
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, t, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        e = self.experience(state, action, reward, next_state, done)

        idx = t % self.tree_capacity
        if t >= self.tree_capacity:
            self.memory[idx] = e
        else:
            self.memory.append(e)

        # insert experience index in priority tree
        self.sum_tree[idx] = self.max_priority**self.alpha
        self.min_tree[idx] = self.max_priority**self.alpha

    def sample(self, beta):
        """Sampling a batch of relevant experiences from memory."""
        indices = self.relevant_sample_indx()

        idxs = np.vstack(indices).astype(int)
        states = torch.from_numpy(
            np.vstack([self.memory[i].state
                       for i in indices])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([self.memory[i].action
                       for i in indices])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([self.memory[i].reward
                       for i in indices])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([self.memory[i].next_state
                       for i in indices])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([self.memory[i].done
                       for i in indices]).astype(np.uint8)).float().to(device)
        weights = torch.from_numpy(
            np.array([self.isw(i, beta) for i in indices])).float().to(device)

        return (idxs, states, actions, rewards, next_states, dones, weights)

    def relevant_sample_indx(self):
        """Selecting most informative sample indices."""
        indices = []
        p_total = self.sum_tree.sum(0, len(self) - 1)
        segment = p_total / self.batch_size

        for i in range(self.batch_size):
            a = segment * i
            b = segment * (i + 1)
            upperbound = random.uniform(a, b)
            idx = self.sum_tree.retrieve(upperbound)
            indices.append(idx)

        return indices

    def update_priorities(self, indices, priorities):
        """Update priorities of sampled transitions."""
        assert indices.shape[0] == priorities.shape[0]

        for idx, priority in zip(indices.flatten(), priorities.flatten()):
            assert priority > 0
            assert 0 <= idx < len(self)

            self.sum_tree[idx] = priority**self.alpha
            self.min_tree[idx] = priority**self.alpha

            self.max_priority = max(self.max_priority, priority)

    def isw(self, idx, beta):
        """Compute Importance Sample Weight."""
        # get max weight
        p_min = self.min_tree.min() / self.sum_tree.sum()
        max_weight = (p_min * len(self))**(-beta)

        # calculate weights
        p_sample = self.sum_tree[idx] / self.sum_tree.sum()
        weight = (p_sample * len(self))**(-beta)
        is_weight = weight / max_weight

        return is_weight

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)
Example #32
class PrioritizedReplayBuffer(ReplayBuffer):
    """Fixed-size prioritized buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed, alpha=0.6, beta=0.5, device="cpu"):
        """Initialize a PrioritizedReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
            alpha (float): how much prioritization is used (0 - no prioritization, 1 - full prioritization)
            beta (float): To what degree to use importance weights (0 - no corrections, 1 - full correction)
        """
        super(PrioritizedReplayBuffer, self).__init__(action_size, buffer_size, batch_size, seed, device=device)

        self.alpha = alpha
        self.beta = beta
        self._eps = 0.00000001

        it_capacity = 1
        while it_capacity < buffer_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        idx = self._next_idx
        super().add(state, action, reward, next_state, done)

        self._it_sum[idx] = self._max_priority ** self.alpha
        self._it_min[idx] = self._max_priority ** self.alpha

    def _sample_proportional(self):
        res = []
        p_total = self._it_sum.sum(0, len(self.memory) - 1)
        every_range_len = p_total / self.batch_size
        for i in range(self.batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self):
        idxes = self._sample_proportional()

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self.memory) + self._eps) ** (-self.beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self.memory) + self._eps) ** (-self.beta)
            weights.append(weight / max_weight)

        weights = torch.tensor(weights, device=self.device, dtype=torch.float)

        states = torch.from_numpy(np.vstack([self.memory[i].state for i in idxes])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([self.memory[i].action for i in idxes])).long().to(self.device)
        rewards = torch.from_numpy(np.vstack([self.memory[i].reward for i in idxes])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([self.memory[i].next_state for i in idxes])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([self.memory[i].done for i in idxes]).astype(np.uint8)).float().to(self.device)

        return (states, actions, rewards, next_states, dones, idxes, weights)

    def update_priorities(self, indexes, priorities):
        """Update priorities of sampled transitions.
        sets priority of transition at index indexes[i] in buffer
        to priorities[i].
        Parameters
        ----------
        indexes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled idxes denoted by
            variable `idxes`.
        """
        for idx, priority in zip(indexes, priorities):
            self._it_sum[idx] = priority ** self.alpha
            self._it_min[idx] = priority ** self.alpha

            self._max_priority = max(self._max_priority, priority)
Example #33
class ReplayMemory:
    def __init__(self, replay_size, alpha=0.6):
        self.replay_size = replay_size
        self.cnt = 0
        self._alpha = alpha
        it_capacity = 1
        while it_capacity < replay_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
        self._storage = []
        self._maxsize = replay_size
        self._next_idx = 0

    def add(self, data):
        # store the new transition at the write cursor and give it the current
        # maximum priority so that it is sampled at least once
        idx = self._next_idx
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0,
                                                      len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta=0.4):
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        #print self._it_min.min(), weights
        weights = np.array(weights)
        weights /= np.sum(weights)
        ret = []
        for i in range(batch_size):
            ret.append(self._storage[idxes[i]])
        return (ret, idxes, weights)

    def update_priorities(self, idxes, priorities):
        assert len(idxes) == len(priorities)
        #print priorities, np.sum(priorities)
        for idx, priority in zip(idxes, priorities):
            #print priority
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha

            self._max_priority = max(self._max_priority, priority)
Example #34
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha > 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """See ReplayBuffer.store_effect"""
        idx = self._next_idx
        super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority ** self._alpha
        self._it_min[idx] = self._max_priority ** self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            # TODO(szymon): should we ensure no repeats?
            mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        """Sample a batch of experiences.

        compared to ReplayBuffer.sample
        it also returns importance weights and idxes
        of sampled experiences.


        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32
            denoting importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32
            indexes in buffer of sampled experiences
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage)) ** (-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage)) ** (-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        sets priority of transition at index idxes[i] in buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled idxes denoted by
            variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority ** self._alpha
            self._it_min[idx] = priority ** self._alpha

            self._max_priority = max(self._max_priority, priority)