Exemplo n.º 1
    # NOTE(review): the parameter list of this constructor is cut off in this
    # excerpt (the next line ends with an open parenthesis). The body reads
    # capacity, priority_fraction and four reward-related settings, so those
    # are presumably parameters -- confirm against the full source.
    def __init__(self,
        # prioritized replay memory: transitions are kept in a plain list
        self._storage = []
        self.capacity = capacity
        self._next_idx = 0

        # priority_fraction must be non-negative; it is stored as self._alpha
        assert priority_fraction >= 0
        self._alpha = priority_fraction

        # round the capacity up to the next power of two; both segment trees
        # below are created with that size
        it_capacity = 1
        while it_capacity < capacity:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
        # reward-related settings are stored unchanged on the instance
        self.discount_gamma_game_reward = discount_gamma_game_reward
        self.discount_gamma_graph_reward = discount_gamma_graph_reward
        self.discount_gamma_count_reward = discount_gamma_count_reward
        self.accumulate_reward_from_final = accumulate_reward_from_final
Exemplo n.º 2
    def __init__(self, memory_size=1000000, alpha=0.5, seed=None):
        """Prioritized replay buffer from https://arxiv.org/pdf/1511.05952.pdf

        This implementation is based on the OpenAI sum-tree implementation.

        Parameters
        ----------
        memory_size: int
            maximum number of experiences to store
        alpha: float, [0.0, 1.0]
            hyperparameter that controls the amount of prioritization, with
            0.0 being no prioritization (the uniform case)
        seed: None or int
            random seed for the replay buffer
        """
        super().__init__(memory_size=memory_size, seed=seed)
        self.alpha = alpha

        # Round the buffer size up to the next power of two; both segment
        # trees are created with that capacity.
        it_capacity = 1
        while it_capacity < self._memory_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
Exemplo n.º 3
    def __init__(self, action_size, buffer_size, batch_size, seed, alpha=0.6, beta=0.5, device="cpu"):
        """Initialize a PrioritizedReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
            alpha (float): how much prioritization is used (0 - no prioritization, 1 - full prioritization)
            beta (float): To what degree to use importance weights (0 - no corrections, 1 - full correction)
            device (str): forwarded unchanged to the parent initializer
        """
        super(PrioritizedReplayBuffer, self).__init__(action_size, buffer_size, batch_size, seed, device=device)

        self.alpha = alpha
        self.beta = beta
        self._eps = 0.00000001

        # Round the buffer size up to the next power of two; both segment
        # trees are created with that capacity.
        it_capacity = 1
        while it_capacity < buffer_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
Exemplo n.º 4
    def __init__(
        self,
        obs_dim: list,
        size: int,
        device: str,
        batch_size: int = 32,
        alpha: float = 0.6,
        n_step: int = 1,
        gamma: float = 0.99,
    ):
        """Initialize the parent buffer and the priority segment trees.

        Args:
            obs_dim: forwarded to the parent initializer.
            size: forwarded to the parent initializer.
            device: forwarded to the parent initializer.
            batch_size: forwarded to the parent initializer.
            alpha: non-negative prioritization exponent kept on the instance.
            n_step: forwarded to the parent initializer.
            gamma: forwarded to the parent initializer.

        Raises:
            AssertionError: if ``alpha`` is negative.
        """
        assert alpha >= 0

        super().__init__(obs_dim, size, device, batch_size, n_step, gamma)
        self.max_priority, self.tree_ptr = 1.0, 0
        self.alpha = alpha

        # capacity must be positive and a power of 2.
        # NOTE(review): self.max_size is presumably set by the parent
        # initializer -- confirm.
        tree_capacity = 1
        while tree_capacity < self.max_size:
            tree_capacity *= 2

        self.sum_tree = SumSegmentTree(tree_capacity)
        self.min_tree = MinSegmentTree(tree_capacity)
Exemplo n.º 5
    def __init__(self, action_size, buffer_size, batch_size, alpha):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            alpha (float): alpha PER value
        """
        self.max_priority = 1.0
        self.alpha = alpha

        # capacity must be positive and a power of 2.
        self.tree_capacity = 1
        while self.tree_capacity < buffer_size:
            self.tree_capacity *= 2

        self.sum_tree = SumSegmentTree(self.tree_capacity)
        self.min_tree = MinSegmentTree(self.tree_capacity)

        self.action_size = action_size
        self.memory = []
        self.batch_size = batch_size
        # namedtuple requires a type name as its first argument; without it
        # the call raises TypeError.
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha >= 0
        self._alpha = alpha

        # Round the size up to the next power of two; both segment trees are
        # created with that capacity.
        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
Exemplo n.º 7
    def __init__(self, learner_config, env_config, session_config):
        """Create Prioritized Replay buffer.

        The settings are read from ``self.replay_config``:

        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization); must be > 0
        memory_size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        """
        # NOTE(review): self.replay_config is read here but nothing in this
        # method assigns it, and the three config arguments are unused --
        # presumably a parent-initializer call is missing; confirm.
        self._alpha = self.replay_config.alpha
        assert self._alpha > 0

        self._memory = []
        self.memory_size = self.replay_config.memory_size
        # Round the memory size up to the next power of two; both segment
        # trees are created with that capacity.
        it_capacity = 1
        while it_capacity < self.memory_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
Exemplo n.º 8
    def __init__(self, size, alpha=0.6):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)
        """
        # The parent initializer call is disabled, so the storage fields are
        # set up here directly.
        self._storage = []
        self._maxsize = size
        self._next_idx = 0

        assert alpha >= 0
        self._alpha = alpha

        # We use double the soft capacity of the PER for the segment trees to
        # allow for any overflow over the soft capacity limit before samples
        # are removed.
        self.it_capacity = 1
        while self.it_capacity < size * 2:
            self.it_capacity *= 2

        self._it_sum = SumSegmentTree(self.it_capacity)
        self._it_min = MinSegmentTree(self.it_capacity)
        self._max_priority = 1.0
Exemplo n.º 9
 def __init__(self, capacity, gamma=0.99, n_steps=2, alpha=0.5):
     """Initialise the parent buffer, then the priority bookkeeping.

     capacity, gamma and n_steps go to the parent initializer; alpha is
     kept on the instance. Both segment trees get the smallest power of
     two that is not below capacity.
     """
     super(PriorityBuffer, self).__init__(capacity, gamma, n_steps)
     self.buffer, self.position = [], 0
     self.alpha = alpha

     tree_slots = 1
     while tree_slots < capacity:
         tree_slots <<= 1

     self._it_sum = SumSegmentTree(tree_slots)
     self._it_min = MinSegmentTree(tree_slots)
     self._max_priority = 1.0
 def __init__(self, action_size, buffer_size, batch_size, seed, alpha=0.6):
     """Initialise the parent buffer, then the priority segment trees.

     All arguments except alpha are handed to the parent initializer;
     alpha is stored on the instance.
     """
     super(PrioritizedReplayBuffer, self).__init__(
         action_size, buffer_size, batch_size, seed
     )

     # capacity must be positive and a power of 2
     slots = 1
     while slots < self.buffer_size:
         slots <<= 1

     self.sum_tree = SumSegmentTree(slots)
     self.min_tree = MinSegmentTree(slots)
     self.max_priority = 1.0
     self.tree_ptr = 0
     self.alpha = alpha
Exemplo n.º 11
    def __init__(self, size, alpha):
        """Set up the parent buffer and the priority bookkeeping.

        size is handed to the parent initializer; alpha must be
        non-negative and is stored as ``self._alpha``. Both segment trees
        get the smallest power of two that is not below size.
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha >= 0
        self._alpha = alpha

        tree_slots = 1
        while tree_slots < size:
            tree_slots <<= 1

        self._it_sum = SumSegmentTree(tree_slots)
        self._it_min = MinSegmentTree(tree_slots)
        self._max_priority = 1.0
Exemplo n.º 12
    def __init__(self, size, alpha):
        """Set up the parent buffer and the priority bookkeeping.

        size and N_Step_Transition are handed to the parent initializer;
        alpha must be non-negative and is stored as ``self._alpha``. Both
        segment trees get the smallest power of two not below size.
        """
        super().__init__(size, N_Step_Transition)
        assert alpha >= 0
        self._alpha = alpha

        tree_slots = 1
        while tree_slots < size:
            tree_slots <<= 1

        self._it_sum = SumSegmentTree(tree_slots)
        self._it_min = MinSegmentTree(tree_slots)
        self._max_priority = 1.0
Exemplo n.º 13
    def __init__(self, obs_space, action_space, capacity, exponent, device, optimize_memory_usage=False):
        """Initialize the parent buffer and the priority segment trees.

        obs_space, action_space, capacity, device and optimize_memory_usage
        are handed to the parent initializer. exponent must be non-negative
        and is stored on the instance.
        """
        # NOTE(review): the tail of this call was missing from the source
        # (it ended at "device,"). optimize_memory_usage is the only argument
        # not used elsewhere, so it is forwarded here -- confirm the parent's
        # parameter name.
        super().__init__(obs_space, action_space, capacity, device,
                         optimize_memory_usage=optimize_memory_usage)
        assert exponent >= 0
        self.exponent = exponent

        # Round the capacity up to the next power of two; both segment trees
        # are created with that size.
        it_capacity = 1
        while it_capacity < self.capacity:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
Exemplo n.º 14
    def __init__(self, size, alpha):
        """Set up the parent replay and the two priority trees.

        size is handed to the parent initializer; alpha must be
        non-negative and is stored on the instance.
        """
        super(ProportionalReplay, self).__init__(size)
        assert alpha >= 0
        self.alpha = alpha

        # smallest power of two that is not below self.maxsize
        self.tree_size = 1
        while self.tree_size < self.maxsize:
            self.tree_size <<= 1

        # min tree: used for calculating the maximum IS weight
        self.min_tree = MinSegmentTree(self.tree_size)
        # sum tree: used for proportional sampling
        self.sum_tree = SumSegmentTree(self.tree_size)
        # largest priority seen so far; updated later
        self.max_priority = 1.0
Exemplo n.º 15
    def __init__(self, replay_size, alpha=0.6):
        """Create an empty memory with room for replay_size entries.

        alpha is stored as ``self._alpha``. Both segment trees get the
        smallest power of two that is not below replay_size.
        """
        self.replay_size = replay_size
        self.cnt = 0
        self._alpha = alpha

        tree_slots = 1
        while tree_slots < replay_size:
            tree_slots <<= 1

        self._it_sum = SumSegmentTree(tree_slots)
        self._it_min = MinSegmentTree(tree_slots)
        self._max_priority = 1.0

        # ring-buffer state
        self._storage = []
        self._maxsize = replay_size
        self._next_idx = 0
Exemplo n.º 16
    def __init__(self, buffer_size, input_dim, batch_size, alpha):
        """Initialize the parent buffer and the priority segment trees.

        buffer_size, input_dim and batch_size are handed to the parent
        initializer; alpha is stored on the instance.
        """
        # NOTE(review): the tail of this call was missing from the source
        # (it ended at "input_dim,"). batch_size is the only argument not
        # used elsewhere, so it is forwarded here -- confirm.
        super(PrioritizedReplayBuffer, self).__init__(buffer_size, input_dim,
                                                      batch_size)

        # For PER. Parameter settings.
        self.max_priority, self.tree_ptr = 1.0, 0
        self.alpha = alpha

        # Round the buffer size up to the next power of two; both segment
        # trees are created with that capacity.
        tree_capacity = 1
        while tree_capacity < self.buffer_size:
            tree_capacity *= 2

        self.sum_tree = SumSegmentTree(tree_capacity)
        self.min_tree = MinSegmentTree(tree_capacity)
    def __init__(self, env, name, s_size, a_size, trainer, model_path, global_episodes, lock):
        """Set up one worker: bookkeeping lists, priority trees, network and copy ops.

        Besides its arguments this method reads the names ``max_memory``,
        ``sess`` and ``num_workers`` and increments the global ``worker_num``;
        none of them is defined in this block, so they presumably live at
        module level -- TODO confirm.
        """
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        # per-episode statistics, filled in elsewhere
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_losses = []
        self.episode_mean_values = []
        self.lock = lock
        # round max_memory up to the next power of two for the segment trees
        it_capacity = 1
        while it_capacity < max_memory:
            it_capacity *= 2
        # each tree is wrapped in a one-element list before being handed to
        # AC_Network below
        self._it_sum = [SumSegmentTree(it_capacity)]
        self._it_min = [MinSegmentTree(it_capacity)]
        self.pre_t_m_loss = 1e5
        self.unpermit = True

        # self.replaymemory = ReplayMemory(max_memory)
        global worker_num
        # self.local_AC = AC_Network(sess, s_size, a_size, self.name, None)
        self.local_AC = AC_Network(sess, s_size, a_size, self.name, self.trainer, self._it_sum, self._it_min)
        worker_num += 1
        # NOTE(review): update_target_graph(src, dst) presumably builds ops
        # copying variables from scope src to scope dst -- confirm.
        self.update_local_ops = update_target_graph(self.local_AC.target_scope, self.name)
        self.update_to_global_ops = update_target_graph(self.name, "worker_" + str(num_workers))

        # (num_workers + 1) x (num_workers + 1) tables of ops, one entry for
        # every ordered pair of "worker_<i>" / "worker_<j>" names
        self.update_ops = [[update_target_graph('worker_' + str(i), 'worker_' + str(j)) for j in range(num_workers + 1)]
                           for i in range(num_workers + 1)]
        self.update_part_ops = [
            [update_target_graph_part('worker_' + str(i), 'worker_' + str(j)) for j in range(num_workers + 1)] for i in
            range(num_workers + 1)]
        self.env = env
Exemplo n.º 18
    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization); must be > 0

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha > 0
        self._alpha = alpha

        # Round the size up to the next power of two; both segment trees are
        # created with that capacity.
        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
Exemplo n.º 19
    def __init__(self, size, alpha):
        """Prioritized Experience Replay.

        size is handed to the parent initializer; alpha must be strictly
        positive and is stored as ``self._alpha``.
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha > 0
        self._alpha = alpha

        # Round the size up to the next power of two; both segment trees are
        # created with that capacity.
        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
Exemplo n.º 20
class PrioritizedReplayBuffer(ReplayBuffer):
    """Replay buffer that samples transitions in proportion to their priority.

    Priorities raised to ``alpha`` are kept in a sum tree (for sampling) and
    a min tree (for normalising the importance weights).
    """

    def __init__(self, size, alpha):
        """Initialize the parent buffer and the two segment trees.

        size is handed to the parent initializer; alpha must be
        non-negative and is stored as ``self._alpha``.
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha >= 0
        self._alpha = alpha

        # Round the size up to the next power of two; both segment trees are
        # created with that capacity.
        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """Store a transition and give it the largest priority seen so far."""
        # Remember the slot before the parent call advances _next_idx.
        idx = self._next_idx
        super().add(*args, **kwargs)
        initial = self._max_priority**self._alpha
        self._it_sum[idx] = initial
        self._it_min[idx] = initial

    def _sample_proportional(self, batch_size):
        """Return batch_size indices, one drawn from each equal slice of the total priority mass."""
        res = []
        p_total = self._it_sum.sum(0, len(self._storage) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            # Collect the sampled index; without this the method always
            # returned an empty list.
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        """Sample a batch together with importance weights and indices.

        Returns the items of ``self._encode_sample(idxes)`` followed by the
        weights array and the list of sampled indices. beta must be > 0.
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        n_stored = len(self._storage)
        total = self._it_sum.sum()
        p_min = self._it_min.min() / total
        max_weight = (p_min * n_stored)**(-beta)

        weights = []
        for idx in idxes:
            p_sample = self._it_sum[idx] / total
            weight = (p_sample * n_stored)**(-beta)
            # Dividing by the largest possible weight keeps every weight <= 1.
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """Set the priority of the transition at idxes[i] to priorities[i].

        Every priority must be > 0 and every index must refer to a stored
        transition.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            scaled = priority**self._alpha
            self._it_sum[idx] = scaled
            self._it_min[idx] = scaled

            self._max_priority = max(self._max_priority, priority)
Exemplo n.º 21
 # NOTE(review): the parameter list of this constructor is cut off in this
 # excerpt. The body reads state_shape, action_shape, size, alpha, beta,
 # beta_delta and epsilon, so those are presumably parameters -- confirm.
 def __init__(self,
     self.memory = self.Memory(state_shape, action_shape, size)
     self.counter = 0
     self.size = self.memory.size
     # Segment trees
     # NOTE(review): the trees are created with self.size as-is; other
     # constructors in this file first round the size up to a power of two --
     # verify what SumSegmentTree / MinSegmentTree require.
     self.sum_tree = SumSegmentTree(self.size)
     self.min_tree = MinSegmentTree(self.size)
     # P.E.R. hyperparameters
     self.alpha = alpha
     self.beta = beta
     self.beta_delta = beta_delta
     self.epsilon = epsilon
     self.max_priority = 1.0
Exemplo n.º 22
    # NOTE(review): this constructor is damaged in this excerpt: the parameter
    # list is cut off after "self,", the docstring below is never closed, and
    # the parent __init__ call ends with an open parenthesis. The docstring
    # names size, dsize and alpha, which are presumably the parameters --
    # restore the missing lines from the full source before use.
    def __init__(self,
        """Create Prioritized Replay buffer.

        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        dsize: int
            Max number of demonstration transitions. These are retained in the 
            buffer permanently.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        super(PrioritizedReplayBuffer, self).__init__(size,
        assert alpha > 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < self._size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
Exemplo n.º 23
    # NOTE(review): this constructor is damaged in this excerpt: the parameter
    # list is cut off after "self," and the parent call below has lost its
    # "super(...," head. The body reads obs_dim, size, batch_size, n_step,
    # gamma and alpha, which are presumably the parameters -- confirm.
    def __init__(self,
        assert alpha >= 0

              self).__init__(obs_dim, size, batch_size, n_step, gamma)
        self.max_priority, self.tree_ptr = 1.0, 0
        self.alpha = alpha

        # capacity must be positive and a power of 2.
        tree_capacity = 1
        while tree_capacity < self.max_size:
            tree_capacity *= 2

        self.sum_tree = SumSegmentTree(tree_capacity)
        self.min_tree = MinSegmentTree(tree_capacity)
Exemplo n.º 24
    def __init__(self, replay_size, alpha=0.6):
        """Create an empty memory with room for replay_size entries.

        alpha is stored as ``self._alpha``. Both segment trees get the
        smallest power of two that is not below replay_size.
        """
        self.replay_size = replay_size
        self.cnt = 0
        self._alpha = alpha

        tree_slots = 1
        while tree_slots < replay_size:
            tree_slots <<= 1

        self._it_sum = SumSegmentTree(tree_slots)
        self._it_min = MinSegmentTree(tree_slots)
        self._max_priority = 1.0

        # ring-buffer state
        self._storage = []
        self._maxsize = replay_size
        self._next_idx = 0
Exemplo n.º 25
    def __init__(self, size, alpha=0.6, beta_start=0.4, beta_frames=100000):
        """Create an empty prioritized replay memory.

        size: int
            stored as ``self._maxsize``; the segment trees get the smallest
            power of two that is not below it.
        alpha: float
            non-negative prioritization exponent.
        beta_start, beta_frames:
            stored unchanged on the instance.
        """
        super(PrioritizedReplayMemory, self).__init__()
        self._storage = []
        self._maxsize = size
        self._next_idx = 0

        assert alpha >= 0
        self._alpha = alpha

        self.beta_start = beta_start
        self.beta_frames = beta_frames
        self.frame = 1

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
        # namedtuple requires a type name as its first argument; without it
        # the call raises TypeError.
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
Exemplo n.º 26
class PrioritizedReplayBuffer(ReplayBuffer):
    """Prioritized Replay buffer.

    Attributes:
        max_priority (float): max priority
        tree_ptr (int): next index of tree
        alpha (float): alpha parameter for prioritized replay buffer
        sum_tree (SumSegmentTree): sum tree for prior
        min_tree (MinSegmentTree): min tree for min prior to get max weight
    """

    def __init__(
        self,
        obs_dim: int,
        size: int,
        batch_size: int = 32,
        alpha: float = 0.6,
        n_step: int = 1,
        gamma: float = 0.99,
    ):
        """Initialize the parent buffer and the two segment trees.

        All arguments except alpha are handed to the parent initializer;
        alpha must be non-negative.
        """
        assert alpha >= 0

        super(PrioritizedReplayBuffer, self).__init__(
            obs_dim, size, batch_size, n_step, gamma
        )
        self.max_priority, self.tree_ptr = 1.0, 0
        self.alpha = alpha

        # capacity must be positive and a power of 2.
        tree_capacity = 1
        while tree_capacity < self.max_size:
            tree_capacity *= 2

        self.sum_tree = SumSegmentTree(tree_capacity)
        self.min_tree = MinSegmentTree(tree_capacity)

    def store(
        self,
        obs: np.ndarray,
        act: int,
        rew: float,
        next_obs: np.ndarray,
        done: bool,
    ) -> Tuple[np.ndarray, np.ndarray, float, np.ndarray, bool]:
        """Store experience and priority."""
        transition = super().store(obs, act, rew, next_obs, done)

        # A priority is only written when the parent returned a truthy value.
        if transition:
            initial = self.max_priority**self.alpha
            self.sum_tree[self.tree_ptr] = initial
            self.min_tree[self.tree_ptr] = initial
            self.tree_ptr = (self.tree_ptr + 1) % self.max_size

        return transition

    def sample_batch(self, beta: float = 0.4) -> Dict[str, np.ndarray]:
        """Sample a batch of experiences."""
        assert len(self) >= self.batch_size
        assert beta > 0

        indices = self._sample_proportional()

        obs = self.obs_buf[indices]
        next_obs = self.next_obs_buf[indices]
        acts = self.acts_buf[indices]
        rews = self.rews_buf[indices]
        done = self.done_buf[indices]
        weights = np.array([self._calculate_weight(i, beta) for i in indices])

        # NOTE(review): the arguments of this dict(...) call were missing
        # from the source; the keys below mirror the local names -- confirm
        # against the callers.
        return dict(
            obs=obs,
            next_obs=next_obs,
            acts=acts,
            rews=rews,
            done=done,
            weights=weights,
            indices=indices,
        )

    def update_priorities(self, indices: List[int], priorities: np.ndarray):
        """Update priorities of sampled transitions."""
        assert len(indices) == len(priorities)

        for idx, priority in zip(indices, priorities):
            assert priority > 0
            assert 0 <= idx < len(self)

            scaled = priority**self.alpha
            self.sum_tree[idx] = scaled
            self.min_tree[idx] = scaled

            self.max_priority = max(self.max_priority, priority)

    def _sample_proportional(self) -> List[int]:
        """Sample indices based on proportions."""
        indices = []
        p_total = self.sum_tree.sum(0, len(self) - 1)
        segment = p_total / self.batch_size

        for i in range(self.batch_size):
            a = segment * i
            b = segment * (i + 1)
            upperbound = random.uniform(a, b)
            idx = self.sum_tree.retrieve(upperbound)
            # Collect the sampled index; without this the method always
            # returned an empty list.
            indices.append(idx)

        return indices

    def _calculate_weight(self, idx: int, beta: float):
        """Calculate the weight of the experience at idx."""
        total = self.sum_tree.sum()

        # get max weight
        p_min = self.min_tree.min() / total
        max_weight = (p_min * len(self))**(-beta)

        # calculate weights
        p_sample = self.sum_tree[idx] / total
        weight = (p_sample * len(self))**(-beta)
        weight = weight / max_weight

        return weight
Exemplo n.º 27
class PrioritizedReplayBuffer(ReplayBuffer):
    """Replay buffer that samples transitions in proportion to their priority."""

    def __init__(self, size, alpha):
        """Prioritized Experience Replay.

        size is handed to the parent initializer; alpha must be strictly
        positive and is stored as ``self._alpha``.
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha > 0
        self._alpha = alpha

        # Round the size up to the next power of two. The two segment trees
        # hold one priority per buffer slot: the sum tree drives sampling,
        # the min tree normalises the importance weights.
        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """Store a transition and give it the largest priority seen so far."""
        # Read self._idx before the parent call, as the original did.
        idx = self._idx
        super().add(*args, **kwargs)
        initial = self._max_priority**self._alpha
        self._it_sum[idx] = initial
        self._it_min[idx] = initial

    def _sample_proportional(self, batch_size):
        """Return batch_size indices drawn in proportion to the stored priorities."""
        res = []
        # The total does not change while sampling, so compute it once.
        p_total = self._it_sum.sum(0, len(self._buffer) - 1)
        for _ in range(batch_size):
            mass = random.random() * p_total
            idx = self._it_sum.find_prefixsum_idx(mass)
            # Collect the sampled index; without this the method always
            # returned an empty list.
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        """Sample a batch together with importance weights and indices.

        Returns the items of ``self._encode_sample(idxes)`` followed by the
        weights array and the list of sampled indices. beta must be > 0.
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        n_stored = len(self._buffer)
        total = self._it_sum.sum()
        p_min = self._it_min.min() / total
        max_weight = (p_min * n_stored)**(-beta)

        weights = []
        for idx in idxes:
            p_sample = self._it_sum[idx] / total
            weight = (p_sample * n_stored)**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """Set priority of transition at index idxes[i] in buffer to priorities[i]."""
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._buffer)
            scaled = priority**self._alpha
            self._it_sum[idx] = scaled
            self._it_min[idx] = scaled

            self._max_priority = max(self._max_priority, priority)
Exemplo n.º 28
class ReplayMemory:
    """Ring buffer of transitions sampled in proportion to their priority."""

    def __init__(self, replay_size, alpha=0.6):
        """Create an empty memory with room for replay_size entries.

        alpha is the exponent applied to every priority before it is written
        to the segment trees.
        """
        self.replay_size = replay_size
        self.cnt = 0
        self._alpha = alpha
        # Round the size up to the next power of two; both segment trees are
        # created with that capacity.
        it_capacity = 1
        while it_capacity < replay_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
        self._storage = []
        self._maxsize = replay_size
        self._next_idx = 0

    def add(self, data):
        """Store data in the next slot and give it the largest priority seen so far."""
        # Take the slot index before advancing, so the priority lands on the
        # slot that was actually written.
        idx = self._next_idx
        if idx >= len(self._storage):
            # Still filling up: grow the list instead of indexing past its end.
            self._storage.append(data)
        else:
            self._storage[idx] = data
        self._next_idx = (idx + 1) % self._maxsize

        initial = self._max_priority ** self._alpha
        self._it_sum[idx] = initial
        self._it_min[idx] = initial

    def _sample_proportional(self, batch_size):
        """Return batch_size indices drawn in proportion to the stored priorities."""
        res = []
        # The total does not change while sampling, so compute it once.
        p_total = self._it_sum.sum(0, len(self._storage) - 1)
        for _ in range(batch_size):
            mass = random.random() * p_total
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta=0.4):
        """Sample batch_size entries.

        Returns ``(entries, idxes, weights)`` where weights are the
        importance weights rescaled to sum to 1. beta must be > 0.
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        n_stored = len(self._storage)
        total = self._it_sum.sum()
        p_min = self._it_min.min() / total
        max_weight = (p_min * n_stored) ** (-beta)

        weights = []
        for idx in idxes:
            p_sample = self._it_sum[idx] / total
            weight = (p_sample * n_stored) ** (-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        weights /= np.sum(weights)

        # NOTE(review): the body of the original collection loop was missing
        # from the source; gathering the stored entries at the sampled
        # indices is the presumed intent -- confirm.
        ret = [self._storage[idx] for idx in idxes]
        return (ret, idxes, weights)

    def update_priorities(self, idxes, priorities):
        """Set the priority of the entry at idxes[i] to priorities[i].

        Every priority must be > 0 and every index must refer to a stored
        entry.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            scaled = priority ** self._alpha
            self._it_sum[idx] = scaled
            self._it_min[idx] = scaled

            self._max_priority = max(self._max_priority, priority)
Exemplo n.º 29
class PrioritizedReplayBuffer(ReplayBuffer):
    """Replay buffer that samples transitions in proportion to their priority."""

    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha >= 0
        self._alpha = alpha

        # Round the size up to the next power of two; both segment trees are
        # created with that capacity.
        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """See ReplayBuffer.store_effect"""
        # Remember the slot before the parent call advances _next_idx.
        idx = self._next_idx
        super().add(*args, **kwargs)
        initial = self._max_priority**self._alpha
        self._it_sum[idx] = initial
        self._it_min[idx] = initial

    def _sample_proportional(self, batch_size):
        """Return batch_size indices, one drawn from each equal slice of the total priority mass."""
        res = []
        p_total = self._it_sum.sum(0, len(self._storage) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            # Collect the sampled index; without this the method always
            # returned an empty list.
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        """Sample a batch of experiences.

        compared to ReplayBuffer.sample
        it also returns importance weights and idxes
        of sampled experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        The items of ``self._encode_sample(idxes)`` (documented upstream as
        obs_batch, act_batch, rew_batch, next_obs_batch and done_mask),
        followed by:

        weights: np.array
            Array of shape (batch_size,)
            denoting importance weight of each sampled transition
        idxes: list
            indexes in buffer of sampled experiences
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        n_stored = len(self._storage)
        total = self._it_sum.sum()
        p_min = self._it_min.min() / total
        max_weight = (p_min * n_stored)**(-beta)

        weights = []
        for idx in idxes:
            p_sample = self._it_sum[idx] / total
            weight = (p_sample * n_stored)**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        sets priority of transition at index idxes[i] in buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled idxes denoted by
            variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            scaled = priority**self._alpha
            self._it_sum[idx] = scaled
            self._it_min[idx] = scaled

            self._max_priority = max(self._max_priority, priority)
Exemplo n.º 30
class ProportionalReplay(ExperienceReplay):
    """Experience replay that samples in proportion to stored priorities."""

    def __init__(self, size, alpha):
        """Initialize the parent replay and the two segment trees.

        size is handed to the parent initializer; alpha must be
        non-negative and is stored on the instance.
        """
        super(ProportionalReplay, self).__init__(size)
        assert alpha >= 0
        self.alpha = alpha

        # smallest power of two that is not below self.maxsize
        self.tree_size = 1
        while self.tree_size < self.maxsize:
            self.tree_size *= 2

        # for calculating maximum IS weight
        self.min_tree = MinSegmentTree(self.tree_size)
        # for proportional sampling
        self.sum_tree = SumSegmentTree(self.tree_size)
        # maximum priority we've seen so far. will be updated
        self.max_priority = 1.0

    def add(self, experience):
        """Store an experience and give it the largest priority seen so far."""
        idx = self.next_idx  # save idx before it's changed in super call
        # put experience data (s,a,r,s',done) in buffer
        super(ProportionalReplay, self).add(experience)

        # give new experience max priority to ensure it's replayed at least once
        initial = self.max_priority**self.alpha
        self.min_tree[idx] = initial
        self.sum_tree[idx] = initial

    # To sample a minibatch of size k, the range [0, p_total] is divided equally into k ranges.
    # Next, a value is uniformly sampled from each range.
    def sample_proportional(self, batch_size):
        """Return batch_size indices, one drawn from each equal slice of the total priority mass."""
        idxs = []
        # sum of the priorities of all experience in the buffer
        p_total = self.sum_tree.sum(0, len(self.buffer) - 1)
        # length of every range over [0, p_total] (batch_size = k)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            # uniformly sample a probability mass from the i-th range
            mass = self.np_random.uniform() * every_range_len + i * every_range_len
            # find the experience index for that prefix-sum mass
            idx = self.sum_tree.find_prefixsum_idx(mass)
            idxs.append(idx)
        return idxs

    # sample batch of experiences along with their weights and indices
    def sample(self, batch_size, beta):
        """Sample a batch together with importance weights and indices.

        Returns the items of ``self.encode_samples(idxs)`` followed by the
        weights array and the list of sampled indices. beta must be > 0.
        """
        assert beta > 0
        idxs = self.sample_proportional(batch_size)  # sampled experience indices

        n_stored = len(self.buffer)
        total = self.sum_tree.sum()
        # minimum possible priority for a transition
        p_min = self.min_tree.min() / total
        # (p_uniform/p_min)^beta is maximum possible IS weight
        max_weight = (p_min * n_stored)**(-beta)

        # get IS weights for sampled experience
        weights = []
        for idx in idxs:
            p_sample = self.sum_tree[idx] / total  # normalize sampled priority
            weight = (p_sample * n_stored)**(-beta)  # (p_uniform/p_sample)^beta. IS weight
            # weights normalized by max so that they only scale the update downwards
            weights.append(weight / max_weight)
        weights = np.array(weights)

        # collect experience at given indices
        encoded_sample = self.encode_samples(idxs)
        return tuple(list(encoded_sample) + [weights, idxs])

    # set the priorities of experiences at given indices
    def update_priorities(self, idxs, priorities):
        """Set the priority of the experience at idxs[i] to priorities[i]."""
        assert len(idxs) == len(priorities)
        for idx, priority in zip(idxs, priorities):
            assert priority > 0
            assert 0 <= idx < len(self.buffer)
            scaled = priority**self.alpha
            self.sum_tree[idx] = scaled
            self.min_tree[idx] = scaled

            self.max_priority = max(self.max_priority, priority)
Exemplo n.º 31
class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, alpha):
        """Initialize a ReplayBuffer object.

        Args:
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            alpha (float): alpha PER value (exponent applied to priorities)
        """
        self.max_priority = 1.0
        self.alpha = alpha

        # capacity must be positive and a power of 2.
        self.tree_capacity = 1
        while self.tree_capacity < buffer_size:
            self.tree_capacity *= 2

        self.sum_tree = SumSegmentTree(self.tree_capacity)
        self.min_tree = MinSegmentTree(self.tree_capacity)

        self.action_size = action_size
        self.memory = []
        self.batch_size = batch_size
        # namedtuple requires a type name as its first argument.
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, t, state, action, reward, next_state, done):
        """Add a new experience to memory.

        Args:
            t: step counter; the slot used is ``t % self.tree_capacity``.
               Assumes ``t`` increases by one per call starting at 0, so
               slots are filled in order before being overwritten.
            state, action, reward, next_state, done: fields of the experience.
        """
        e = self.experience(state, action, reward, next_state, done)

        idx = t % self.tree_capacity
        if idx < len(self.memory):
            # Slot already exists: overwrite the oldest experience.
            self.memory[idx] = e
        else:
            # Buffer is still growing: assigning by index would raise.
            self.memory.append(e)

        # insert experience index in priority tree
        priority = self.max_priority**self.alpha
        self.sum_tree[idx] = priority
        self.min_tree[idx] = priority

    def sample(self, beta):
        """Sample a batch of experiences from memory.

        Args:
            beta: importance-sampling exponent forwarded to ``isw``.

        Returns:
            Tuple ``(idxs, states, actions, rewards, next_states, dones,
            weights)``; ``idxs`` is an integer NumPy array, the rest are
            torch tensors moved to the module-level ``device``.
        """
        indices = self.relevant_sample_indx()
        batch = [self.memory[i] for i in indices]

        # np.int was removed from NumPy; the builtin int is its equivalent.
        idxs = np.vstack(indices).astype(int)
        # NOTE(review): the per-field extraction was reconstructed from
        # truncated source lines; confirm the field names against the
        # original. `device` is a module-level name defined outside this class.
        states = torch.from_numpy(
            np.vstack([e.state for e in batch])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in batch])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in batch])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in batch])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(device)
        weights = torch.from_numpy(
            np.array([self.isw(i, beta) for i in indices])).float().to(device)

        return (idxs, states, actions, rewards, next_states, dones, weights)

    def relevant_sample_indx(self):
        """Draw ``self.batch_size`` indices, one per equal slice of priority mass."""
        indices = []
        p_total = self.sum_tree.sum(0, len(self) - 1)
        segment = p_total / self.batch_size

        for i in range(self.batch_size):
            # One uniform draw inside the i-th slice of the total mass.
            upperbound = random.uniform(segment * i, segment * (i + 1))
            indices.append(self.sum_tree.retrieve(upperbound))

        return indices

    def update_priorities(self, indices, priorities):
        """Update priorities of sampled transitions.

        Args:
            indices: array of buffer indices (flattened before use).
            priorities: array of new priorities with the same first
                dimension as ``indices``; each value must be > 0.
        """
        assert indices.shape[0] == priorities.shape[0]

        for idx, priority in zip(indices.flatten(), priorities.flatten()):
            assert priority > 0
            assert 0 <= idx < len(self)

            scaled = priority**self.alpha
            self.sum_tree[idx] = scaled
            self.min_tree[idx] = scaled

            self.max_priority = max(self.max_priority, priority)

    def isw(self, idx, beta):
        """Compute the importance-sampling weight of the experience at ``idx``.

        The weight is ``(N * p_sample) ** -beta`` divided by the largest
        possible weight (the one obtained from the minimum priority).
        """
        p_total = self.sum_tree.sum()
        n_stored = len(self)

        # get max weight
        p_min = self.min_tree.min() / p_total
        max_weight = (p_min * n_stored)**(-beta)

        # calculate weights
        p_sample = self.sum_tree[idx] / p_total
        weight = (p_sample * n_stored)**(-beta)

        return weight / max_weight

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)
Exemplo n.º 32
class PrioritizedReplayBuffer(ReplayBuffer):
    """Fixed-size prioritized buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed, alpha=0.6, beta=0.5, device="cpu"):
        """Initialize a PrioritizedReplayBuffer object.

        Args:
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
            alpha (float): how much prioritization is used (0 - no prioritization, 1 - full prioritization)
            beta (float): To what degree to use importance weights (0 - no corrections, 1 - full correction)
            device: forwarded unchanged to the base-class constructor
        """
        super().__init__(action_size, buffer_size, batch_size, seed, device=device)

        self.alpha = alpha
        self.beta = beta
        # Small constant added before exponentiation in `sample` so the
        # base of the negative power is never exactly zero.
        self._eps = 0.00000001

        # Round the tree capacity up to a power of two.
        it_capacity = 1
        while it_capacity < buffer_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory with the current maximum priority."""
        # Read the write position before delegating; the base class is
        # presumably what advances `_next_idx` — confirm against ReplayBuffer.
        idx = self._next_idx
        super().add(state, action, reward, next_state, done)

        priority = self._max_priority ** self.alpha
        self._it_sum[idx] = priority
        self._it_min[idx] = priority

    def _sample_proportional(self):
        """Return ``self.batch_size`` indices, one per equal slice of priority mass."""
        res = []
        p_total = self._it_sum.sum(0, len(self.memory) - 1)
        every_range_len = p_total / self.batch_size
        for i in range(self.batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self):
        """Sample a prioritized batch of experiences.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones, idxes,
            weights)``. ``idxes`` is the list of sampled buffer indices; the
            other elements are torch tensors placed on ``self.device``.
        """
        idxes = self._sample_proportional()

        # Constant for the whole batch, so read once.
        p_total = self._it_sum.sum()
        n_stored = len(self.memory)

        p_min = self._it_min.min() / p_total
        max_weight = (p_min * n_stored + self._eps) ** (-self.beta)

        weights = []
        for idx in idxes:
            p_sample = self._it_sum[idx] / p_total
            weight = (p_sample * n_stored + self._eps) ** (-self.beta)
            weights.append(weight / max_weight)

        weights = torch.tensor(weights, device=self.device, dtype=torch.float)

        batch = [self.memory[i] for i in idxes]
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).long().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(self.device)

        return (states, actions, rewards, next_states, dones, idxes, weights)

    def update_priorities(self, indexes, priorities):
        """Update priorities of sampled transitions.

        Sets priority of transition at index indexes[i] in buffer
        to priorities[i].

        Args:
            indexes: [int]
                List of idxes of sampled transitions
            priorities: [float]
                List of updated priorities corresponding to
                transitions at the sampled idxes denoted by
                variable `indexes`.
        """
        for idx, priority in zip(indexes, priorities):
            scaled = priority ** self.alpha
            self._it_sum[idx] = scaled
            self._it_min[idx] = scaled

            self._max_priority = max(self._max_priority, priority)
Exemplo n.º 33
class ReplayMemory:
    """Ring-buffer replay memory with proportional prioritized sampling."""

    def __init__(self, replay_size, alpha=0.6):
        """Create the memory.

        Args:
            replay_size: maximum number of items kept; older items are
                overwritten once this many have been added.
            alpha: exponent applied to priorities before they are stored
                in the trees.
        """
        self.replay_size = replay_size
        self.cnt = 0
        self._alpha = alpha

        # Round the tree capacity up to a power of two.
        it_capacity = 1
        while it_capacity < replay_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0
        self._storage = []
        self._maxsize = replay_size
        self._next_idx = 0

    def add(self, data):
        """Store ``data`` in the next slot and give it the maximum priority."""
        # Capture the slot being written BEFORE advancing the cursor, so the
        # priority lands on the same slot as the data.
        idx = self._next_idx
        if idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[idx] = data
        self._next_idx = (idx + 1) % self._maxsize

        priority = self._max_priority**self._alpha
        self._it_sum[idx] = priority
        self._it_min[idx] = priority

    def _sample_proportional(self, batch_size):
        """Return ``batch_size`` indices drawn proportionally to priority."""
        res = []
        p_total = self._it_sum.sum(0, len(self._storage) - 1)
        for _ in range(batch_size):
            mass = random.random() * p_total
            res.append(self._it_sum.find_prefixsum_idx(mass))
        return res

    def sample(self, batch_size, beta=0.4):
        """Sample ``batch_size`` stored items with importance weights.

        Args:
            batch_size: number of items to draw.
            beta: importance-sampling exponent; must be > 0.

        Returns:
            Tuple ``(ret, idxes, weights)``: the sampled items, their
            indices, and a NumPy array of weights normalized to sum to 1.
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        p_total = self._it_sum.sum()
        n_stored = len(self._storage)
        p_min = self._it_min.min() / p_total
        max_weight = (p_min * n_stored)**(-beta)

        weights = []
        for idx in idxes:
            p_sample = self._it_sum[idx] / p_total
            weight = (p_sample * n_stored)**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        weights /= np.sum(weights)

        # NOTE(review): the original loop body was missing from the source;
        # returning the stored item for each sampled index is the
        # reconstruction — confirm against the original.
        ret = [self._storage[i] for i in idxes]
        return (ret, idxes, weights)

    def update_priorities(self, idxes, priorities):
        """Set the priority of each item in ``idxes`` to the matching value.

        Args:
            idxes: indices of stored items.
            priorities: new priorities, one per index; each must be > 0.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            scaled = priority**self._alpha
            self._it_sum[idx] = scaled
            self._it_min[idx] = scaled

            self._max_priority = max(self._max_priority, priority)
Exemplo n.º 34
class PrioritizedReplayBuffer(ReplayBuffer):
    """Replay buffer that samples transitions in proportion to their priority."""

    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super().__init__(size)
        assert alpha > 0
        self._alpha = alpha

        # Round the tree capacity up to a power of two.
        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """See ReplayBuffer.store_effect"""
        # Read the write position before delegating; the base class is
        # presumably what advances `_next_idx` — confirm against ReplayBuffer.
        idx = self._next_idx
        super().add(*args, **kwargs)
        priority = self._max_priority ** self._alpha
        self._it_sum[idx] = priority
        self._it_min[idx] = priority

    def _sample_proportional(self, batch_size):
        """Return ``batch_size`` indices drawn proportionally to priority."""
        res = []
        p_total = self._it_sum.sum(0, len(self._storage) - 1)
        for _ in range(batch_size):
            # TODO(szymon): should we ensure no repeats?
            mass = random.random() * p_total
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        """Sample a batch of experiences.

        compared to ReplayBuffer.sample
        it also returns importance weights and idxes
        of sampled experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32
            denoting importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32
            idexes in buffer of sampled experiences
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        # Constant for the whole batch, so read once.
        p_total = self._it_sum.sum()
        n_stored = len(self._storage)

        p_min = self._it_min.min() / p_total
        max_weight = (p_min * n_stored) ** (-beta)

        weights = []
        for idx in idxes:
            p_sample = self._it_sum[idx] / p_total
            weight = (p_sample * n_stored) ** (-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        sets priority of transition at index idxes[i] in buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to
            transitions at the sampled idxes denoted by
            variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            scaled = priority ** self._alpha
            self._it_sum[idx] = scaled
            self._it_min[idx] = scaled

            self._max_priority = max(self._max_priority, priority)