class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha >= 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """See ReplayBuffer.store_effect"""
        idx = self._next_idx
        super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority ** self._alpha
        self._it_min[idx] = self._max_priority ** self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            # TODO(szymon): should we ensure no repeats?
            mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        """Sample a batch of experiences.

        Compared to ReplayBuffer.sample it also returns importance weights
        and idxes of the sampled experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in the end of
            an episode and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32 denoting the
            importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32 giving the indexes
            in the buffer of the sampled experiences
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage)) ** (-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage)) ** (-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        Sets the priority of the transition at index idxes[i] in the buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to transitions at the
            sampled idxes denoted by variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority ** self._alpha
            self._it_min[idx] = priority ** self._alpha

            self._max_priority = max(self._max_priority, priority)
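# A minimal sketch of how a buffer like the one above is typically driven from
# a training loop. `env`, `agent`, `total_steps`, and `warmup_steps` are
# hypothetical placeholders, not part of the class; only the buffer calls
# (add / sample / update_priorities) come from the methods defined above, and
# the 1e-6 priority offset is a common convention rather than a requirement.
import numpy as np

buffer = PrioritizedReplayBuffer(size=100_000, alpha=0.6)
PRIORITY_EPS = 1e-6  # keeps every transition sampleable even with zero TD error

obs = env.reset()
for step in range(total_steps):
    action = agent.act(obs)
    next_obs, reward, done, _ = env.step(action)
    buffer.add(obs, action, reward, next_obs, float(done))
    obs = env.reset() if done else next_obs

    if step >= warmup_steps:
        # Anneal beta from 0.4 toward 1.0 over training (a common schedule,
        # not something the buffer itself mandates).
        beta = min(1.0, 0.4 + step * (1.0 - 0.4) / total_steps)
        (obses, actions, rewards, next_obses, dones,
         weights, idxes) = buffer.sample(batch_size=32, beta=beta)
        td_errors = agent.train(obses, actions, rewards, next_obses, dones, weights)
        buffer.update_priorities(idxes, np.abs(td_errors) + PRIORITY_EPS)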
class ReplayMemory:
    def __init__(self, replay_size, alpha=0.6):
        self.replay_size = replay_size
        self.cnt = 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < replay_size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

        self._storage = []
        self._maxsize = replay_size
        self._next_idx = 0

    def add(self, data):
        # Remember the slot being written before advancing the write pointer,
        # so the new transition (not the next empty slot) gets max priority.
        idx = self._next_idx
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize

        self._it_sum[idx] = self._max_priority ** self._alpha
        self._it_min[idx] = self._max_priority ** self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta=0.4):
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage)) ** (-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage)) ** (-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        # This variant additionally normalizes the weights to sum to 1.
        weights /= np.sum(weights)

        ret = [self._storage[idxes[i]] for i in range(batch_size)]
        return (ret, idxes, weights)

    def update_priorities(self, idxes, priorities):
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority ** self._alpha
            self._it_min[idx] = priority ** self._alpha

            self._max_priority = max(self._max_priority, priority)
class PrioritizedReplayBuffer(SimpleReplayBuffer):
    def __init__(self, MAX_LEN, alpha: float = 0.6):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        MAX_LEN: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        SimpleReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(MAX_LEN)
        assert alpha >= 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < MAX_LEN:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, experience: Experience):
        """See SimpleReplayBuffer.store_effect"""
        idx = self._next_idx
        super().add(experience)
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size: int) -> List[int]:
        res = []
        p_total = self._it_sum.sum(0, len(self._storage) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(
        self, batch_size: int, beta: float = 0.4
    ) -> Tuple[List[Experience], np.ndarray, List[int]]:  # type: ignore
        """Sample a batch of experiences.

        Compared to SimpleReplayBuffer.sample it also returns importance
        weights and idxes of the sampled experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        experiences: List[Experience]
            batch of experiences
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32 denoting the
            importance weight of each sampled transition
        idxes: List[int]
            Indexes in the buffer of the sampled experiences
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return (encoded_sample, weights, idxes)

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        Sets the priority of the transition at index idxes[i] in the buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to transitions at the
            sampled idxes denoted by variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha

            self._max_priority = max(self._max_priority, priority)
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha >= 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """See ReplayBuffer.store_effect"""
        idx = self._next_idx
        super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        p_total = self._it_sum.sum(0, len(self._storage) - 1)
        every_range_len = p_total / batch_size
        for i in range(batch_size):
            mass = random.random() * every_range_len + i * every_range_len
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        """Sample a batch of experiences.

        Compared to ReplayBuffer.sample it also returns importance weights
        and idxes of the sampled experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        beta: float
            To what degree to use importance weights
            (0 - no corrections, 1 - full correction)

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in the end of
            an episode and 0 otherwise.
        weights: np.array
            Array of shape (batch_size,) and dtype np.float32 denoting the
            importance weight of each sampled transition
        idxes: np.array
            Array of shape (batch_size,) and dtype np.int32 giving the indexes
            in the buffer of the sampled experiences
        """
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        Sets the priority of the transition at index idxes[i] in the buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to transitions at the
            sampled idxes denoted by variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha

            self._max_priority = max(self._max_priority, priority)
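# The _sample_proportional above is the stratified variant: it splits the total
# priority mass into batch_size equal segments and draws one index per segment,
# which lowers the variance of a batch compared to the plain proportional
# sampling used in the earlier class. Below is a hedged sketch of a linear
# schedule for the `beta` argument passed to sample(); the 0.4 start value and
# annealing toward 1.0 follow common prioritized-experience-replay practice and
# are not required by the class.
def linear_beta_schedule(step, total_steps, beta_start=0.4, beta_end=1.0):
    """Linearly anneal the importance-sampling exponent beta over training."""
    fraction = min(1.0, step / float(total_steps))
    return beta_start + fraction * (beta_end - beta_start)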
class PrioritizedReplayMemory(object):
    def __init__(self, capacity=100000, priority_fraction=0.0,
                 discount_gamma_game_reward=1.0, discount_gamma_graph_reward=1.0,
                 discount_gamma_count_reward=1.0, accumulate_reward_from_final=False):
        # prioritized replay memory
        self._storage = []
        self.capacity = capacity
        self._next_idx = 0

        assert priority_fraction >= 0
        self._alpha = priority_fraction

        it_capacity = 1
        while it_capacity < capacity:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

        self.discount_gamma_game_reward = discount_gamma_game_reward
        self.discount_gamma_graph_reward = discount_gamma_graph_reward
        self.discount_gamma_count_reward = discount_gamma_count_reward
        self.accumulate_reward_from_final = accumulate_reward_from_final

    def __len__(self):
        return len(self._storage)

    @property
    def storage(self):
        """[(np.ndarray, float, float, np.ndarray, bool)]: content of the replay buffer"""
        return self._storage

    @property
    def buffer_size(self):
        """float: Max capacity of the buffer"""
        return self.capacity

    def can_sample(self, n_samples):
        """
        Check if n_samples samples can be sampled from the buffer.

        :param n_samples: (int)
        :return: (bool)
        """
        return len(self) >= n_samples

    def is_full(self):
        """
        Check whether the replay buffer is full or not.

        :return: (bool)
        """
        return len(self) == self.buffer_size

    def add(self, *args):
        """Add a new transition to the buffer."""
        idx = self._next_idx
        data = Transition(*args)

        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self.capacity

        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def get_next_final_pos(self, which_memory, head):
        i = head
        while True:
            if i >= len(self._storage):
                return None
            if self._storage[i].is_final:
                return i
            i += 1

    def _get_single_transition(self, idx, n):
        assert n > 0
        head = idx
        # if n is 1, then head can't be is_final
        if n == 1:
            if self._storage[head].is_final:
                return None
        # if n > 1, then all except tail can't be is_final
        else:
            if np.any([item.is_final for item in self._storage[head:head + n]]):
                return None

        next_final = self.get_next_final_pos(self._storage, head)
        if next_final is None:
            return None

        # all good
        obs = self._storage[head].observation_list
        candidate = self._storage[head].action_candidate_list
        chosen_indices = self._storage[head].chosen_indices
        graph_triplets = self._storage[head].graph_triplets

        next_obs = self._storage[head + n].observation_list
        next_candidate = self._storage[head + n].action_candidate_list
        next_graph_triplets = self._storage[head + n].graph_triplets

        tmp = next_final - head + 1 if self.accumulate_reward_from_final else n + 1

        rewards_up_to_next_final = [
            self.discount_gamma_game_reward**i * self._storage[head + i].reward
            for i in range(tmp)
        ]
        reward = torch.sum(torch.stack(rewards_up_to_next_final))

        graph_rewards_up_to_next_final = [
            self.discount_gamma_graph_reward**i * self._storage[head + i].graph_reward
            for i in range(tmp)
        ]
        graph_reward = torch.sum(torch.stack(graph_rewards_up_to_next_final))

        count_rewards_up_to_next_final = [
            self.discount_gamma_count_reward**i * self._storage[head + i].count_reward
            for i in range(tmp)
        ]
        count_reward = torch.sum(torch.stack(count_rewards_up_to_next_final))

        return (obs, candidate, chosen_indices, graph_triplets,
                reward + graph_reward + count_reward,
                next_obs, next_candidate, next_graph_triplets)

    def _encode_sample(self, idxes, ns):
        actual_indices, actual_ns = [], []
        obs, candidate, chosen_indices, graph_triplets, reward, next_obs, next_candidate, next_graph_triplets = [], [], [], [], [], [], [], []
        for i, n in zip(idxes, ns):
            t = self._get_single_transition(i, n)
            if t is None:
                continue
            actual_indices.append(i)
            actual_ns.append(n)
            obs.append(t[0])
            candidate.append(t[1])
            chosen_indices.append(t[2])
            graph_triplets.append(t[3])
            reward.append(t[4])
            next_obs.append(t[5])
            next_candidate.append(t[6])
            next_graph_triplets.append(t[7])

        if len(actual_indices) == 0:
            return None
        chosen_indices = np.array(chosen_indices)  # batch
        reward = torch.stack(reward, 0)  # batch
        actual_ns = np.array(actual_ns)

        return [obs, candidate, chosen_indices, graph_triplets, reward,
                next_obs, next_candidate, next_graph_triplets,
                actual_indices, actual_ns]

    def sample(self, batch_size, beta=0, multi_step=1):
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        # sample n
        ns = np.random.randint(1, multi_step + 1, size=batch_size)
        encoded_sample = self._encode_sample(idxes, ns)
        if encoded_sample is None:
            return None

        actual_indices = encoded_sample[-2]
        for idx in actual_indices:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)

        return encoded_sample + [weights]

    def _get_single_sequence_transition(self, idx, sample_history_length):
        assert sample_history_length > 0
        head = idx
        # if n is 1, then head can't be is_final
        if sample_history_length == 1:
            if self._storage[head].is_final:
                return None
        # if n > 1, then all except tail can't be is_final
        else:
            if np.any([item.is_final for item in
                       self._storage[head:head + sample_history_length]]):
                return None

        next_final = self.get_next_final_pos(self._storage, head)
        if next_final is None:
            return None

        # all good
        res = []
        for m in range(sample_history_length):
            obs = self._storage[head + m].observation_list
            candidate = self._storage[head + m].action_candidate_list
            chosen_indices = self._storage[head + m].chosen_indices
            graph_triplets = self._storage[head + m].graph_triplets

            next_obs = self._storage[head + m + 1].observation_list
            next_candidate = self._storage[head + m + 1].action_candidate_list
            next_graph_triplets = self._storage[head + m + 1].graph_triplets

            tmp = next_final - (head + m) + 1 if self.accumulate_reward_from_final else 1

            rewards_up_to_next_final = [
                self.discount_gamma_game_reward**i * self._storage[head + m + i].reward
                for i in range(tmp)
            ]
            reward = torch.sum(torch.stack(rewards_up_to_next_final))

            graph_rewards_up_to_next_final = [
                self.discount_gamma_graph_reward**i * self._storage[head + m + i].graph_reward
                for i in range(tmp)
            ]
            graph_reward = torch.sum(torch.stack(graph_rewards_up_to_next_final))

            count_rewards_up_to_next_final = [
                self.discount_gamma_count_reward**i * self._storage[head + m + i].count_reward
                for i in range(tmp)
            ]
            count_reward = torch.sum(torch.stack(count_rewards_up_to_next_final))

            res.append([obs, candidate, chosen_indices, graph_triplets,
                        reward + graph_reward + count_reward,
                        next_obs, next_candidate, next_graph_triplets])
        return res

    def _encode_sample_sequence(self, idxes, sample_history_length):
        assert sample_history_length > 0
        res = []
        for _ in range(sample_history_length):
            tmp = []
            for i in range(8):
                tmp.append([])
            res.append(tmp)

        actual_indices = []
        # obs, candidate, chosen_indices, graph_triplets, reward,
        # next_obs, next_candidate, next_graph_triplets
        for i in idxes:
            t = self._get_single_sequence_transition(i, sample_history_length)
            if t is None:
                continue
            actual_indices.append(i)
            for step in range(sample_history_length):
                t_s = t[step]
                res[step][0].append(t_s[0])
                res[step][1].append(t_s[1])
                res[step][2].append(t_s[2])
                res[step][3].append(t_s[3])
                res[step][4].append(t_s[4])
                res[step][5].append(t_s[5])
                res[step][6].append(t_s[6])
                res[step][7].append(t_s[7])

        if len(actual_indices) == 0:
            return None
        for i in range(sample_history_length):
            res[i][2] = np.array(res[i][2])  # batch
            res[i][4] = torch.stack(res[i][4], 0)  # batch

        return res + [actual_indices]

    def sample_sequence(self, batch_size, beta=0, sample_history_length=1):
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        res_weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        encoded_sample = self._encode_sample_sequence(idxes, sample_history_length)
        if encoded_sample is None:
            return None

        actual_indices = encoded_sample[-1]
        for _h in range(sample_history_length):
            tmp_weights = []
            for idx in actual_indices:
                p_sample = self._it_sum[idx + _h] / self._it_sum.sum()
                weight = (p_sample * len(self._storage))**(-beta)
                tmp_weights.append(weight / max_weight)
            tmp_weights = np.array(tmp_weights)
            res_weights.append(tmp_weights)

        return encoded_sample + [res_weights]

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def update_priorities(self, idxes, priorities):
        """
        Update priorities of sampled transitions.

        Sets the priority of the transition at index idxes[i] in the buffer
        to priorities[i].

        :param idxes: ([int]) List of idxes of sampled transitions
        :param priorities: ([float]) List of updated priorities corresponding to
            transitions at the sampled idxes denoted by variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha

            self._max_priority = max(self._max_priority, priority)

    def avg_rewards(self):
        if len(self._storage) == 0:
            return 0.0
        rewards = [self._storage[i].reward for i in range(len(self._storage))]
        return to_np(torch.mean(torch.stack(rewards)))
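# Hedged illustration of the reward folding done in _get_single_transition and
# _get_single_sequence_transition above: the game, graph, and count rewards from
# `head` onward are each collapsed into a single discounted sum. This is a
# plain-Python equivalent of the torch.stack / torch.sum expressions, using
# made-up inputs.
def discounted_sum(rewards, gamma):
    """Return sum_i gamma**i * rewards[i]."""
    return sum(gamma ** i * r for i, r in enumerate(rewards))

assert abs(discounted_sum([1.0, 1.0, 1.0], 0.9) - (1.0 + 0.9 + 0.81)) < 1e-9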
class PrioritizedReplayBuffer(ReplayBuffer): """Prioritized Replay buffer. Attributes: max_priority (float): max priority tree_ptr (int): next index of tree alpha (float): alpha parameter for prioritized replay buffer sum_tree (SumSegmentTree): sum tree for prior min_tree (MinSegmentTree): min tree for min prior to get max weight """ def __init__( self, obs_dim: int, size: int, batch_size: int = 32, alpha: float = 0.6, n_step: int = 1, gamma: float = 0.99, ): """Initialization.""" assert alpha >= 0 super(PrioritizedReplayBuffer, self).__init__(obs_dim, size, batch_size, n_step, gamma) self.max_priority, self.tree_ptr = 1.0, 0 self.alpha = alpha # capacity must be positive and a power of 2. tree_capacity = 1 while tree_capacity < self.max_size: tree_capacity *= 2 self.sum_tree = SumSegmentTree(tree_capacity) self.min_tree = MinSegmentTree(tree_capacity) def store( self, obs: np.ndarray, act: int, rew: float, next_obs: np.ndarray, done: bool, ) -> Tuple[np.ndarray, np.ndarray, float, np.ndarray, bool]: """Store experience and priority.""" transition = super().store(obs, act, rew, next_obs, done) if transition: self.sum_tree[self.tree_ptr] = self.max_priority**self.alpha self.min_tree[self.tree_ptr] = self.max_priority**self.alpha self.tree_ptr = (self.tree_ptr + 1) % self.max_size return transition def sample_batch(self, beta: float = 0.4) -> Dict[str, np.ndarray]: """Sample a batch of experiences.""" assert len(self) >= self.batch_size assert beta > 0 indices = self._sample_proportional() obs = self.obs_buf[indices] next_obs = self.next_obs_buf[indices] acts = self.acts_buf[indices] rews = self.rews_buf[indices] done = self.done_buf[indices] weights = np.array([self._calculate_weight(i, beta) for i in indices]) return dict( obs=obs, next_obs=next_obs, acts=acts, rews=rews, done=done, weights=weights, indices=indices, ) def update_priorities(self, indices: List[int], priorities: np.ndarray): """Update priorities of sampled transitions.""" assert len(indices) == len(priorities) for idx, priority in zip(indices, priorities): assert priority > 0 assert 0 <= idx < len(self) self.sum_tree[idx] = priority**self.alpha self.min_tree[idx] = priority**self.alpha self.max_priority = max(self.max_priority, priority) def _sample_proportional(self) -> List[int]: """Sample indices based on proportions.""" indices = [] p_total = self.sum_tree.sum(0, len(self) - 1) segment = p_total / self.batch_size for i in range(self.batch_size): a = segment * i b = segment * (i + 1) upperbound = random.uniform(a, b) idx = self.sum_tree.retrieve(upperbound) indices.append(idx) return indices def _calculate_weight(self, idx: int, beta: float): """Calculate the weight of the experience at idx.""" # get max weight p_min = self.min_tree.min() / self.sum_tree.sum() max_weight = (p_min * len(self))**(-beta) # calculate weights p_sample = self.sum_tree[idx] / self.sum_tree.sum() weight = (p_sample * len(self))**(-beta) weight = weight / max_weight return weight
class PrioritizedReplayMemory:
    def __init__(self, size, alpha=0.6, beta_start=0.4, beta_frames=100000):
        super(PrioritizedReplayMemory, self).__init__()
        self._storage = []
        self._maxsize = size
        self._next_idx = 0

        assert alpha >= 0
        self._alpha = alpha

        self.beta_start = beta_start
        self.beta_frames = beta_frames
        self.frame = 1

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def beta_by_frame(self, frame_idx):
        return min(
            1.0,
            self.beta_start + frame_idx * (1.0 - self.beta_start) / self.beta_frames)

    def push(self, state, action, reward, next_state, done):
        idx = self._next_idx

        if self._next_idx >= len(self._storage):
            self._storage.append(
                self.experience(state, action, reward, next_state, done))
        else:
            self._storage[self._next_idx] = self.experience(
                state, action, reward, next_state, done)
        self._next_idx = (self._next_idx + 1) % self._maxsize

        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _encode_sample(self, idxes):
        states = torch.from_numpy(
            np.vstack([self._storage[i].state for i in idxes])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([self._storage[i].action for i in idxes])).float().to(device)
        rewards = torch.from_numpy(
            np.vstack([self._storage[i].reward for i in idxes])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([self._storage[i].next_state for i in idxes])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([self._storage[i].done for i in idxes]).astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size):
        idxes = self._sample_proportional(batch_size)
        weights = []

        # find the smallest sampling prob:
        # p_min = smallest priority^alpha / sum of priorities^alpha
        p_min = self._it_min.min() / self._it_sum.sum()

        beta = self.beta_by_frame(self.frame)
        self.frame += 1

        # max_weight given to the smallest prob
        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = torch.tensor(weights, device=device, dtype=torch.float)
        encoded_sample = self._encode_sample(idxes)

        return encoded_sample, idxes, weights

    def update_priorities(self, idxes, priorities):
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = (priority + 1e-5)**self._alpha
            self._it_min[idx] = (priority + 1e-5)**self._alpha

            self._max_priority = max(self._max_priority, (priority + 1e-5))
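# Hedged usage sketch: unlike the variants above, this memory anneals beta
# internally via beta_by_frame, so sample() takes only a batch size. `device`
# is whatever torch device the surrounding code defines, and the transitions
# here are placeholder data.
memory = PrioritizedReplayMemory(size=10_000, alpha=0.6, beta_start=0.4, beta_frames=100_000)
state = np.zeros(4, dtype=np.float32)
for _ in range(64):
    memory.push(state, 0, 1.0, state, False)

(states, actions, rewards, next_states, dones), idxes, weights = memory.sample(batch_size=32)
td_errors = np.abs(np.random.randn(32))       # stand-in for per-sample TD errors
memory.update_priorities(idxes, td_errors)    # 1e-5 is added internally, so zeros are safe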
class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, size, alpha):
        """Create Prioritized Replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        alpha: float
            how much prioritization is used
            (0 - no prioritization, 1 - full prioritization)

        See Also
        --------
        ReplayBuffer.__init__
        """
        super(PrioritizedReplayBuffer, self).__init__(size)
        assert alpha >= 0
        self._alpha = alpha

        it_capacity = 1
        while it_capacity < size:
            it_capacity *= 2

        self._it_sum = SumSegmentTree(it_capacity)
        self._it_min = MinSegmentTree(it_capacity)
        self._max_priority = 1.0

    def add(self, *args, **kwargs):
        """See ReplayBuffer.store_effect"""
        idx = self._next_idx
        super().add(*args, **kwargs)
        self._it_sum[idx] = self._max_priority**self._alpha
        self._it_min[idx] = self._max_priority**self._alpha

    def _sample_proportional(self, batch_size):
        res = []
        for _ in range(batch_size):
            # TODO(szymon): should we ensure no repeats?
            mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1)
            idx = self._it_sum.find_prefixsum_idx(mass)
            res.append(idx)
        return res

    def sample(self, batch_size, beta):
        assert beta > 0

        idxes = self._sample_proportional(batch_size)

        weights = []
        p_min = self._it_min.min() / self._it_sum.sum()
        max_weight = (p_min * len(self._storage))**(-beta)

        for idx in idxes:
            p_sample = self._it_sum[idx] / self._it_sum.sum()
            weight = (p_sample * len(self._storage))**(-beta)
            weights.append(weight / max_weight)
        weights = np.array(weights)
        encoded_sample = self._encode_sample(idxes)
        return tuple(list(encoded_sample) + [weights, idxes])

    def update_priorities(self, idxes, priorities):
        """Update priorities of sampled transitions.

        Sets the priority of the transition at index idxes[i] in the buffer
        to priorities[i].

        Parameters
        ----------
        idxes: [int]
            List of idxes of sampled transitions
        priorities: [float]
            List of updated priorities corresponding to transitions at the
            sampled idxes denoted by variable `idxes`.
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0
            assert 0 <= idx < len(self._storage)
            self._it_sum[idx] = priority**self._alpha
            self._it_min[idx] = priority**self._alpha

            self._max_priority = max(self._max_priority, priority)
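# Every buffer in this section leans on SumSegmentTree / MinSegmentTree but
# none of them defines those trees. The sketch below is a minimal,
# self-contained version of the interface these classes call (__setitem__,
# __getitem__, sum/min over a range, find_prefixsum_idx). It is an
# illustration under the assumption of an inclusive `end` bound; the exact
# implementations the snippets above were written against may differ in
# boundary conventions and extra checks.
import operator


class SegmentTree:
    """Array-backed segment tree over `capacity` leaves (capacity a power of 2)."""

    def __init__(self, capacity, operation, neutral):
        assert capacity > 0 and capacity & (capacity - 1) == 0
        self._capacity = capacity
        self._operation = operation
        self._value = [neutral] * (2 * capacity)

    def _reduce(self, start, end, node, node_start, node_end):
        if start == node_start and end == node_end:
            return self._value[node]
        mid = (node_start + node_end) // 2
        if end <= mid:
            return self._reduce(start, end, 2 * node, node_start, mid)
        if start > mid:
            return self._reduce(start, end, 2 * node + 1, mid + 1, node_end)
        return self._operation(
            self._reduce(start, mid, 2 * node, node_start, mid),
            self._reduce(mid + 1, end, 2 * node + 1, mid + 1, node_end))

    def reduce(self, start=0, end=None):
        """Apply the operation over leaves start..end (end inclusive here)."""
        if end is None:
            end = self._capacity - 1
        return self._reduce(start, end, 1, 0, self._capacity - 1)

    def __setitem__(self, idx, val):
        idx += self._capacity              # position of the leaf
        self._value[idx] = val
        idx //= 2
        while idx >= 1:                    # update all ancestors
            self._value[idx] = self._operation(
                self._value[2 * idx], self._value[2 * idx + 1])
            idx //= 2

    def __getitem__(self, idx):
        return self._value[self._capacity + idx]


class SumSegmentTree(SegmentTree):
    def __init__(self, capacity):
        super().__init__(capacity, operator.add, 0.0)

    def sum(self, start=0, end=None):
        return self.reduce(start, end)

    def find_prefixsum_idx(self, prefixsum):
        """Smallest leaf index i such that the sum of leaves 0..i exceeds prefixsum."""
        idx = 1
        while idx < self._capacity:        # walk down from the root
            if self._value[2 * idx] > prefixsum:
                idx = 2 * idx
            else:
                prefixsum -= self._value[2 * idx]
                idx = 2 * idx + 1
        return idx - self._capacity


class MinSegmentTree(SegmentTree):
    def __init__(self, capacity):
        super().__init__(capacity, min, float("inf"))

    def min(self, start=0, end=None):
        return self.reduce(start, end)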
class PrioritizedReplayBuffer(ReplayBuffer): """ Adapt from https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/common/buffers.py """ def __init__(self, obs_space, action_space, capacity, exponent, device, optimize_memory_usage=False): super().__init__(obs_space, action_space, capacity, device, optimize_memory_usage=optimize_memory_usage) assert exponent >= 0 self.exponent = exponent it_capacity = 1 while it_capacity < self.capacity: it_capacity *= 2 self._it_sum = SumSegmentTree(it_capacity) self._it_min = MinSegmentTree(it_capacity) self._max_priority = 1.0 def _sample_proportional(self, batch_size): total = self._it_sum.sum(0, len(self) - 1) mass = np.random.random(size=batch_size) * total idx = self._it_sum.find_prefixsum_idx(mass) # replace idx == self.idx if self.full and self.optimize_memory_usage: while np.any(idx == self.idx): replace_mass = np.random.random(len(idx == self.idx)) * total replace_idx = self._it_sum.find_prefixsum_idx(replace_mass) idx[idx == self.idx] = replace_idx return idx def add(self, obs, action, reward, next_obs, done): idx = self.idx super().add(obs, action, reward, next_obs, done) self._it_sum[idx] = self._max_priority ** self.exponent self._it_min[idx] = self._max_priority ** self.exponent def sample(self, batch_size, beta=0): assert beta >= 0 idxes = self._sample_proportional(batch_size) p_min = self._it_min.min() / self._it_sum.sum() max_weight = (p_min * len(self)) ** (-beta) p_sample = self._it_sum[idxes] / self._it_sum.sum() weights = (p_sample * len(self)) ** (-beta) / max_weight obses, actions, rewards, next_obses, not_dones = self._sample(idxes) priority_kwargs = { 'weights': weights, 'idxes': idxes } return obses, actions, rewards, next_obses, not_dones, priority_kwargs def update_priorities(self, idxes, priorities): assert len(idxes) == len(priorities) assert np.min(priorities) > 0 assert np.min(idxes) >= 0 assert np.max(idxes) < len(self) self._it_sum[idxes] = priorities ** self.exponent self._it_min[idxes] = priorities ** self.exponent self._max_priority = max(self._max_priority, np.max(priorities))