class PriorityExperienceReplay:
    '''
    Almost copy from https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
    '''

    def __init__(self, max_size, window_size, input_shape):
        self.tree = SumTree(max_size)
        self._max_size = max_size
        self._window_size = window_size
        self._WIDTH = input_shape[0]
        self._HEIGHT = input_shape[1]
        self.e = 0.01  # small constant so no transition ever gets zero priority
        self.a = 0.6   # alpha: how strongly priorities skew the sampling

    def _getPriority(self, error):
        return (error + self.e) ** self.a

    def append(self, old_state, action, reward, new_state, is_terminal):
        for o_s, a, r, n_s, i_t in zip(old_state, action, reward, new_state,
                                       is_terminal):
            # 0.5 is the maximum error
            p = self._getPriority(0.5)
            self.tree.add(p, data=(o_s, a, r, n_s, i_t))

    def sample(self, batch_size, indexes=None):
        data_batch = []
        idx_batch = []
        p_batch = []
        segment = self.tree.total_and_count()[0] / batch_size

        for i in range(batch_size):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            data_batch.append(data)
            idx_batch.append(idx)
            p_batch.append(p)

        zipped = list(zip(*data_batch))
        zipped[0] = np.reshape(
            zipped[0], (-1, self._WIDTH, self._HEIGHT, self._window_size))
        zipped[3] = np.reshape(
            zipped[3], (-1, self._WIDTH, self._HEIGHT, self._window_size))

        sum_p, count = self.tree.total_and_count()
        return zipped, idx_batch, p_batch, sum_p, count

    def update(self, idx_list, error_list):
        for idx, error in zip(idx_list, error_list):
            p = self._getPriority(error)
            self.tree.update(idx, p)
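# The buffers in this file all build on a SumTree that is not included in these
# snippets. Below is a minimal sketch of the interface PriorityExperienceReplay
# relies on (add, get, update, total_and_count); the layout follows the common
# jaara/AI-blog style and is an assumption, not the repos' actual implementation.
import numpy as np


class SumTree:
    """Minimal sum-tree sketch: leaves hold priorities, internal nodes hold sums."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)        # internal nodes + leaves
        self.data = np.zeros(capacity, dtype=object)  # stored transitions
        self.write = 0                                # next leaf to overwrite
        self.count = 0                                # number of stored entries

    def _propagate(self, idx, change):
        parent = (idx - 1) // 2
        self.tree[parent] += change
        if parent != 0:
            self._propagate(parent, change)

    def _retrieve(self, idx, s):
        left, right = 2 * idx + 1, 2 * idx + 2
        if left >= len(self.tree):                    # reached a leaf
            return idx
        if s <= self.tree[left]:
            return self._retrieve(left, s)
        return self._retrieve(right, s - self.tree[left])

    def total_and_count(self):
        return self.tree[0], self.count

    def add(self, p, data):
        idx = self.write + self.capacity - 1
        self.data[self.write] = data
        self.update(idx, p)
        self.write = (self.write + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def update(self, idx, p):
        change = p - self.tree[idx]
        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        idx = self._retrieve(0, s)
        return idx, self.tree[idx], self.data[idx - self.capacity + 1]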
class Memory:  # stored as < s, a, s', r > in SumTree

    def __init__(self, capacity):
        self.tree = SumTree(capacity)
        self.capacity = capacity
        self.a = Config.A  # hyperparameter used to reintroduce randomness in the experience selection
        self.e = Config.E
        self.beta = Config.BETA
        self.beta_increment_per_sampling = Config.BETA_INCREMENT

    def _get_priority(self, error):
        return (error + self.e) ** self.a

    def add(self, error, *args):
        p = self._get_priority(error)
        self.tree.add(p, Transition(*args))

    def sample(self, n):
        batch = []
        idxs = []
        segment = self.tree.total() / n
        priorities = []

        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])

        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)

        # convert the priority list to an array so the division and power broadcast
        sampling_probabilities = np.array(priorities) / self.tree.total()
        is_weight = np.power(self.tree.n_entries * sampling_probabilities, -self.beta)
        is_weight /= is_weight.max()

        return batch, idxs, is_weight

    def update(self, idx, error):
        p = self._get_priority(error)
        self.tree.update(idx, p)
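# Worked example (values are assumptions, not from the source repos) of the math
# behind Memory._get_priority and Memory.sample: p_i = (|delta_i| + e) ** a,
# P(i) = p_i / sum_j p_j, w_i = (N * P(i)) ** -beta, normalized by max_i w_i.
import numpy as np

e, a, beta, N = 0.01, 0.6, 0.4, 4
abs_td_errors = np.array([2.0, 0.5, 0.1, 0.0])

priorities = (abs_td_errors + e) ** a     # per-transition priority
probs = priorities / priorities.sum()     # sampling distribution over the tree
is_weight = (N * probs) ** -beta          # importance-sampling correction
is_weight /= is_weight.max()              # normalize so weights only scale the loss down

print(probs.round(3), is_weight.round(3))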
class PEM(ExperienceMemory):

    def __init__(self, state_shape, buffer_size):
        ExperienceMemory.__init__(self, state_shape, buffer_size)
        self.epsilon = PerSettings.epsilon
        self.alpha = PerSettings.alpha
        self.priorityTree = SumTree(self.alpha, buffer_size)
        self.betaInit = PerSettings.beta_init
        self.betaFinal = PerSettings.beta_final
        self.betaFinalAt = PerSettings.beta_finalAt
        self.beta = tf.Variable(0.0, trainable=False, dtype=tf.float32, name="beta")
        self.betaHolder = tf.placeholder(dtype=tf.float32)
        self.betaUpdater = self.beta.assign(self.betaHolder)
        with tf.variable_scope('AgentEnvSteps', reuse=True):
            self.curStep = tf.get_variable(name='agentSteps', dtype=tf.int32)
        self.impSamplingWeights = []
        self.sampledMemIndexes = []

    def betaAnneal(self, sess):
        # linearly anneal beta from betaInit up to betaFinal over betaFinalAt steps
        ff = max(0, (self.betaFinal - self.betaInit) *
                 (self.betaFinalAt - self.curStep.eval()) / self.betaFinalAt)
        bt = self.betaFinal - ff
        sess.run(self.betaUpdater, feed_dict={self.betaHolder: bt})

    def add(self, experience):
        ExperienceMemory.add(self, experience)
        # init new transitions' priorities with maxPriority!
        self.priorityTree.addNew(self.priorityTree.getMaxPriority())

    def sample(self, k):
        pTotal = self.priorityTree.getSigmaPriority()
        pTot_by_k = int(pTotal // k)
        self.sampledMemIndexes = []
        self.impSamplingWeights = []
        for j in range(k):
            lower_bound = j * pTot_by_k
            upper_bound = (j + 1) * pTot_by_k
            sampledVal = random.sample(range(lower_bound, upper_bound), 1)
            sampledMemIdx, sampledPriority = self.priorityTree.getSelectedLeaf(
                sampledVal[0])
            self.sampledMemIndexes.append(sampledMemIdx)
            assert sampledPriority != 0.0, "Can't progress with a sampled priority = ZERO!"
            sampledProb = (sampledPriority ** self.alpha) / \
                self.priorityTree.getSigmaPriority(withAlpha=True)
            impSampleWt = (self.buffer_size * sampledProb) ** (-1 * self.beta.eval())
            self.impSamplingWeights.append(impSampleWt)
        # normalize the importance-sampling weights by their maximum
        maxISW = max(self.impSamplingWeights)
        self.impSamplingWeights[:] = [x / maxISW for x in self.impSamplingWeights]
        return self.getSamples(self.sampledMemIndexes)

    def getISW(self):
        return self.impSamplingWeights

    def update(self, deltas):
        for i, memIdx in enumerate(self.sampledMemIndexes):
            new_priority = math.fabs(deltas[i]) + self.epsilon
            self.priorityTree.updateTree(memIdx, new_priority)
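# The TF graph in PEM.betaAnneal only evaluates a linear schedule. Written out in
# plain Python (a sketch; the default beta_init/beta_final/beta_final_at values
# are assumptions): beta rises linearly from beta_init to beta_final and is then
# held at beta_final once the agent has taken beta_final_at environment steps.
def annealed_beta(step, beta_init=0.4, beta_final=1.0, beta_final_at=100_000):
    remaining = max(0.0, (beta_final - beta_init) * (beta_final_at - step) / beta_final_at)
    return beta_final - remaining


assert annealed_beta(0) == 0.4
assert abs(annealed_beta(50_000) - 0.7) < 1e-9
assert annealed_beta(200_000) == 1.0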
class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, buffer_size, batch_size, td_eps, seed, p_replay_alpha,
                 reward_scale=False, error_clip=False, error_max=1.0,
                 error_init=False, use_tree=False, err_init=1.0):
        """Initialize a ReplayBuffer object.

        Params
        ======
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            td_eps (float): small constant to avoid a zero td_error
            p_replay_alpha (float): exponent controlling how strongly priorities skew sampling
            reward_scale (flag): scale rewards down by a factor of 10
            error_clip (flag): clip errors to [-error_max, error_max]
            seed (int): random seed
        """
        self.useTree = use_tree
        self.memory = deque(maxlen=buffer_size)
        self.tree = SumTree(buffer_size)  # create tree instance
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.td_eps = td_eps
        self.experience = namedtuple("Experience", field_names=[
            "state", "action", "reward", "next_state", "done", "td_error"
        ])
        self.seed = random.seed(seed)
        self.p_replay_alpha = p_replay_alpha
        self.reward_scale = reward_scale
        self.error_clip = error_clip
        self.error_init = error_init
        self.error_max = error_max
        self.memory_index = np.zeros([self.buffer_size, 1])  # priorities mirrored here for quicker calculation
        self.memory_pointer = 0

    def add(self, state, action, reward, next_state, done, td_error):
        """Add a new experience to memory.

        td_error: absolute value
        """
        # reward scaling
        if self.reward_scale:
            reward = reward / 10.0  # scale reward by a factor of 10

        # error clipping
        if self.error_clip:
            td_error = np.clip(td_error, -self.error_max, self.error_max)

        # apply alpha power
        td_error = (td_error ** self.p_replay_alpha) + self.td_eps

        # make sure every experience is visited at least once
        if self.error_init:
            td_mad = np.max(self.memory_index)
            if td_mad == 0:
                td_error = self.error_max
            else:
                td_error = td_mad

        e = self.experience(np.expand_dims(state, 0), action, reward,
                            np.expand_dims(next_state, 0), done, td_error)

        if self.useTree:
            self.tree.add(td_error, e)  # store the td score and experience data
        else:
            self.memory.append(e)
            ### memory index ###
            if self.memory_pointer >= self.buffer_size:
                self.memory_index = np.roll(self.memory_index, -1)
                self.memory_index[-1] = td_error  # FIFO
            else:
                self.memory_index[self.memory_pointer] = td_error
                self.memory_pointer += 1

    def update(self, td_updated, index):
        """Update the td error values while preserving the buffer order.

        td_updated: absolute values; np.array of shape (1, batch_size, 1)
        index: in the tree case, the leaf index
        """
        td_updated = td_updated.squeeze()  # (batch_size,)

        # error clipping
        if self.error_clip:
            td_updated = np.clip(td_updated, -self.error_max, self.error_max)

        # apply alpha power
        td_updated = (td_updated ** self.p_replay_alpha) + self.td_eps

        i = 0
        while i < len(index):
            if self.useTree:
                self.tree.update(index[i], td_updated[i])
            else:
                self.memory.rotate(-index[i])  # move the target index to the front
                e = self.memory.popleft()
                td_i = td_updated[i].reshape(1, 1)
                e1 = self.experience(e.state, e.action, e.reward,
                                     e.next_state, e.done, td_i)
                self.memory.appendleft(e1)     # append the updated experience
                self.memory.rotate(index[i])   # restore the original order
                ### memory index ###
                self.memory_index[index[i]] = td_i
            i += 1

    def sample(self, p_replay_beta):
        """Sample a batch of experiences from memory."""
        l = len(self.memory)
        p_dist = (self.memory_index[:l] / np.sum(self.memory_index[:l])).squeeze()
        assert (np.abs(np.sum(p_dist) - 1) < 1e-5)
        assert (len(p_dist) == l)

        # draw sample indices from the priority distribution
        sample_ind = np.random.choice(l, self.batch_size, p=p_dist)

        # gather the selected experiences: avoid indexing into the middle of the deque
        es, ea, er, en, ed = [], [], [], [], []
        for i in sample_ind:
            self.memory.rotate(-i)
            e = copy.deepcopy(self.memory[0])
            es.append(e.state)
            ea.append(e.action)
            er.append(e.reward)
            en.append(e.next_state)
            ed.append(e.done)
            self.memory.rotate(i)

        states = torch.from_numpy(np.vstack(es)).float().to(device)
        actions = torch.from_numpy(np.vstack(ea)).long().to(device)
        rewards = torch.from_numpy(np.vstack(er)).float().to(device)
        next_states = torch.from_numpy(np.vstack(en)).float().to(device)
        dones = torch.from_numpy(np.vstack(ed).astype(np.uint8)).float().to(device)

        # probabilities of the selected experiences, used for the IS-weight adjustment
        selected_td_p = p_dist[sample_ind]

        # sanity check: the mean of the selected TD errors should exceed the memory average
        if p_replay_beta > 0:
            if np.mean(self.memory_index[sample_ind]) < np.mean(self.memory_index[:l]):
                print(np.mean(self.memory_index[sample_ind]),
                      np.mean(self.memory_index[:l]))

        weight = (1 / selected_td_p * 1 / l) ** p_replay_beta
        weight = weight / np.max(weight)  # normalize by the max weight
        weight = torch.from_numpy(np.array(weight)).float().to(device)
        assert (weight.requires_grad == False)

        return (states, actions, rewards, next_states, dones), weight, sample_ind

    def sample_tree(self, p_replay_beta):
        """Sample a batch from the sum tree using stratified (segment) sampling."""
        e_s, e_a, e_r, e_n, e_d = [], [], [], [], []
        sample_ind = np.empty((self.batch_size,), dtype=np.int32)
        sampled_td_score = np.empty((self.batch_size, 1))
        weight = np.empty((self.batch_size, 1))

        # Calculate the priority segment: as explained in the PER paper,
        # divide the range [0, p_total] into batch_size equal segments.
        td_score_segment = self.tree.total_td_score / self.batch_size

        i = 0
        while i < self.batch_size:
            # a value is uniformly sampled from each segment
            a, b = td_score_segment * i, td_score_segment * (i + 1)
            value = np.random.uniform(a, b)

            # retrieve the experience that corresponds to that value
            leaf_index, td_score, data = self.tree.get_leaf(value)

            # P(j)
            sampling_p = td_score / self.tree.total_td_score
            sampled_td_score[i, 0] = td_score

            # IS weight: (1/N * 1/P(i))**beta == (N*P(i))**-beta, normalized below
            weight[i, 0] = (1 / self.buffer_size * 1 / sampling_p) ** p_replay_beta
            sample_ind[i] = leaf_index

            e_s.append(data.state)
            e_a.append(data.action)
            e_r.append(data.reward)
            e_n.append(data.next_state)
            e_d.append(data.done)
            i += 1

        # normalize by the maximum weight in the batch
        # (the paper normalizes by the weight of the minimum-probability leaf)
        max_weight = np.max(weight)
        weight = weight / max_weight

        states = torch.from_numpy(np.vstack(e_s)).float().to(device)
        actions = torch.from_numpy(np.vstack(e_a)).long().to(device)
        rewards = torch.from_numpy(np.vstack(e_r)).float().to(device)
        next_states = torch.from_numpy(np.vstack(e_n)).float().to(device)
        dones = torch.from_numpy(np.vstack(e_d).astype(np.uint8)).float().to(device)
        weight = torch.from_numpy(weight).float().to(device)
        assert (weight.requires_grad == False)

        return (states, actions, rewards, next_states, dones), weight, sample_ind

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)
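# Sketch (with made-up tensors) of how the weights returned by sample/sample_tree
# are typically applied downstream: each sample's squared TD error is scaled by
# its importance-sampling weight before averaging, and the new absolute errors
# go back through update(). Shapes follow the buffer above; values are assumptions.
import torch

batch_size = 8
q_expected = torch.randn(batch_size, 1)
q_target = torch.randn(batch_size, 1)
weight = torch.rand(batch_size, 1)                       # stand-in for the returned IS weights

td_error = q_target - q_expected
loss = (weight * td_error.pow(2)).mean()                 # weighted MSE used for the gradient step
new_priorities = td_error.abs().detach().cpu().numpy()   # would be fed back via buffer.update(...)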
class ReplayBuffer:

    def __init__(self, buffer_size, num_agents, state_size, action_size, use_PER=False):
        self.buffer_size = buffer_size
        self.use_PER = use_PER
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        if use_PER:
            self.tree = SumTree(buffer_size)  # create tree instance
        else:
            self.memory = deque(maxlen=buffer_size)
        self.leaves_count = 0

    def add_tree(self, data, td_default=1.0):
        """PER function. Add a new experience to memory.

        New experiences get the current maximum td score (or td_default) so
        they have a good chance of being sampled at least once.
        """
        td_max = np.max(self.tree.tree[-self.buffer_size:])
        if td_max == 0.0:
            td_max = td_default
        self.tree.add(td_max, data)  # increases the chance of being selected
        self.leaves_count = min(self.leaves_count + 1, self.buffer_size)

    def add(self, data):
        """Add into the buffer."""
        self.memory.append(data)

    def sample_tree(self, batch_size, p_replay_beta, td_eps=1e-4):
        """PER function. Stratified (segment-wise) sampling from the sum tree."""
        s_samp, a_samp, r_samp, d_samp, ns_samp = ([] for _ in range(5))
        sample_ind = np.empty((batch_size,), dtype=np.int32)
        weight = np.empty((batch_size, 1))

        # create segments that partition the total td score range
        td_score_segment = self.tree.total_td_score / batch_size

        for i in range(batch_size):
            # a value is uniformly sampled from each segment
            _start, _end = i * td_score_segment, (i + 1) * td_score_segment
            value = np.random.uniform(_start, _end)

            # get the experience whose cumulative td score covers that value
            leaf_index, td_score, data = self.tree.get_leaf(value)

            # the sampling probability of this experience across all td scores
            sampling_p = td_score / self.tree.total_td_score

            # importance-sampling weight adjustment
            weight[i, 0] = (1 / sampling_p * 1 / self.leaves_count) ** p_replay_beta
            sample_ind[i] = leaf_index

            s_samp.append(data.states)
            a_samp.append(data.actions)
            r_samp.append(data.rewards)
            d_samp.append(data.dones)
            ns_samp.append(data.next_states)

        weight_n = toTorch(weight)  # note: max-weight normalization is not applied here

        return (s_samp, a_samp, r_samp, d_samp, ns_samp, weight_n, sample_ind)

    def sample(self, batch_size):
        """Sample uniformly from the plain buffer."""
        sample_ind = np.random.choice(len(self.memory), batch_size)
        s_samp, a_samp, r_samp, d_samp, ns_samp = ([] for _ in range(5))

        i = 0
        while i < batch_size:  # a while loop is faster here
            self.memory.rotate(-sample_ind[i])
            e = self.memory[0]
            s_samp.append(e.states)
            a_samp.append(e.actions)
            r_samp.append(e.rewards)
            d_samp.append(e.dones)
            ns_samp.append(e.next_states)
            self.memory.rotate(sample_ind[i])
            i += 1

        # last two values keep the signature compatible with the PER path
        return (s_samp, a_samp, r_samp, d_samp, ns_samp, 1.0, [])

    def update_tree(self, td_updated, index, p_replay_alpha, td_eps=1e-4):
        """PER function. Update the td error values of the sampled leaves.

        td_updated: absolute values; np.array of shape (1, batch_size, 1)
        index: leaf indices returned by sample_tree
        """
        # apply alpha power
        td_updated = (td_updated.squeeze() ** p_replay_alpha) + td_eps
        for i in range(len(index)):
            self.tree.update(index[i], td_updated[i])

    def __len__(self):
        if not self.use_PER:
            return len(self.memory)
        else:
            return self.leaves_count
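# The buffer above calls a `toTorch` helper that is not shown in this snippet.
# A minimal sketch of what it presumably does (an assumption, not the repo's
# actual helper): move a numpy array onto the training device as a float tensor.
import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def toTorch(array, device=device):
    return torch.from_numpy(np.asarray(array)).float().to(device)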