예제 #1
0
class PrioritizedReplayMemory:
    """Proportional prioritized replay buffer backed by a SumTree.

    Transitions are stored with priority (error + e) ** a, so higher-error
    samples are drawn more often during training.
    """

    e = 0.01  # small offset so zero-error samples keep nonzero priority
    a = 0.6   # prioritization exponent (0 = uniform, 1 = fully greedy)

    def __init__(self, capacity):
        # The SumTree stores priorities and supports O(log n) sampling.
        self.tree = SumTree(capacity)

    def _getPriority(self, error):
        """Map a TD error to a sampling priority."""
        return (error + self.e) ** self.a

    def add(self, error, sample):
        """Insert *sample* with a priority derived from *error*."""
        self.tree.add(self._getPriority(error), sample)

    def sample(self, n):
        """Draw *n* (tree_index, transition) pairs, one per priority segment."""
        segment = self.tree.total() / n
        picks = []
        for seg in range(n):
            point = random.uniform(segment * seg, segment * (seg + 1))
            idx, _, data = self.tree.get(point)
            picks.append((idx, data))
        return picks

    def update(self, idx, error):
        """Refresh the priority of the entry at tree position *idx*."""
        self.tree.update(idx, self._getPriority(error))

    def isFull(self):
        """Return True once the underlying tree has reached capacity."""
        return self.tree.isFull()
예제 #2
0
파일: memory.py 프로젝트: UesugiErii/lake
class Memory:  # stored as ( s, a, r, s_ ) in SumTree
    """Proportional prioritized replay: transitions are drawn with
    probability proportional to (error + e) ** a via a SumTree."""

    e = 0.01  # keeps zero-error transitions sampleable
    a = 0.6   # how strongly priorities skew sampling

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def _getPriority(self, error):
        """Translate a TD error into a tree priority."""
        return (error + self.e) ** self.a

    def add(self, error, sample):
        """Store *sample* under the priority derived from *error*."""
        self.tree.add(self._getPriority(error), sample)

    def sample(self, n):
        """Return n (tree_index, transition) pairs, one from each of the
        n equal-mass segments of the cumulative priority range."""
        step = self.tree.total() / n
        picks = []
        for k in range(n):
            point = random.uniform(step * k, step * (k + 1))
            idx, _, data = self.tree.get(point)
            picks.append((idx, data))
        return picks

    def update(self, idx, error):
        """Assign a fresh priority (from *error*) to leaf *idx*."""
        self.tree.update(idx, self._getPriority(error))
예제 #3
0
class PrioritizedER:
    """Prioritized experience replay returning importance-sampling weights.

    Priorities are (|error| + e) ** a; beta is annealed toward 1 so the
    IS weights become asymptotically unbiased.
    """

    e = 0.01    # priority offset to avoid zero-probability samples
    a = 0.6     # prioritization exponent
    beta = 0.4  # IS exponent, annealed toward 1
    beta_increment_per_sampling = 0.001

    def __init__(self, capacity):
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _get_priority(self, error):
        """Convert a TD error to a sampling priority."""
        return (abs(error) + self.e) ** self.a

    def push(self, error, sample):
        """Store *sample* with a priority derived from *error*."""
        p = self._get_priority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        """Stratified sample of n transitions.

        Returns (batch, tree_indices, is_weights).
        """
        batch = []
        idxs = []
        segment = self.tree.total() / n
        priorities = []

        # Anneal beta toward 1 so IS weights become unbiased over time.
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])

        for i in range(n):
            a = segment * i
            b = segment * (i + 1)

            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            # BUG FIX: previously `batch[-1]` raised IndexError when the very
            # first draw landed on an empty leaf; only substitute when a
            # previous draw exists.
            if data == 0 and batch:
                p = priorities[-1]
                data = batch[-1]
                idx = idxs[-1]
                print(
                    'WARNING: transition value was 0, replaced it with the previous sampled transition'
                )
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)

        # BUG FIX: `priorities` is a Python list; dividing a list by a float
        # raises TypeError. Convert to an ndarray before the elementwise math.
        # (1e-4 == the original 10e-5 epsilon, guarding against zero probs.)
        sampling_probabilities = (np.asarray(priorities) / self.tree.total()) + 1e-4
        is_weight = np.power(self.tree.n_entries * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()

        return batch, idxs, is_weight

    def update(self, idx, error):
        """Re-prioritize the leaf at *idx* with a fresh *error*."""
        p = self._get_priority(error)
        self.tree.update(idx, p)

    def __len__(self):
        return self.tree.n_entries
예제 #4
0
class Memory:
    """
    Stores transitions as (s, a, r, s_, done) tuples using a SumTree.
    Each sample is assigned a priority which affects retrieval
    """
    def __init__(self, capacity, e=0.01, a=0.6):
        """
        :param capacity: maximum number of samples the buffer can hold
        :param e: offset that keeps every sample's priority above zero
        :param a: exponent shaping how strongly errors skew sampling
        """
        self.capacity = capacity
        self.e = e
        self.a = a
        self.tree = SumTree(capacity)

    def _getPriority(self, error):
        """Map an error value to a SumTree priority."""
        return (error + self.e) ** self.a

    def add(self, error, sample):
        """
        Adds a new sample to the buffer
        :param error: The error associated with the sample
        :param sample: The sample to add
        """
        self.tree.add(self._getPriority(error), sample)

    def sample(self, n):
        """
        Returns n (index, data) pairs, one drawn from each equal-mass
        slice of the total priority range.
        :param n: The number of samples to return
        """
        slice_width = self.tree.total() / n
        chosen = []
        for j in range(n):
            draw = random.uniform(slice_width * j, slice_width * (j + 1))
            idx, _, data = self.tree.get(draw)
            chosen.append((idx, data))
        return chosen

    def update(self, idx, error):
        """Re-score the entry at tree position *idx* using *error*."""
        self.tree.update(idx, self._getPriority(error))
예제 #5
0
class Memory:  # stored as ( s, a, r, s_ ) in SumTree
    """Prioritized buffer that also tracks the largest priority seen."""

    def __init__(self, capacity):
        self.tree = SumTree(capacity)
        self.max_p = 1   # running maximum priority, refreshed by update()
        self.e = 0.0     # no offset: zero error really means zero priority
        self.a = 0.6     # prioritization exponent

    def _getPriority(self, error):
        """Convert an error (scalar or array) into a priority."""
        return (error + self.e) ** self.a

    def length(self):
        """Expose the tree's write cursor as the current length."""
        return self.tree.write

    def add(self, sample, error):
        """Insert *sample* with priority computed from *error*."""
        self.tree.add(self._getPriority(error), sample)

    def add_p(self, p, sample):
        """Insert *sample* with an explicit, precomputed priority *p*."""
        self.tree.add(p, sample)

    def sample(self, n):
        """Draw n transitions, returning (data_batch, index_batch)."""
        width = self.tree.total() / n
        data_out, idx_out = [], []
        for k in range(n):
            point = random.uniform(width * k, width * (k + 1))
            idx, _, data = self.tree.get(point)
            data_out.append(data)
            idx_out.append(idx)
        return data_out, idx_out

    def update(self, idx, error):
        """Re-prioritize one leaf; refresh the running max priority."""
        p = self._getPriority(error)
        if p > self.max_p:
            self.max_p = p
        self.tree.update(idx, p)

    def update_batch(self, idx_batch, error_batch):
        """Vectorized update of many leaves at once."""
        p_batch = self._getPriority(error_batch)
        if np.max(p_batch) > self.max_p:
            self.max_p = np.max(p_batch)
        self.tree.update_batch(idx_batch, p_batch)
예제 #6
0
class Memory(object):  # stored as ( s, a, r, s_ ) in SumTree
    """
    This SumTree code is modified version and the original code is from:
    https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
    """
    epsilon = 0.01  # small amount to avoid zero priority
    alpha = 0.6  # [0~1] convert the importance of TD error to priority
    beta = 0.4  # importance-sampling, from initial value increasing to 1
    beta_increment_per_sampling = 0.001
    abs_err_upper = 1.  # clipped abs error

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def store(self, transition):
        """New transitions enter with the current maximum leaf priority so
        they are guaranteed to be replayed at least once."""
        max_p = np.max(self.tree.tree[-self.tree.capacity:])
        if max_p == 0:
            # Empty tree: fall back to the clipping ceiling.
            max_p = self.abs_err_upper
        self.tree.add(max_p, transition)

    def sample(self, n):
        """Draw n transitions plus their tree indices and IS weights."""
        b_idx, b_memory, ISWeights = deque(), deque(), deque()
        pri_seg = self.tree.total_p / n  # width of one priority segment
        # Anneal beta toward 1 (fully corrected IS weights).
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])

        # Normalizing constant for the IS weights below.
        max_prob = np.max(self.tree.tree[-self.tree.capacity:]) / self.tree.total_p
        for seg in range(n):
            lo, hi = pri_seg * seg, pri_seg * (seg + 1)
            idx, p, data = self.tree.get_leaf(np.random.uniform(lo, hi))
            prob = p / self.tree.total_p
            ISWeights.append(np.power(prob / max_prob, -self.beta))
            b_idx.append(idx)
            b_memory.append(data)
        return (np.array(list(b_idx)),
                np.array(list(b_memory)),
                np.reshape(np.array(list(ISWeights)), (n, 1)))

    def batch_update(self, tree_idx, abs_errors):
        """Write fresh priorities for the given tree leaves."""
        abs_errors += self.epsilon  # NOTE: mutates a passed-in ndarray in place
        clipped_errors = np.minimum(abs_errors, self.abs_err_upper)
        ps = np.power(clipped_errors, self.alpha)
        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)
예제 #7
0
class Memory:
    """Replay memory keyed by error priority. With a == 0.0 every priority
    collapses to 1.0, i.e. sampling is effectively uniform."""

    # Constants
    e = 0.01
    a = 0.0  # 0.6 would restore true prioritization

    def __init__(self, capacity):
        """Create the SumTree and the sample counter."""
        self.tree = SumTree(capacity)
        self.capacity = capacity
        self.len = 0

    def getPriority(self, error):
        """Priority of an error; with a == 0 this is always 1.0."""
        return (error + self.e) ** self.a

    def add(self, error, sample):
        """Store a sample, capping the logical length at capacity."""
        self.tree.add(self.getPriority(error), sample)
        self.len = min(self.len + 1, self.capacity)

    def sample(self, n):
        """Pick n (index, data) pairs, one per priority segment."""
        seg = self.tree.total() / n
        out = []
        for k in range(n):
            draw = random.uniform(seg * k, seg * (k + 1))
            idx, _, data = self.tree.get(draw)
            out.append((idx, data))
        return out

    def numberSamples(self):
        """How many samples the memory currently holds."""
        return self.len

    def update(self, idx, error):
        """Replace the priority stored at *idx*."""
        self.tree.update(idx, self.getPriority(error))
예제 #8
0
class PrioritizedMemory(Memory):
    """PER buffer: new samples enter at the tree's max priority; batches
    come back with normalized importance-sampling weights."""

    def __init__(self,
                 capacity,
                 epsilon=0.01,
                 alpha=0.6,
                 beta=0.4,
                 beta_increment=0.001):
        self.epsilon = epsilon              # floor offset for priorities
        self.alpha = alpha                  # prioritization strength
        self.beta = beta                    # IS correction, annealed up
        self.beta_increment = beta_increment

        self.capacity = capacity
        self.tree = SumTree(self.capacity)

    def _compute_priority(self, loss):
        """Priority from a loss/TD-error value."""
        return (np.abs(loss) + self.epsilon) ** self.alpha

    def push(self, *args):
        """Insert a Transition at the tree's current max priority (or 1)."""
        top = self.tree.max()
        if top <= 0:
            top = 1
        self.tree.add(top, Transition(*args))

    def sample(self, batch_size):
        """Sample uniformly over cumulative priority mass; return
        (batch, tree_indices, normalized IS weights)."""
        picks, indices = [], []
        weights = np.empty(batch_size, dtype='float32')
        self.beta += self.beta_increment   # grows without bound...
        beta = np.minimum(1., self.beta)   # ...but is clamped for this use
        total = self.tree.total()
        for i, r in enumerate(np.random.uniform(0, total, (batch_size, ))):
            index, priority, data = self.tree.get(r)
            picks.append(data)
            indices.append(index)
            weights[i] = (self.capacity * priority / total) ** (-beta)

        return picks, indices, weights / weights.max()

    def update(self, index, loss):
        """Refresh the priority at *index* from a new loss value."""
        self.tree.update(index, self._compute_priority(loss))

    def __len__(self):
        return self.tree.n_entries
예제 #9
0
class Replay_Memory:
    """Fixed-size (10000-entry) prioritized replay memory over a SumTree."""

    def __init__(self):
        self.memory_len = 10000  # hard-wired capacity
        self.memory_bias = .01   # keeps priorities strictly positive
        self.memory_pow = .6     # prioritization exponent
        self.tree = SumTree(self.memory_len)

    def add(self, error, sample):
        """Store *sample* with priority (error + bias) ** pow."""
        self.tree.add((error + self.memory_bias) ** self.memory_pow, sample)

    def sample(self, batch_size):
        """
         Get a sample batch of the replay memory
        Returns:
         batch: a batch with one sample from each segment of the memory
        """
        # One representative from each equal-mass slice of the priority
        # distribution (e.g. batch_size=2 -> one draw from [min, median]
        # and one from [median, max]).
        width = self.tree.total() / batch_size
        picked = []
        for k in range(batch_size):
            draw = random.uniform(width * k, width * (k + 1))
            idx, _, data = self.tree.get(draw)
            picked.append((idx, data))
        return picked

    def update(self, idx, error):
        """
         Updates one entry in the replay memory
        Args:
         idx: the position of the outdated transition in the memory
         error: the newly calculated error
        """
        self.tree.update(idx, (error + self.memory_bias) ** self.memory_pow)
예제 #10
0
class PERMemory(ReplayMemory):
    """Replay memory with proportional prioritization layered on top of the
    plain ReplayMemory base class."""

    epsilon = 0.0001  # priority floor offset
    alpha = 0.6       # prioritization exponent

    def __init__(self, CAPACITY):
        super(PERMemory, self).__init__(CAPACITY)
        self.tree = SumTree(CAPACITY)
        self.size = 0

    def _getPriority(self, td_error):
        """Proportional prioritization: priority = (td_error + eps) ** alpha."""
        return (td_error + self.epsilon) ** self.alpha

    def push(self, state, action, state_next, reward):
        """Save (state, action, state_next, reward) into memory at the tree's
        current maximum priority (or 1 when the tree is empty)."""
        self.size += 1

        top = self.tree.max()
        if top <= 0:
            top = 1

        self.tree.add(top, Transition(state, action, state_next, reward))

    def sample(self, batch_size):
        """Draw batch_size transitions uniformly over cumulative priority."""
        picked, indexes = [], []
        for rand in np.random.uniform(0, self.tree.total(), batch_size):
            idx, _, data = self.tree.get(rand)
            picked.append(data)
            indexes.append(idx)

        return picked, indexes

    def update(self, idx, td_error):
        """Store the refreshed priority for leaf *idx*."""
        self.tree.update(idx, self._getPriority(td_error))

    def __len__(self):
        return self.size
예제 #11
0
class PriorityExperienceReplay:
    '''
    Almost copy from
    https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
    '''
    def __init__(self, max_size, window_size, input_shape):
        # Backing SumTree plus bookkeeping for image-shaped states.
        self.tree = SumTree(max_size)
        self._max_size = max_size

        # Dimensions used to reshape stored states / next states.
        self._window_size = window_size
        self._WIDTH = input_shape[0]
        self._HEIGHT = input_shape[1]

        # Hyperparameters for the priority transform.
        self.e = 0.01
        self.a = 0.6

    def _getPriority(self, error):
        """Priority assigned to an experience with the given error."""
        return (error + self.e) ** self.a

    def append(self, state, action, reward, next_state, done):
        """Add a batch of transitions; each enters at the priority of the
        maximum error (0.5 is treated as the maximum error)."""
        for item in zip(state, action, reward, next_state, done):
            self.tree.add(self._getPriority(0.5), data=item)

    def sample(self, batch_size, indexes=None):
        """Stratified sample over the priority mass.

        States and next states are reshaped to (batch, W, H, window).
        Returns (zipped_batch, idx_batch, p_batch, sum_p, count).
        """
        data_batch, idx_batch, p_batch = [], [], []

        # Split the priority mass into batch_size equal segments so the
        # draws are diverse yet still priority-weighted.
        segment = self.tree.total_and_count()[0] / batch_size
        for k in range(batch_size):
            draw = random.uniform(segment * k, segment * (k + 1))
            idx, p, data = self.tree.get(draw)
            data_batch.append(data)
            idx_batch.append(idx)
            p_batch.append(p)

        zipped = list(zip(*data_batch))
        shape = (-1, self._WIDTH, self._HEIGHT, self._window_size)
        zipped[0] = np.reshape(zipped[0], shape)
        zipped[3] = np.reshape(zipped[3], shape)

        sum_p, count = self.tree.total_and_count()
        return zipped, idx_batch, p_batch, sum_p, count

    def update(self, idx_list, error_list):
        """Refresh priorities from fresh TD errors after a training step."""
        for idx, error in zip(idx_list, error_list):
            self.tree.update(idx, self._getPriority(error))
class MemoryBuffer(object):
    """ Memory Buffer Helper class for Experience Replay
    using a double-ended queue or a Sum Tree (for PER)
    """
    def __init__(self, buffer_size, with_per = False):
        """ Initialization
        """
        if(with_per):
            # Prioritized Experience Replay
            self.alpha = 0.5
            self.epsilon = 0.01
            self.buffer = SumTree(buffer_size)
        else:
            # Standard Buffer
            self.buffer = deque()
        self.count = 0
        self.with_per = with_per
        self.buffer_size = buffer_size

    def memorize(self, state, action, reward, done, new_state, achieved_goal, goal, error=None):
        """ Save an experience to memory, optionally with its TD-Error
        """
        experience = (state, action, reward, done, new_state, achieved_goal, goal, error)
        if(self.with_per):
            priority = self.priority(error[0])
            self.buffer.add(priority, experience)
            self.count += 1
        else:
            # Check if buffer is already full
            if self.count < self.buffer_size:
                self.buffer.append(experience)
                self.count += 1
            else:
                # Evict the oldest experience to make room.
                self.buffer.popleft()
                self.buffer.append(experience)

    def priority(self, error):
        """ Compute an experience priority, as per Schaul et al.
        """
        return (error + self.epsilon) ** self.alpha

    def size(self):
        """ Current Buffer Occupation
        """
        return self.count

    def sample_batch(self, batch_size):
        """ Sample a batch, optionally with (PER)
        """
        batch = []
        # Sample using priorities
        if(self.with_per):
            T = self.buffer.total() // batch_size
            for i in range(batch_size):
                a, b = T * i, T * (i + 1)
                s = random.uniform(a, b)
                idx, error, data = self.buffer.get(s)
                batch.append((*data, idx))
            # BUG FIX: the tree index is appended AFTER the 8-element
            # experience tuple, so it lives at position 8; i[7] was the
            # stored TD-error, which would break update() downstream.
            idx = np.array([i[8] for i in batch])
        # Sample randomly from Buffer
        elif self.count < batch_size:
            idx = None
            batch = random.sample(self.buffer, self.count)
        else:
            idx = None
            batch = random.sample(self.buffer, batch_size)

        # Return a batch of experience
        s_batch = np.array([i[0] for i in batch])
        a_batch = np.array([i[1] for i in batch])
        r_batch = np.array([i[2] for i in batch])
        d_batch = np.array([i[3] for i in batch])
        new_s_batch = np.array([i[4] for i in batch])
        ag_batch = np.array([i[5] for i in batch])
        g_batch = np.array([i[6] for i in batch])
        return s_batch, a_batch, r_batch, d_batch, new_s_batch, ag_batch, g_batch, idx

    def update(self, idx, new_error):
        """ Update priority for idx (PER)
        """
        self.buffer.update(idx, self.priority(new_error))

    def clear(self):
        """ Clear buffer / Sum Tree
        """
        # BUG FIX: `buffer_size` was referenced as a bare name (NameError);
        # use the attribute stored in __init__ instead.
        if(self.with_per): self.buffer = SumTree(self.buffer_size)
        else: self.buffer = deque()
        self.count = 0
예제 #13
0
파일: draft.py 프로젝트: Sjtugjl/AU332DQN
# Smoke-test script for the project-local SumTree: insert one transition
# with a large priority and dump the tree's internal state.
from sumtree import SumTree




tree = SumTree(memory_size=10)  # tree sized for 10 transitions
p = 1  # number of insertions to perform
for i in range(p):
    # Priority 10000 with a dummy 5-element transition tuple.
    tree.add(10000, (1, 1, 1, 1, 1))
# Inspect internal arrays — presumably node priorities and stored
# transitions respectively; confirm against the sumtree module.
print("tree",tree.tree)
print("transition",tree.transitions)
예제 #14
0
class PriorityReplayBuffer(object):
    # TODO: reference https://github.com/rlcode/per/blob/master/prioritized_memory.py

    e = 0.01
    a = 0.6
    beta = 0.4
    beta_increment_per_sampling = 0.001
    absolute_error_upper = 1.

    def __init__(self, capacity):
        '''
        Initializes PRB.

        Args:
            capacity: capacity of backing SumTree
        '''
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _get_priority(self, error):
        '''
        Gets the priority associated with the error.

        Args:
            error: input error
        Returns:
            the associated priority, clipped at absolute_error_upper
        '''
        priority = np.abs(error) + self.e
        priority = np.minimum(priority, self.absolute_error_upper)
        return priority**self.a

    def add(self, error, experience):
        '''
        Adds the experience and error to the SumTree.

        Args:
            error: TD error of the sample
            experience: experience to enter
        '''
        priority = self._get_priority(error)
        # NOTE(review): argument order (data, priority) is the reverse of the
        # referenced rlcode implementation's add(priority, data) — confirm it
        # matches this project's SumTree.add signature.
        self.tree.add(experience, priority)

    def sample(self, size):
        '''
        Returns a sample with given size following weighted distribution.

        Args:
            size: the desired batch size to receive
        Returns:
            the batch of experiences, indexes, and importance sampling weights
        '''
        batch = []
        idxs = []
        segment = self.tree.total_priority() / size
        priorities = []

        # Anneal beta toward 1 so the IS correction becomes exact over time.
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])

        for i in range(size):
            a = segment * i
            b = segment * (i + 1)

            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get_leaf(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)

        # BUG FIX: `priorities` is a plain list; dividing a list by a float
        # raises TypeError. Convert to an ndarray first.
        sampling_probabilities = np.asarray(priorities) / self.tree.total_priority()
        is_weight = np.power(self.tree.size * sampling_probabilities,
                             -self.beta)
        is_weight /= is_weight.max()

        return batch, idxs, is_weight

    def update(self, idx, error):
        '''
        Updates the tree index with the error.

        Args:
            idx: the SumTree index to update
            error: the error of the experience
        '''
        p = self._get_priority(error)
        self.tree.update(idx, p)
예제 #15
0
class Memory:  # stored as ( s, a, r, s_ ) in SumTree
    """PER buffer: new samples enter near the current max leaf priority and
    batches come back with per-sample importance-sampling weights."""

    e = 0.01
    a = 0.6
    PER_e = 0.01  # Hyperparameter that we use to avoid some experiences to have 0 probability of being taken
    PER_a = 0.6  # Hyperparameter that we use to make a tradeoff between taking only exp with high priority and sampling randomly
    PER_b = 0.4  # importance-sampling, from initial value increasing to 1

    PER_b_increment_per_sampling = 0.001
    absolute_error_upper = 1.

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def _getPriority(self, error):
        """Priority transform: (error + e) ** a."""
        return (error + self.e)**self.a

    def add(self, error, sample):
        """Insert *sample* keyed off the tree's current maximum leaf
        priority (or the clipping ceiling when the tree is empty)."""
        max_priority = np.max(self.tree.tree[-self.tree.capacity:])

        if max_priority == 0:
            # BUG FIX: was a bare `absolute_error_upper` (NameError);
            # the constant lives on the class.
            max_priority = self.absolute_error_upper

        # NOTE(review): the max leaf value is already a priority, so passing
        # it through _getPriority re-applies the transform; kept as-is to
        # preserve existing behavior — confirm intent.
        self.tree.add(self._getPriority(max_priority), sample)

    def sample(self, n):
        """Stratified sample of n items; each entry in the returned batch
        is (tree_index, data, is_weight)."""
        batch = []
        # Width of one priority segment: [0, p_total] split into n ranges,
        # as described in the PER paper.
        segment = self.tree.total() / n

        # Anneal PER_b toward 1 each time we sample a minibatch.
        self.PER_b = np.min(
            [1., self.PER_b + self.PER_b_increment_per_sampling])

        # Max IS weight, used to normalize the per-sample weights.
        # NOTE(review): if any leaf priority is 0, p_min == 0 makes
        # max_weight degenerate — confirm leaves are always positive.
        p_min = np.min(
            self.tree.tree[-self.tree.capacity:]) / self.tree.total()
        max_weight = (p_min * n)**(-self.PER_b)

        for i in range(n):
            a = segment * i
            b = segment * (i + 1)

            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)

            sampling_probabilities = p / self.tree.total()

            # IS = (1/N * 1/P(i))**b / max_w == (N*P(i))**-b / max_w
            is_weights = np.power(n * sampling_probabilities,
                                  -self.PER_b) / max_weight

            batch.append((idx, data, is_weights))

        return batch

    def update(self, idx, error):
        """Refresh the priority at tree position *idx*."""
        p = self._getPriority(error)
        self.tree.update(idx, p)
예제 #16
0
class PrioritizedReplayBuffer:
    """Prioritized replay buffer that returns torch tensors ready for
    training (states/actions/rewards/next_states/dones plus IS weights)."""

    e = 1e-5       # priority floor offset
    alpha = 0.6    # prioritization exponent
    beta = 0.4     # IS exponent, annealed to 1
    beta_increment_per_sampling = 0.001

    def __init__(self, buffer_size, batch_size, seed):
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

        self.tree = SumTree(buffer_size)

    def _get_priority(self, error):
        """Priority transform: (error + e) ** alpha."""
        return (error + self.e)**self.alpha

    def add(self, error, state, action, reward, next_state, done):
        """Wrap the transition in an Experience and store it by priority."""
        sample = self.experience(state, action, reward, next_state, done)
        p = self._get_priority(error)
        self.tree.add(p, sample)

    def sample(self):
        """Stratified sample of batch_size experiences.

        Returns (indices, is_weights, states, actions, rewards,
        next_states, dones) with tensor fields moved to `device`.
        """
        experiences = []
        indecis = []
        segment = self.tree.total() / self.batch_size

        priorities = []
        # Anneal beta toward 1 for asymptotically unbiased IS weights.
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])

        for i in range(self.batch_size):
            a = segment * i
            b = segment * (i + 1)

            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)

            if len(data) < 5:
                raise ValueError("missed data")
            priorities.append(p)
            experiences.append(data)
            indecis.append(idx)

        # BUG FIX: `priorities` is a plain list and list / float raises
        # TypeError — convert to an ndarray before the elementwise math.
        sampling_probabilities = np.asarray(priorities) / self.tree.total()
        is_weights = np.power(self.tree.n_entries * sampling_probabilities,
                              -self.beta)
        is_weights /= is_weights.max()

        is_weights = torch.from_numpy(
            np.vstack([w for w in is_weights
                       if w is not None])).float().to(device)
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences
                       if e is not None])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences
                       if e is not None])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences
                       if e is not None])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences
                       if e is not None])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences
                       if e is not None]).astype(np.uint8)).float().to(device)

        return (indecis, is_weights, states, actions, rewards, next_states,
                dones)

    def update(self, idx, error):
        """Refresh the stored priority for leaf *idx*."""
        p = self._get_priority(error)
        self.tree.update(idx, p)
예제 #17
0
class MemoryBuffer(object):
    """ Memory Buffer Helper class for Experience Replay
    using a double-ended queue or a Sum Tree (for PER)
    """
    def __init__(self, buffer_size, with_per=False):
        """ Initialization

        Args:
            buffer_size: maximum number of stored experiences.
            with_per: if True, store in a SumTree and sample proportionally
                to priority (Prioritized Experience Replay).
        """
        if with_per:
            # Prioritized Experience Replay hyper-parameters (Schaul et al.)
            self.alpha = 0.5      # priority exponent
            self.epsilon = 0.01   # keeps every priority strictly positive
            self.buffer = SumTree(buffer_size)
        else:
            # Standard Buffer
            self.buffer = deque()
        self.count = 0
        self.with_per = with_per
        self.buffer_size = buffer_size

    def memorize(self, state, action, reward, done, new_state, error):
        """ Save an experience to memory, optionally with its TD-Error.

        Bug fix: the non-PER branch previously discarded every experience,
        leaving the deque permanently empty.
        """
        experience = (state, action, reward, done, new_state)
        if self.with_per:
            priority = self.priority(error[0])
            self.buffer.add(priority, experience)
            self.count += 1
        else:
            if self.count < self.buffer_size:
                self.buffer.append(experience)
                self.count += 1
            else:
                # Buffer full: evict the oldest experience.
                self.buffer.popleft()
                self.buffer.append(experience)

    def priority(self, error):
        """ Compute an experience priority, as per Schaul et al.
        """
        return (error + self.epsilon) ** self.alpha

    def size(self):
        """ Current Buffer Occupation
        """
        return self.count

    def sample_batch(self, batch_size):
        """ Sample a batch: prioritized (PER) or uniform.

        Returns:
            (states, actions, rewards, dones, new_states, idx_) as parallel
            lists; idx_ holds SumTree indices for PER, None otherwise.
        """
        s_batch, a_batch, r_batch = [], [], []
        d_batch, new_s_batch, idx_ = [], [], []
        if self.with_per:
            # Split the cumulative priority into equal segments and draw one
            # experience per segment.  True division replaces the original
            # floor division, which shrank every segment and made the
            # highest-priority tail of the tree unreachable.
            segment = self.buffer.total() / batch_size
            for i in range(batch_size):
                s = random.uniform(segment * i, segment * (i + 1))
                idx, _, data = self.buffer.get(s)
                s_batch.append(data[0])
                a_batch.append(data[1])
                r_batch.append(data[2])
                d_batch.append(data[3])
                new_s_batch.append(data[4])
                idx_.append(idx)
        else:
            # Uniform sampling from the deque (the original returned an
            # empty batch in this mode).
            for data in random.sample(list(self.buffer),
                                      min(batch_size, self.count)):
                s_batch.append(data[0])
                a_batch.append(data[1])
                r_batch.append(data[2])
                d_batch.append(data[3])
                new_s_batch.append(data[4])
                idx_.append(None)
        return s_batch, a_batch, r_batch, d_batch, new_s_batch, idx_

    def update(self, idx, new_error):
        """ Update priority for idx (PER)
        """
        self.buffer.update(idx, self.priority(new_error))

    def clear(self):
        """ Clear buffer / Sum Tree.

        Bug fix: previously referenced the undefined global ``buffer_size``
        (NameError); uses the stored ``self.buffer_size`` instead.
        """
        if self.with_per:
            self.buffer = SumTree(self.buffer_size)
        else:
            self.buffer = deque()
        self.count = 0
예제 #18
0
    def train(self):
        """Run the genetic hyper-parameter search.

        One branch per supported number of hyper-parameters
        (``self.params`` in 1..5); each branch runs the same generational
        loop:

          1. sample two random alleles per parameter for every slot not
             already filled by last generation's offspring,
          2. build/train/evaluate one model per individual (each built
             parameter is the mean of its two alleles) and store the
             evaluation metric as the individual's priority in a SumTree,
          3. keep the best ``self.keep`` metrics and read their
             hyper-parameters back out of the tree,
          4. mate the survivors with Mendelian allele inheritance to seed
             the next generation.

        Returns:
            (hyperparameters, keep_metrics, gen_performance) from the last
            generation, where gen_performance is an array of one
            (min kept metric, max kept metric) row per generation; returns
            None implicitly when ``self.params`` is not in 1..5.
        """
        if self.params == 1:
            top_performers = []
            gen_performance = []
            for gen in range(self.generations):
                performance = SumTree(self.size)
                hp1 = np.random.randint(
                    low=self.param_one[0],
                    high=self.param_one[1],
                    size=(self.size - len(top_performers)) * 2)
                # np.append without `axis` flattens, so reshape the result
                # (the original reshaped only hp1, leaving hps 1-D and
                # crashing on hp[0] below with an IndexError on a scalar).
                hps = np.append(
                    np.array(top_performers).reshape((-1, 2)),
                    hp1).reshape((-1, 2))
                for hp in hps:  # train all models and save performance
                    print(hp)
                    temp = self.model(self.x_train, self.y_train)
                    temp.build(np.mean(np.array([hp[0], hp[1]])))
                    temp.train(self.epochs)
                    metric = temp.evaluate(self.x_test, self.y_test)
                    performance.add(metric, np.array([hp]))
                keep_metrics = np.sort(performance.p_array(
                ))[-int(self.keep):]  # array of the highest performing metrics
                hyperparameters = [
                ]  # array to store the best n=self.keep performing hyperparameters
                for metric in keep_metrics:  # note that the order of keep metrics is lowest to highest performance
                    # NOTE(review): SumTree.get normally walks by cumulative
                    # priority; indexing it with a raw metric value assumes
                    # this project's get() is a value lookup -- confirm.
                    _, __, hp_temp = performance.get(metric)
                    hyperparameters = np.append(np.array(hyperparameters),
                                                hp_temp)
                # One individual per row: the 2 alleles of the single
                # parameter.  The original reshape((-1, 4)) fused two
                # individuals per row, unlike every other branch (which
                # uses width 2 * n_params).
                hyperparameters = hyperparameters.reshape((-1, 2))
                mated_hp1 = []
                for mate in hyperparameters:  # mating routine with mendellian inheritance from the two alleles
                    mated_hp1.append(np.random.choice(mate[np.array([0, 1])]))
                    mated_hp1.append(
                        np.random.choice(hyperparameters[np.random.randint(
                            len(hyperparameters))][np.array([0, 1])]))
                top_performers = np.array(mated_hp1).reshape((-1, 2))
                print("generation:", gen,
                      "  min performance (params, metric):",
                      hyperparameters[0], keep_metrics[0],
                      "  max performance:", hyperparameters[-1],
                      keep_metrics[-1])
                gen_performance.append([keep_metrics[0], keep_metrics[-1]])
            self.hyperparameters = hyperparameters
            self.keep_metrics = keep_metrics
            return (hyperparameters, keep_metrics,
                    np.array(gen_performance).reshape(-1, 2))

        if self.params == 2:
            top_performers = []
            gen_performance = []
            # Robustness: os.mkdir raised FileExistsError when "temp" was
            # left behind by an earlier aborted run.
            os.makedirs("temp", exist_ok=True)
            for gen in range(self.generations):
                performance = SumTree(self.size)
                hp1 = np.random.randint(
                    low=self.param_one[0],
                    high=self.param_one[1],
                    size=(self.size - len(top_performers)) * 2)
                hp2 = np.random.randint(
                    low=self.param_two[0],
                    high=self.param_two[1],
                    size=(self.size - len(top_performers)) * 2)
                # Interleave alleles as (p1, p2, p1, p2) rows of width 4.
                hps = np.append(
                    np.array(top_performers).reshape((-1, 4)),
                    np.dstack((hp1, hp2))).reshape((-1, 4))
                for hp in hps:  # train all models and save performance
                    print(hp)
                    temp = self.model(self.x_train, self.y_train)
                    temp.build(np.mean(np.array([hp[0], hp[2]])),
                               np.mean(np.array([hp[1], hp[3]])))
                    temp.train(self.epochs)
                    metric = temp.evaluate(self.x_test, self.y_test)
                    performance.add(metric, np.array([hp]))
                keep_metrics = np.sort(performance.p_array(
                ))[-int(self.keep):]  # array of the highest performing metrics
                hyperparameters = [
                ]  # array to store the best n=self.keep performing hyperparameters
                for metric in keep_metrics:  # note that the order of keep metrics is lowest to highest performance
                    _, __, hp_temp = performance.get(metric)
                    hyperparameters = np.append(np.array(hyperparameters),
                                                hp_temp)
                hyperparameters = hyperparameters.reshape((-1, 4))
                mated_hp1 = []
                mated_hp2 = []
                for mate in hyperparameters:  # mating routine with mendellian inheritance from the two alleles
                    mated_hp1.append(np.random.choice(mate[np.array([0, 2])]))
                    mated_hp1.append(
                        np.random.choice(hyperparameters[np.random.randint(
                            len(hyperparameters))][np.array([0, 2])]))
                    mated_hp2.append(np.random.choice(mate[np.array([1, 3])]))
                    mated_hp2.append(
                        np.random.choice(hyperparameters[np.random.randint(
                            len(hyperparameters))][np.array([1, 3])]))
                top_performers = np.dstack(
                    (np.array(mated_hp1), np.array(mated_hp2))).reshape(
                        (-1, 4))
                print("generation:", gen,
                      "  min performance (params, metric):",
                      hyperparameters[0], keep_metrics[0],
                      "  max performance:", hyperparameters[-1],
                      keep_metrics[-1])
                gen_performance.append([keep_metrics[0], keep_metrics[-1]])
                # Checkpoint the per-generation stats to disk each round.
                np.savetxt("temp/gen_perf.csv",
                           np.array(gen_performance).reshape(-1, 2),
                           delimiter=",")
            os.remove("temp/gen_perf.csv")
            os.rmdir("temp")
            self.hyperparameters = hyperparameters
            self.keep_metrics = keep_metrics
            return (hyperparameters, keep_metrics,
                    np.array(gen_performance).reshape(-1, 2))

        if self.params == 3:
            top_performers = []
            gen_performance = []
            for gen in range(self.generations):
                performance = SumTree(self.size)
                hp1 = np.random.randint(
                    low=self.param_one[0],
                    high=self.param_one[1],
                    size=(self.size - len(top_performers)) * 2)
                hp2 = np.random.randint(
                    low=self.param_two[0],
                    high=self.param_two[1],
                    size=(self.size - len(top_performers)) * 2)
                hp3 = np.random.randint(
                    low=self.param_three[0],
                    high=self.param_three[1],
                    size=(self.size - len(top_performers)) * 2)
                hps = np.append(
                    np.array(top_performers).reshape((-1, 6)),
                    np.dstack((hp1, hp2, hp3))).reshape((-1, 6))
                for hp in hps:  # train all models and save performance
                    print(hp)
                    temp = self.model(self.x_train, self.y_train)
                    temp.build(np.mean(np.array([hp[0], hp[3]])),
                               np.mean(np.array([hp[1], hp[4]])),
                               np.mean(np.array([hp[2], hp[5]])))
                    temp.train(self.epochs)
                    metric = temp.evaluate(self.x_test, self.y_test)
                    performance.add(metric, np.array([hp]))
                keep_metrics = np.sort(performance.p_array(
                ))[-int(self.keep):]  # array of the highest performing metrics
                hyperparameters = [
                ]  # array to store the best n=self.keep performing hyperparameters
                for metric in keep_metrics:  # note that the order of keep metrics is lowest to highest performance
                    _, __, hp_temp = performance.get(metric)
                    hyperparameters = np.append(np.array(hyperparameters),
                                                hp_temp)
                hyperparameters = hyperparameters.reshape((-1, 6))
                mated_hp1 = []
                mated_hp2 = []
                mated_hp3 = []
                for mate in hyperparameters:  # mating routine with mendellian inheritance from the two alleles
                    mated_hp1.append(np.random.choice(mate[np.array([0, 3])]))
                    mated_hp1.append(
                        np.random.choice(hyperparameters[np.random.randint(
                            len(hyperparameters))][np.array([0, 3])]))
                    mated_hp2.append(np.random.choice(mate[np.array([1, 4])]))
                    mated_hp2.append(
                        np.random.choice(hyperparameters[np.random.randint(
                            len(hyperparameters))][np.array([1, 4])]))
                    mated_hp3.append(np.random.choice(mate[np.array([2, 5])]))
                    mated_hp3.append(
                        np.random.choice(hyperparameters[np.random.randint(
                            len(hyperparameters))][np.array([2, 5])]))
                top_performers = np.dstack(
                    (np.array(mated_hp1), np.array(mated_hp2),
                     np.array(mated_hp3))).reshape((-1, 6))
                print("generation:", gen,
                      "  min performance (params, metric):",
                      hyperparameters[0], keep_metrics[0],
                      "  max performance:", hyperparameters[-1],
                      keep_metrics[-1])
                gen_performance.append([keep_metrics[0], keep_metrics[-1]])
            self.hyperparameters = hyperparameters
            self.keep_metrics = keep_metrics
            return (hyperparameters, keep_metrics,
                    np.array(gen_performance).reshape(-1, 2))

        if self.params == 4:
            top_performers = []
            gen_performance = []
            for gen in range(self.generations):
                performance = SumTree(self.size)
                hp1 = np.random.randint(
                    low=self.param_one[0],
                    high=self.param_one[1],
                    size=(self.size - len(top_performers)) * 2)
                hp2 = np.random.randint(
                    low=self.param_two[0],
                    high=self.param_two[1],
                    size=(self.size - len(top_performers)) * 2)
                hp3 = np.random.randint(
                    low=self.param_three[0],
                    high=self.param_three[1],
                    size=(self.size - len(top_performers)) * 2)
                hp4 = np.random.randint(
                    low=self.param_four[0],
                    high=self.param_four[1],
                    size=(self.size - len(top_performers)) * 2)
                hps = np.append(
                    np.array(top_performers).reshape((-1, 8)),
                    np.dstack((hp1, hp2, hp3, hp4))).reshape((-1, 8))
                for hp in hps:  # train all models and save performance
                    print(hp)
                    temp = self.model(self.x_train, self.y_train)
                    temp.build(np.mean(np.array([hp[0], hp[4]])),
                               np.mean(np.array([hp[1], hp[5]])),
                               np.mean(np.array([hp[2], hp[6]])),
                               np.mean(np.array([hp[3], hp[7]])))
                    temp.train(self.epochs)
                    metric = temp.evaluate(self.x_test, self.y_test)
                    performance.add(metric, np.array([hp]))
                keep_metrics = np.sort(performance.p_array(
                ))[-int(self.keep):]  # array of the highest performing metrics
                hyperparameters = [
                ]  # array to store the best n=self.keep performing hyperparameters
                for metric in keep_metrics:  # note that the order of keep metrics is lowest to highest performance
                    _, __, hp_temp = performance.get(metric)
                    hyperparameters = np.append(np.array(hyperparameters),
                                                hp_temp)
                hyperparameters = hyperparameters.reshape((-1, 8))
                mated_hp1 = []
                mated_hp2 = []
                mated_hp3 = []
                mated_hp4 = []
                for mate in hyperparameters:  # mating routine with mendellian inheritance from the two alleles
                    mated_hp1.append(np.random.choice(mate[np.array([0, 4])]))
                    mated_hp1.append(
                        np.random.choice(hyperparameters[np.random.randint(
                            len(hyperparameters))][np.array([0, 4])]))
                    mated_hp2.append(np.random.choice(mate[np.array([1, 5])]))
                    mated_hp2.append(
                        np.random.choice(hyperparameters[np.random.randint(
                            len(hyperparameters))][np.array([1, 5])]))
                    mated_hp3.append(np.random.choice(mate[np.array([2, 6])]))
                    mated_hp3.append(
                        np.random.choice(hyperparameters[np.random.randint(
                            len(hyperparameters))][np.array([2, 6])]))
                    mated_hp4.append(np.random.choice(mate[np.array([3, 7])]))
                    mated_hp4.append(
                        np.random.choice(hyperparameters[np.random.randint(
                            len(hyperparameters))][np.array([3, 7])]))
                top_performers = np.dstack(
                    (np.array(mated_hp1), np.array(mated_hp2),
                     np.array(mated_hp3), np.array(mated_hp4))).reshape(
                         (-1, 8))
                print("generation:", gen,
                      "  min performance (params, metric):",
                      hyperparameters[0], keep_metrics[0],
                      "  max performance:", hyperparameters[-1],
                      keep_metrics[-1])
                gen_performance.append([keep_metrics[0], keep_metrics[-1]])
            self.hyperparameters = hyperparameters
            self.keep_metrics = keep_metrics
            return (hyperparameters, keep_metrics,
                    np.array(gen_performance).reshape(-1, 2))

        if self.params == 5:
            top_performers = []
            gen_performance = []
            for gen in range(self.generations):
                performance = SumTree(self.size)
                hp1 = np.random.randint(
                    low=self.param_one[0],
                    high=self.param_one[1],
                    size=(self.size - len(top_performers)) * 2)
                hp2 = np.random.randint(
                    low=self.param_two[0],
                    high=self.param_two[1],
                    size=(self.size - len(top_performers)) * 2)
                hp3 = np.random.randint(
                    low=self.param_three[0],
                    high=self.param_three[1],
                    size=(self.size - len(top_performers)) * 2)
                hp4 = np.random.randint(
                    low=self.param_four[0],
                    high=self.param_four[1],
                    size=(self.size - len(top_performers)) * 2)
                hp5 = np.random.randint(
                    low=self.param_five[0],
                    high=self.param_five[1],
                    size=(self.size - len(top_performers)) * 2)
                hps = np.append(
                    np.array(top_performers).reshape((-1, 10)),
                    np.dstack((hp1, hp2, hp3, hp4, hp5))).reshape((-1, 10))
                for hp in hps:  # train all models and save performance
                    print(hp)
                    temp = self.model(self.x_train, self.y_train)
                    temp.build(np.mean(np.array([hp[0], hp[5]])),
                               np.mean(np.array([hp[1], hp[6]])),
                               np.mean(np.array([hp[2], hp[7]])),
                               np.mean(np.array([hp[3], hp[8]])),
                               np.mean(np.array([hp[4], hp[9]])))
                    temp.train(self.epochs)
                    metric = temp.evaluate(self.x_test, self.y_test)
                    performance.add(metric, np.array([hp]))
                keep_metrics = np.sort(performance.p_array(
                ))[-int(self.keep):]  # array of the highest performing metrics
                hyperparameters = [
                ]  # array to store the best n=self.keep performing hyperparameters
                for metric in keep_metrics:  # note that the order of keep metrics is lowest to highest performance
                    _, __, hp_temp = performance.get(metric)
                    hyperparameters = np.append(np.array(hyperparameters),
                                                hp_temp)
                hyperparameters = hyperparameters.reshape((-1, 10))
                mated_hp1 = []
                mated_hp2 = []
                mated_hp3 = []
                mated_hp4 = []
                mated_hp5 = []
                for mate in hyperparameters:  # mating routine with mendellian inheritance from the two alleles
                    mated_hp1.append(np.random.choice(mate[np.array([0, 5])]))
                    mated_hp1.append(
                        np.random.choice(hyperparameters[np.random.randint(
                            len(hyperparameters))][np.array([0, 5])]))
                    mated_hp2.append(np.random.choice(mate[np.array([1, 6])]))
                    mated_hp2.append(
                        np.random.choice(hyperparameters[np.random.randint(
                            len(hyperparameters))][np.array([1, 6])]))
                    mated_hp3.append(np.random.choice(mate[np.array([2, 7])]))
                    mated_hp3.append(
                        np.random.choice(hyperparameters[np.random.randint(
                            len(hyperparameters))][np.array([2, 7])]))
                    mated_hp4.append(np.random.choice(mate[np.array([3, 8])]))
                    mated_hp4.append(
                        np.random.choice(hyperparameters[np.random.randint(
                            len(hyperparameters))][np.array([3, 8])]))
                    mated_hp5.append(np.random.choice(mate[np.array([4, 9])]))
                    mated_hp5.append(
                        np.random.choice(hyperparameters[np.random.randint(
                            len(hyperparameters))][np.array([4, 9])]))
                top_performers = np.dstack(
                    (np.array(mated_hp1), np.array(mated_hp2),
                     np.array(mated_hp3), np.array(mated_hp4),
                     np.array(mated_hp5))).reshape((-1, 10))
                print("generation:", gen,
                      "  min performance (params, metric):",
                      hyperparameters[0], keep_metrics[0],
                      "  max performance:", hyperparameters[-1],
                      keep_metrics[-1])
                gen_performance.append([keep_metrics[0], keep_metrics[-1]])
            self.hyperparameters = hyperparameters
            self.keep_metrics = keep_metrics
            return (hyperparameters, keep_metrics,
                    np.array(gen_performance).reshape(-1, 2))
class PrioritizedBuffer:
    """Prioritized experience replay buffer backed by a SumTree.

    New transitions are stored with the current maximum leaf priority so
    they are replayed at least once; sampling splits the total priority
    mass into ``batch_size`` equal segments and draws one leaf from each.
    """

    # Record type returned (field-wise) by sample().
    experience = namedtuple("Experience", field_names=["index","IS_weight","state", "action", "reward", "next_state", "done"])
    alpha = 0.6 # mixing pure greedy prioritization and uniform random sampling
    beta = 0.4 # compensate for the non-uniform probabilities
    beta_increment_per_sampling = 0.001
    epsilon = 0.01 # small amount to avoid zero priority
    current_length = 0

    def __init__(self, size=int(1e5), batch_size=64, seed=1234):
        """
        Args:
            size: SumTree capacity (number of stored transitions).
            batch_size: minibatch size drawn by sample().
            seed: seed for the ``random`` module.
        """
        self.size = size
        self.batch_size = batch_size
        # random.seed() returns None; attribute kept for interface
        # compatibility with existing callers.
        self.seed = random.seed(seed)
        self.memory = SumTree(capacity=self.size)

    def push(self, state, action, reward, next_state, done):
        """Push a new experience with the current max leaf priority
        (1.0 for the very first one) so it gets replayed at least once."""
        max_p = self.memory.tree[-self.memory.capacity:].max()
        priority = 1.0 if self.current_length == 0 else max_p
        data = (state, action, reward, next_state, done)
        self.memory.add(priority, data)
        self.current_length = self.current_length + 1

    def sample(self):
        """Draw one stratified prioritized minibatch.

        Returns:
            (index, IS_weight, states, actions, rewards, next_states,
            dones) as numpy arrays; IS weights are normalised by their max.
        """
        sum_priority = self.memory.total()
        segment = sum_priority / self.batch_size
        samples = []
        for i in range(self.batch_size):
            a, b = segment * i, segment * (i + 1)
            s = random.uniform(a, b)
            (idx, priority, data) = self.memory.get(s)
            p = priority / sum_priority
            IS_weight = (self.batch_size * p) ** (-self.beta)
            samples.append(self.experience(idx, IS_weight, data[0], data[1], data[2], data[3], data[4]))

        # Anneal beta towards full bias correction after each batch.
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])  # max = 1

        batch = self.experience(*zip(*samples))

        index = np.asarray(batch.index)

        IS_weight = np.asarray(batch.IS_weight)
        max_weight = IS_weight.max()
        IS_weight = IS_weight / max_weight

        states = np.asarray(batch.state)
        actions = np.asarray(batch.action)
        rewards = np.asarray(batch.reward)
        next_states = np.asarray(batch.next_state)
        dones = np.asarray(batch.done).astype(np.uint8)

        return (index, IS_weight, states, actions, rewards, next_states, dones)

    def update_priority(self, idxs, td_errors):
        """Update priorities for the replayed transitions.

        Bug fix: a negative TD-error raised to the fractional power
        ``alpha`` yields NaN (numpy) or a complex-domain error, so the
        error's magnitude is used, as in Schaul et al.
        """
        for idx, td_error in zip(idxs, td_errors):
            priority = (abs(td_error) + self.epsilon) ** self.alpha
            self.memory.update(idx, priority)

    def __len__(self):
        # Total number of pushes so far (not capped at capacity).
        return self.current_length
예제 #20
0
class ReplayMemory(object):  # stored as ( s, a, r, s_ ) in SumTree
    """
    This SumTree code is modified version and the original code is from:
    https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
    """
    PER_e = 0.01  # Hyperparameter that we use to avoid some experiences to have 0 probability of being taken
    PER_a = 0.6  # Hyperparameter that we use to make a tradeoff between taking only exp with high priority and sampling randomly
    PER_b = 0.4  # importance-sampling, from initial value increasing to 1

    PER_b_increment_per_sampling = 0.001

    absolute_error_upper = 1.  # clipped abs error

    def __init__(self, capacity):
        """Build the SumTree: priority scores at the leaves plus a data array.

        A plain array with overwrite-when-full is used instead of a deque so
        stored experiences keep a stable index.
        """
        self.tree = SumTree(capacity)

    def store(self, experience):
        """Store a new experience with the current maximum leaf priority.

        New samples get max priority so each is replayed at least once; the
        priority is later refined by batch_update() from its TD-error.
        """
        max_priority = np.max(self.tree.tree[-self.tree.capacity:])

        # A max priority of 0 would give the new experience no chance of
        # ever being selected, so fall back to the clipping ceiling.
        if max_priority == 0:
            max_priority = self.absolute_error_upper

        self.tree.add(max_priority, experience)  # set the max p for new p

    def sample(self, n):
        """Sample a prioritized minibatch of size ``n``.

        [0, total_priority] is split into n equal ranges; one value is
        drawn uniformly from each and mapped to a leaf, and an
        importance-sampling weight is computed per element.

        Returns:
            (b_idx, memory_b, b_ISWeights): tree indices, the sampled
            experiences (each wrapped in a one-element list), and the
            normalised IS weights of shape (n, 1).
        """
        memory_b = []

        b_idx, b_ISWeights = np.empty((n, ), dtype=np.int32), np.empty(
            (n, 1), dtype=np.float32)

        priority_segment = self.tree.total_priority / n  # priority segment

        # Anneal PER_b towards 1 each time a minibatch is drawn.
        self.PER_b = np.min(
            [1., self.PER_b + self.PER_b_increment_per_sampling])  # max = 1

        # Max weight, used to normalise all IS weights into (0, 1].
        # NOTE(review): if any leaf still holds priority 0 (tree not yet
        # full), p_min is 0 and max_weight becomes inf -- confirm callers
        # only sample once the tree is filled.
        p_min = np.min(
            self.tree.tree[-self.tree.capacity:]) / self.tree.total_priority
        max_weight = (p_min * n)**(-self.PER_b)

        for i in range(n):
            # A value is uniformly sampled from each range.
            a, b = priority_segment * i, priority_segment * (i + 1)
            value = np.random.uniform(a, b)

            # Retrieve the experience whose cumulative priority contains it.
            index, priority, data = self.tree.get_leaf(value)

            # P(j)
            sampling_probabilities = priority / self.tree.total_priority

            # IS = (1/N * 1/P(i))**b / max wi == (N*P(i))**-b / max wi
            b_ISWeights[i, 0] = np.power(n * sampling_probabilities,
                                         -self.PER_b) / max_weight

            b_idx[i] = index

            experience = [data]

            memory_b.append(experience)

        return b_idx, memory_b, b_ISWeights

    def batch_update(self, tree_idx, abs_errors):
        """Write updated priorities for the replayed transitions.

        Bug fixes vs. the original:
        - take the error magnitude *before* adding PER_e and clipping; the
          old order (add, clip, then np.absolute) let a negative error such
          as -2 become priority |min(-1.99, 1)| = 1.99, bypassing the
          absolute_error_upper cap;
        - operate on a copy instead of mutating the caller's array with
          an in-place `+=`.
        """
        clipped_errors = np.minimum(np.abs(abs_errors) + self.PER_e,
                                    self.absolute_error_upper)
        ps = np.power(clipped_errors, self.PER_a)

        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)

    def __len__(self):
        # Number of occupied data slots (empty slots hold 0).
        return np.sum(self.tree.data != 0)