Exemplo n.º 1
0
 def build(self, samples):
     """Sort ``samples`` by length and derive the bucket layout.

     Sets ``_samples``, ``sorted_indices`` (longest sample first),
     ``bins_bucket_edges`` (``bins_count + 1`` offsets into the sorted
     order), ``min_gap`` (size of the smallest bucket), ``bins_lens``
     (longest sample length per bucket), ``choice_roulette`` (each bucket
     id repeated once per chunk needed to cover that bucket),
     ``choice_index`` and ``steps``.

     Args:
         samples: sequence (or ndarray) of samples; every sample must
             support ``len()`` or expose ``.shape[0]``.

     Raises:
         ValueError: if there are fewer samples than ``bins_count``.
     """
     if not isinstance(samples, np.ndarray):
         try:
             samples = np.array(samples)
         except ValueError:
             # NumPy >= 1.24 refuses to build a ragged array implicitly;
             # fall back to a 1-D object array holding the same items.
             ragged = np.empty(len(samples), dtype=object)
             for pos, item in enumerate(samples):
                 ragged[pos] = item
             samples = ragged
     self._samples = samples
     try:
         lens = np.array([len(x) for x in samples])
     except TypeError:
         # Some containers refuse len(); use the leading dimension instead.
         lens = np.array([x.shape[0] for x in samples])
     self.sorted_indices = np.argsort(-lens)
     lens = lens[self.sorted_indices]
     # The sort order just changed, so any cached sorted labels are stale.
     self._sorted_y = None

     n_samples = self._samples.shape[0]
     bin_width = n_samples // self.bins_count
     if bin_width < 1:
         raise ValueError(
             "need at least bins_count (%d) samples, got %d" %
             (self.bins_count, n_samples))
     # Exactly bins_count buckets; the last one absorbs the remainder so
     # that no sample falls outside the buckets.
     edges = [i * bin_width for i in range(self.bins_count)]
     edges.append(n_samples)
     self.bins_bucket_edges = edges

     bucket_sizes = [
         edges[i + 1] - edges[i] for i in range(self.bins_count)
     ]
     self.min_gap = min(bucket_sizes)
     self.bins_lens = [
         np.max(lens[edges[i]:edges[i + 1]])
         for i in range(self.bins_count)
     ]
     # One roulette entry per chunk needed to walk through each bucket.
     self.choice_roulette = [
         i
         for i, size in enumerate(bucket_sizes)
         for _ in range(int(np.ceil(size / self.chunk_size)))
     ]
     self.choice_index = MutexVariable(0, name="choice_index")
     if self.roulette_cycle is None:
         self.steps = len(self.choice_roulette)
     else:
         self.steps = len(self.roulette_cycle)
Exemplo n.º 2
0
class SimpleInMemorySamplePool(SamplePool):
    """Sample pool serving fixed-size chunks from an in-memory sequence.

    Chunks are taken consecutively; when a chunk runs past the end of the
    data it wraps around and continues from the start.
    """

    def __init__(self, samples, chunk_size):
        """Store ``samples`` and start the shared cursor at position 0.

        Args:
            samples: sliceable sequence of samples.
            chunk_size: forwarded to the parent constructor.
        """
        super(SimpleInMemorySamplePool, self).__init__(chunk_size)
        self.samples = samples
        self.iter_index = MutexVariable(0)

    def reset(self):
        """Restart iteration from position 0 with a fresh cursor."""
        self.iter_index = MutexVariable(0)

    def extend(self, samplepool_to_extend):
        """Append the samples of ``samplepool_to_extend`` to this pool."""
        super(SimpleInMemorySamplePool, self).extend(samplepool_to_extend)
        self.samples = np.concatenate(
            [self.samples, samplepool_to_extend.samples])

    def __next__(self):
        """Return the next chunk and the positions it was taken from.

        Returns:
            tuple: ``(batch_samples, chunk_indices_list)`` where
            ``chunk_indices_list`` is an integer ndarray of positions in
            ``self.samples``.

        Raises:
            ZeroDivisionError: if the pool holds no samples.
        """
        chunk_size = self.chunk_size
        samples = self.samples
        n_samples = len(samples)

        # Advance the shared cursor under the lock; try/finally guarantees
        # the lock is released even if the arithmetic raises.
        self.iter_index.acquire()
        try:
            start = self.iter_index.value
            wrapped_stop = (start + chunk_size) % n_samples
            self.iter_index.value = wrapped_stop
        finally:
            self.iter_index.release()

        stop = start + chunk_size
        if stop <= n_samples:
            return samples[start:stop], np.arange(start, stop)

        # The chunk runs past the end: take the tail, then wrap to the head.
        batch_samples = np.concatenate(
            [samples[start:], samples[:wrapped_stop]])
        chunk_indices_list = np.concatenate(
            [np.arange(start, n_samples), np.arange(0, wrapped_stop)])
        return batch_samples, chunk_indices_list
Exemplo n.º 3
0
    def __init__(self,
                 samples,
                 bins_count,
                 batch_size=1000,
                 pading_val=0,
                 unkown_word_indicator=0,
                 mode="random",
                 y_is_sequence=False,
                 cut=True,
                 roulette_cycle=None,
                 y=None,
                 get_y_in_batch=True,
                 get_sequence_len_in_batch=True,
                 unroll_num=-1):
        """Record the configuration, optionally unroll, then build buckets.

        The parent constructor receives a placeholder array and
        ``batch_size`` as its chunk size; the real samples are handed to
        ``self.build``. When ``unroll_num > 0`` the samples (and ``y``) are
        first passed through ``Unroller``; with ``y_is_sequence`` set, ``y``
        is unrolled on its own and the sequence lengths come from that
        second unroller.
        """
        super(AutoPaddingInMemorySamplePool,
              self).__init__(samples=np.ndarray(1), chunk_size=batch_size)
        self.bins_count = bins_count
        self.pading_val = pading_val
        self.unkown_word_indicator = unkown_word_indicator
        # One independent cursor per bucket.
        self.iter_index_map = {
            bin_id: MutexVariable(0, name="iter_index_%s" % bin_id)
            for bin_id in range(self.bins_count)
        }
        assert mode in ["random", "specific"]
        self.mode = mode
        self.cut = cut
        self.roulette_cycle = roulette_cycle
        self.y = y
        self._sorted_y = None
        self.get_y_in_batch = get_y_in_batch
        self.get_sequence_len_in_batch = get_sequence_len_in_batch
        self._sequence_len = None
        self.unroll_num = unroll_num
        self.y_is_sequence = y_is_sequence

        if unroll_num > 0:
            if y_is_sequence:
                samples = Unroller(samples, None, unroll_num=unroll_num).X
                y_unroller = Unroller(y, None, unroll_num=unroll_num)
                self.y = y_unroller.X
                self._sequence_len = y_unroller.X_sequence_len
            else:
                xy_unroller = Unroller(samples, y, unroll_num=unroll_num)
                samples = xy_unroller.X
                self.y = xy_unroller.y
                self._sequence_len = xy_unroller.X_sequence_len

        self.build(samples)
Exemplo n.º 4
0
 def reset(self):
     """Restart iteration by installing a fresh cursor holding 0."""
     fresh_cursor = MutexVariable(0)
     self.iter_index = fresh_cursor
Exemplo n.º 5
0
 def __init__(self, samples, chunk_size):
     """Keep ``samples`` in memory and start the cursor at position 0.

     ``chunk_size`` is forwarded to the parent constructor.
     """
     parent = super(SimpleInMemorySamplePool, self)
     parent.__init__(chunk_size)
     self.samples = samples
     self.iter_index = MutexVariable(0)
Exemplo n.º 6
0
 def reset(self):
     """Rewind every per-bucket cursor and the roulette position to 0."""
     self.iter_index_map = {
         bin_id: MutexVariable(0, name="iter_index_%s" % bin_id)
         for bin_id in range(self.bins_count)
     }
     self.choice_index = MutexVariable(0, name="choice_index")
Exemplo n.º 7
0
class AutoPaddingInMemorySamplePool(InMemorySamplePool):
    """Sample pool that buckets samples by length and pads each batch.

    Samples are sorted longest-first and split into ``bins_count`` buckets.
    Every batch is drawn from a single bucket and returned as an int32
    array whose width is the length of the longest sample in that bucket.
    """

    def __init__(self,
                 samples,
                 bins_count,
                 batch_size=1000,
                 pading_val=0,
                 unkown_word_indicator=0,
                 mode="random",
                 y_is_sequence=False,
                 cut=True,
                 roulette_cycle=None,
                 y=None,
                 get_y_in_batch=True,
                 get_sequence_len_in_batch=True,
                 unroll_num=-1):
        """Record the configuration, optionally unroll, then build buckets.

        Args:
            samples: sequence of samples; each must support ``len()`` or
                expose ``.shape[0]``.
            bins_count: number of length buckets.
            batch_size: passed to the parent as ``chunk_size``.
            pading_val: stored on the instance.
                NOTE(review): never read in this class; padding in
                ``__next__`` uses ``unkown_word_indicator`` instead.
            unkown_word_indicator: value used to right-pad short samples.
            mode: ``"random"`` draws a bucket at random for every batch;
                ``"specific"`` walks ``choice_roulette``.
            y_is_sequence: when unrolling, unroll ``y`` separately.
            cut: forwarded to the parent ``__next__`` in ``"specific"``
                mode (``"random"`` mode always passes ``False``).
            roulette_cycle: optional sequence of positions into
                ``choice_roulette`` to cycle through in ``"specific"`` mode.
            y: optional labels aligned with ``samples``.
            get_y_in_batch: append the batch labels to the result.
            get_sequence_len_in_batch: append sequence lengths to the result.
            unroll_num: when > 0, pass the data through ``Unroller`` first.
        """
        super(AutoPaddingInMemorySamplePool,
              self).__init__(samples=np.ndarray(1), chunk_size=batch_size)
        self.bins_count = bins_count
        self.pading_val = pading_val
        self.unkown_word_indicator = unkown_word_indicator
        # One independent cursor per bucket.
        self.iter_index_map = {
            bin_id: MutexVariable(0, name="iter_index_%s" % bin_id)
            for bin_id in range(self.bins_count)
        }
        assert mode in ["random", "specific"]
        self.mode = mode
        self.cut = cut
        self.roulette_cycle = roulette_cycle
        self.y = y
        self._sorted_y = None
        self.get_y_in_batch = get_y_in_batch
        self.get_sequence_len_in_batch = get_sequence_len_in_batch
        self._sequence_len = None
        self.unroll_num = unroll_num
        self.y_is_sequence = y_is_sequence
        if unroll_num > 0:
            if not y_is_sequence:
                unroller = Unroller(samples, y, unroll_num=unroll_num)
                samples = unroller.X
                self.y = unroller.y
                self._sequence_len = unroller.X_sequence_len
            else:
                unroller = Unroller(samples, None, unroll_num=unroll_num)
                samples = unroller.X
                unroller = Unroller(y, None, unroll_num=unroll_num)
                self.y = unroller.X
                self._sequence_len = unroller.X_sequence_len

        self.build(samples)

    def build(self, samples):
        """Sort ``samples`` by length and derive the bucket layout.

        Sets ``_samples``, ``sorted_indices`` (longest sample first),
        ``bins_bucket_edges`` (``bins_count + 1`` offsets into the sorted
        order), ``min_gap`` (size of the smallest bucket), ``bins_lens``
        (longest sample length per bucket), ``choice_roulette`` (each
        bucket id repeated once per chunk needed to cover that bucket),
        ``choice_index`` and ``steps``.

        Raises:
            ValueError: if there are fewer samples than ``bins_count``.
        """
        if not isinstance(samples, np.ndarray):
            try:
                samples = np.array(samples)
            except ValueError:
                # NumPy >= 1.24 refuses to build a ragged array implicitly;
                # fall back to a 1-D object array holding the same items.
                ragged = np.empty(len(samples), dtype=object)
                for pos, item in enumerate(samples):
                    ragged[pos] = item
                samples = ragged
        self._samples = samples
        try:
            lens = np.array([len(x) for x in samples])
        except TypeError:
            # Some containers refuse len(); use the leading dimension.
            lens = np.array([x.shape[0] for x in samples])
        self.sorted_indices = np.argsort(-lens)
        lens = lens[self.sorted_indices]
        # The sort order just changed, so cached sorted labels are stale.
        self._sorted_y = None

        n_samples = self._samples.shape[0]
        bin_width = n_samples // self.bins_count
        if bin_width < 1:
            raise ValueError(
                "need at least bins_count (%d) samples, got %d" %
                (self.bins_count, n_samples))
        # Exactly bins_count buckets; the last one absorbs the remainder so
        # that no sample falls outside the buckets.
        edges = [i * bin_width for i in range(self.bins_count)]
        edges.append(n_samples)
        self.bins_bucket_edges = edges

        bucket_sizes = [
            edges[i + 1] - edges[i] for i in range(self.bins_count)
        ]
        self.min_gap = min(bucket_sizes)
        self.bins_lens = [
            np.max(lens[edges[i]:edges[i + 1]])
            for i in range(self.bins_count)
        ]
        # One roulette entry per chunk needed to walk through each bucket.
        self.choice_roulette = [
            i
            for i, size in enumerate(bucket_sizes)
            for _ in range(int(np.ceil(size / self.chunk_size)))
        ]
        self.choice_index = MutexVariable(0, name="choice_index")
        if self.roulette_cycle is None:
            self.steps = len(self.choice_roulette)
        else:
            self.steps = len(self.roulette_cycle)

    def reset(self):
        """Rewind every per-bucket cursor and the roulette position to 0."""
        self.iter_index_map = {
            bin_id: MutexVariable(0, name="iter_index_%s" % bin_id)
            for bin_id in range(self.bins_count)
        }
        self.choice_index = MutexVariable(0, name="choice_index")

    def extend(self, samplepool_to_extend):
        """Append another pool's samples and rebuild the buckets.

        NOTE(review): only the samples are concatenated here; ``self.y`` is
        left untouched unless the parent ``extend`` handles it. Confirm.
        """
        super(AutoPaddingInMemorySamplePool, self).extend(samplepool_to_extend)
        if isinstance(samplepool_to_extend, AutoPaddingInMemorySamplePool):
            self.build(
                np.concatenate([self._samples, samplepool_to_extend._samples]))
        else:
            self.build(
                np.concatenate([self._samples, samplepool_to_extend.samples]))

    @property
    def sorted_samples(self):
        """Samples reordered longest-first."""
        return self._samples[self.sorted_indices]

    @property
    def sorted_y(self):
        """Labels as int32, reordered like ``sorted_samples`` (cached).

        Raises:
            Exception: if no labels were supplied.
        """
        if self.y is None:
            raise Exception("y is None")
        if self._sorted_y is None:
            self._sorted_y = np.array(self.y,
                                      dtype=np.int32)[self.sorted_indices]
        return self._sorted_y

    @property
    def sequence_len(self):
        """Sequence lengths produced by unrolling.

        A list is converted to an ndarray and reordered by
        ``sorted_indices`` on first access; any other type is returned
        unchanged.

        Raises:
            Exception: if no unrolling took place.
        """
        if self._sequence_len is None:
            raise Exception(
                "sequence_len is None, only available when unroll_num > 0")
        if isinstance(self._sequence_len, list):
            self._sequence_len = np.array(
                self._sequence_len)[self.sorted_indices]
        return self._sequence_len

    # parallel on pool level, serial on bucket level
    def __next__(self):
        """Draw one padded batch from a single bucket.

        Returns:
            list: ``[padded_samples]``, followed by the batch labels when
            ``get_y_in_batch`` is set and labels exist, followed by a
            ``(batch, 1)`` array of sequence lengths when
            ``get_sequence_len_in_batch`` is set.
        """
        if self.mode == "random":
            start_index_i = np.random.choice(self.bins_count, 1)[0]
            cut = False
        else:
            cycle = self.roulette_cycle
            # Pick the bucket and advance the roulette under the lock;
            # try/finally keeps the lock from leaking on an indexing error.
            self.choice_index.acquire()
            try:
                position = self.choice_index.value
                if cycle is None:
                    start_index_i = self.choice_roulette[position]
                    period = len(self.choice_roulette)
                else:
                    start_index_i = self.choice_roulette[cycle[position]]
                    period = len(cycle)
                self.choice_index.value = (position + 1) % period
            finally:
                self.choice_index.release()
            cut = self.cut

        start_index = self.bins_bucket_edges[start_index_i]
        end_index = self.bins_bucket_edges[start_index_i + 1]
        samples = self._samples[self.sorted_indices[start_index:end_index]]
        iter_index = self.iter_index_map[start_index_i]
        if isinstance(iter_index, MutexVariable):
            # Original author's note: released inside the parent __next__.
            iter_index.acquire()
        batch_samples, chunk_indices_list = super(
            AutoPaddingInMemorySamplePool,
            self).__next__(cut=cut,
                           samples=samples,
                           iter_index=iter_index,
                           return_chunk_indices_list=True)
        self.iter_index_map[start_index_i] = iter_index

        sparse_types = (coo_matrix, csr_matrix, csc_matrix)
        batch_len = self.bins_lens[start_index_i]
        if isinstance(batch_samples[0], sparse_types):
            return_batch_samples = np.zeros(
                (len(batch_samples), batch_len, batch_samples[0].shape[1]),
                dtype=np.int32)
        else:
            return_batch_samples = np.zeros((len(batch_samples), batch_len),
                                            dtype=np.int32)
        cur_batch_sequence_len = np.zeros(len(batch_samples))
        for i, sample in enumerate(batch_samples):
            if isinstance(sample, sparse_types):
                # Scatter the stored entries into the dense batch slot.
                return_batch_samples[i, sample.row,
                                     sample.col] = sample.data  # 1
                if self.unroll_num < 0:
                    cur_batch_sequence_len[i] = len(sample.data)
            else:
                if len(sample) < batch_len:
                    padding_size = batch_len - len(sample)
                    sample = np.pad(
                        sample, [(0, padding_size)],
                        "constant",
                        constant_values=[self.unkown_word_indicator] * 2)
                return_batch_samples[i] = sample
                if self.unroll_num < 0:
                    # NOTE(review): measured after padding, so this is the
                    # padded length; confirm that is intended.
                    cur_batch_sequence_len[i] = len(sample)

        # Convert bucket-relative positions into positions in sorted order.
        chunk_indices_list += start_index
        # unroll > 0
        if self.unroll_num > 0:
            cur_batch_sequence_len = self.sequence_len[chunk_indices_list]
        return_tuple = [return_batch_samples]
        # Explicit check: truth-testing a multi-element ndarray raises.
        has_labels = self.y is not None and len(self.y) > 0
        if self.get_y_in_batch and has_labels:
            return_tuple.append(self.sorted_y[chunk_indices_list])
        if self.get_sequence_len_in_batch:
            return_tuple.append(cur_batch_sequence_len.reshape((-1, 1)))
        return return_tuple