Exemplo n.º 1
0
    def get_batches(self,
                    batch_size,
                    num_batches=None,
                    shuffle=False,
                    cluster=False):
        """Yield ``(batch_idxs, DataSet)`` pairs over the valid examples.

        :param batch_size: number of examples per batch.
        :param num_batches: total batches to yield; defaults to one epoch's
            worth (``ceil(num_examples / batch_size)``).
        :param shuffle: randomize example order each epoch.
        :param cluster: cluster examples by their lengths; this might give
            performance boost (i.e. faster training). Only effective when
            ``shuffle`` is True.
        :return: generator of ``(batch_idxs, DataSet)`` tuples.
        """
        num_batches_per_epoch = int(math.ceil(self.num_examples / batch_size))
        if num_batches is None:
            num_batches = num_batches_per_epoch
        num_epochs = int(math.ceil(num_batches / num_batches_per_epoch))

        if shuffle:
            random_idxs = random.sample(self.valid_idxs, len(self.valid_idxs))
            if cluster:
                # Sort by length so each batch holds similar-length examples,
                # then shuffle the order of the batches themselves.
                sorted_idxs = sorted(random_idxs, key=self._sort_key)
                sorted_grouped = lambda: list(grouper(sorted_idxs, batch_size))
                grouped = lambda: random.sample(sorted_grouped(),
                                                num_batches_per_epoch)
            else:
                random_grouped = lambda: list(grouper(random_idxs, batch_size))
                grouped = random_grouped
        else:
            raw_grouped = lambda: list(grouper(self.valid_idxs, batch_size))
            grouped = raw_grouped

        batch_idx_tuples = itertools.chain.from_iterable(
            grouped() for _ in range(num_epochs))
        for _ in range(num_batches):
            # grouper pads its final group with None; drop the padding.
            batch_idxs = tuple(i for i in next(batch_idx_tuples)
                               if i is not None)
            batch_data = self.get_by_idxs(batch_idxs)
            shared_batch_data = {}
            for key, val in batch_data.items():
                # Keys prefixed with '*' are references into self.shared;
                # resolve them to the actual shared values.
                # NOTE: removed leftover debug instrumentation that appended
                # every resolved item to a "debugfile" on disk and printed
                # the first element of each shared list per batch.
                if key.startswith('*'):
                    assert self.shared is not None
                    shared_key = key[1:]
                    shared_batch_data[shared_key] = [
                        index(self.shared[shared_key], each) for each in val
                    ]
            batch_data.update(shared_batch_data)

            batch_ds = DataSet(batch_data, self.data_type, shared=self.shared)
            yield batch_idxs, batch_ds
Exemplo n.º 2
0
    def get_batches(self, batch_size, num_batches=None, shuffle=False):
        """Yield ``(batch_idxs, DataSet)`` pairs of up to ``batch_size`` examples."""
        per_epoch = int(math.ceil(self.num_examples / batch_size))
        if num_batches is None:
            num_batches = per_epoch
        epochs = int(math.ceil(num_batches / per_epoch))

        def epoch_order():
            # A fresh permutation per epoch when shuffling.
            if shuffle:
                return random.sample(self.valid_idxs, len(self.valid_idxs))
            return self.valid_idxs

        idx_stream = itertools.chain.from_iterable(
            epoch_order() for _ in range(epochs))
        for _ in range(num_batches):
            batch_idxs = tuple(itertools.islice(idx_stream, batch_size))
            batch_data = {}
            for key, val in self.data.items():
                if not key.startswith('*'):
                    batch_data[key] = [val[idx] for idx in batch_idxs]
                    continue
                # '*'-prefixed keys hold references into the shared store.
                assert self.shared is not None
                shared_key = key[1:]
                batch_data[shared_key] = [
                    index(self.shared[shared_key], val[idx])
                    for idx in batch_idxs
                ]

            yield batch_idxs, DataSet(batch_data, self.data_type,
                                      shared=self.shared)
Exemplo n.º 3
0
    def get_batches(self,
                    batch_size,
                    num_batches=None,
                    shuffle=False,
                    cluster=False):
        """Yield ``(batch_idxs, DataSet)`` pairs.

        :param batch_size: examples per batch.
        :param num_batches: total batches to yield; defaults to one epoch.
        :param shuffle: randomize example order.
        :param cluster: cluster examples by their lengths; this might give
            performance boost (i.e. faster training).
        :return: generator of ``(batch_idxs, DataSet)``.
        """
        per_epoch = int(math.ceil(self.num_examples / batch_size))
        if num_batches is None:
            num_batches = per_epoch
        epochs = int(math.ceil(num_batches / per_epoch))

        if shuffle:
            shuffled = random.sample(self.valid_idxs, len(self.valid_idxs))
            if cluster:
                # Similar-length examples grouped together, with the order
                # of the batches themselves randomized.
                by_length = sorted(shuffled, key=self._sort_key)
                grouped = lambda: random.sample(
                    list(grouper(by_length, batch_size)), per_epoch)
            else:
                grouped = lambda: list(grouper(shuffled, batch_size))
        else:
            grouped = lambda: list(grouper(self.valid_idxs, batch_size))

        # chain.from_iterable lazily concatenates one epoch's worth of
        # batch-index groups after another.
        batch_idx_tuples = itertools.chain.from_iterable(
            grouped() for _ in range(epochs))
        for _ in range(num_batches):
            # grouper pads its final group with None; strip the padding.
            batch_idxs = tuple(idx for idx in next(batch_idx_tuples)
                               if idx is not None)
            batch_data = self.get_by_idxs(batch_idxs)
            resolved = {}
            for key, val in batch_data.items():
                if key.startswith('*'):
                    # '*'-prefixed keys index into the shared store.
                    assert self.shared is not None
                    shared_key = key[1:]
                    resolved[shared_key] = [
                        index(self.shared[shared_key], each) for each in val
                    ]
            batch_data.update(resolved)

            yield batch_idxs, DataSet(batch_data, self.data_type,
                                      shared=self.shared)
Exemplo n.º 4
0
    def get_batches(self, batch_size, num_batches=None, shuffle=False, cluster=False):
        """Generate ``(batch_idxs, DataSet)`` pairs.

        :param batch_size: examples per batch.
        :param num_batches: batches to yield in total; defaults to one full epoch.
        :param shuffle: randomize example order.
        :param cluster: cluster examples by their lengths; this might give performance boost (i.e. faster training).
        :return: generator of ``(batch_idxs, DataSet)``.
        """
        batches_per_epoch = int(math.ceil(self.num_examples / batch_size))
        num_batches = batches_per_epoch if num_batches is None else num_batches
        epoch_count = int(math.ceil(num_batches / batches_per_epoch))

        if not shuffle:
            make_groups = lambda: list(grouper(self.valid_idxs, batch_size))
        else:
            permuted = random.sample(self.valid_idxs, len(self.valid_idxs))
            if cluster:
                # Group similar-length examples together, then shuffle the batches.
                length_ordered = sorted(permuted, key=self._sort_key)
                make_groups = lambda: random.sample(list(grouper(length_ordered, batch_size)), batches_per_epoch)
            else:
                make_groups = lambda: list(grouper(permuted, batch_size))

        group_stream = itertools.chain.from_iterable(make_groups() for _ in range(epoch_count))
        for _ in range(num_batches):
            batch_idxs = tuple(i for i in next(group_stream) if i is not None)  # drop grouper's None padding
            batch_data = self.get_by_idxs(batch_idxs)
            dereferenced = {}
            for key, val in batch_data.items():
                if not key.startswith('*'):
                    continue
                # '*'-keys hold references into the shared store.
                assert self.shared is not None
                dereferenced[key[1:]] = [index(self.shared[key[1:]], each) for each in val]
            batch_data.update(dereferenced)

            yield batch_idxs, DataSet(batch_data, self.data_type, shared=self.shared)
Exemplo n.º 5
0
    def get_batches(self, batch_size, num_batches=None, shuffle=False):
        """Yield ``(batch_idxs, DataSet)`` pairs drawn from ``self.data``."""
        batches_per_epoch = int(math.ceil(self.num_examples / batch_size))
        num_batches = batches_per_epoch if num_batches is None else num_batches
        epoch_count = int(math.ceil(num_batches / batches_per_epoch))

        # One lazily-evaluated ordering per epoch (re-sampled each epoch when shuffling).
        idx_stream = itertools.chain.from_iterable(
            (random.sample(self.valid_idxs, len(self.valid_idxs)) if shuffle else self.valid_idxs)
            for _ in range(epoch_count))
        for _ in range(num_batches):
            batch_idxs = tuple(itertools.islice(idx_stream, batch_size))
            batch_data = {}
            for key, val in self.data.items():
                if key.startswith('*'):
                    # '*'-keys reference entries in the shared store.
                    assert self.shared is not None
                    batch_data[key[1:]] = [index(self.shared[key[1:]], val[i]) for i in batch_idxs]
                else:
                    batch_data[key] = [val[i] for i in batch_idxs]

            yield batch_idxs, DataSet(batch_data, self.data_type, shared=self.shared)
Exemplo n.º 6
0
    def get_batches(self,
                    batch_size,
                    num_batches=None,
                    shuffle=False,
                    cluster=False):
        """Yield ``(batch_idxs, DataSet)`` batches, optionally loading
        shared data from per-example JSON files on disk.

        :param batch_size: examples per batch.
        :param num_batches: total batches to yield; defaults to one epoch.
        :param shuffle: randomize example order.
        :param cluster: cluster examples by their lengths; this might give
            performance boost (i.e. faster training).
        :return: generator of ``(batch_idxs, DataSet)``.
        """
        per_epoch = int(math.ceil(self.num_examples / batch_size))
        if num_batches is None:
            num_batches = per_epoch
        epochs = int(math.ceil(num_batches / per_epoch))

        if not shuffle:
            grouped = lambda: list(grouper(self.valid_idxs, batch_size))
        else:
            shuffled = random.sample(self.valid_idxs, len(self.valid_idxs))
            if cluster:
                # Similar-length examples batched together; batch order shuffled.
                by_length = sorted(shuffled, key=self._sort_key)
                grouped = lambda: random.sample(
                    list(grouper(by_length, batch_size)), per_epoch)
            else:
                grouped = lambda: list(grouper(shuffled, batch_size))

        batch_idx_tuples = itertools.chain.from_iterable(
            grouped() for _ in range(epochs))

        def _load_shared_json(pos):
            # shared_path is a %-format template; both parts are zero-padded.
            path = self.shared_path % (str(pos[0]).zfill(3),
                                       str(pos[1]).zfill(3) + ".json")
            with open(path, 'r') as fd:
                return json.load(fd)

        for _ in range(num_batches):
            # grouper pads its final group with None; filter the padding out.
            batch_idxs = tuple(i for i in next(batch_idx_tuples)
                               if i is not None)
            batch_data = self.get_by_idxs(batch_idxs)
            resolved = {}

            if self.load_shared:
                # Shared data lives on disk; '*x' holds per-example positions.
                shared_list = [_load_shared_json(p) for p in batch_data['*x']]
                for k in ['p', 'x', 'cx']:
                    resolved[k] = [sh[k] for sh in shared_list]
            else:
                for key, val in batch_data.items():
                    if key.startswith('*'):
                        # '*'-keys reference entries in the in-memory store.
                        assert self.shared is not None
                        shared_key = key[1:]
                        resolved[shared_key] = [
                            index(self.shared[shared_key], each)
                            for each in val
                        ]
            batch_data.update(resolved)

            yield batch_idxs, DataSet(batch_data,
                                      self.data_type,
                                      self.shared_path,
                                      load_shared=self.load_shared,
                                      shared=self.shared)