def get_batches(self, batch_size, num_batches=None, shuffle=False, cluster=False):
    """Yield ``(batch_idxs, DataSet)`` pairs of ``batch_size`` examples each.

    Fix: removed leftover debug instrumentation — the original opened
    ``"debugfile"`` in append mode for every shared element of every batch
    (unbounded file growth, heavy I/O in the inner loop) and printed the
    first resolved shared item per key. Neither affected the yielded data.

    :param batch_size: number of examples per yielded batch.
    :param num_batches: total batches to yield; defaults to one epoch's worth.
    :param shuffle: reshuffle example order at every epoch boundary.
    :param cluster: cluster examples by their lengths; this might give
        performance boost (i.e. faster training).
    :return: generator of ``(batch_idxs, DataSet)`` tuples.
    """
    num_batches_per_epoch = int(math.ceil(self.num_examples / batch_size))
    if num_batches is None:
        num_batches = num_batches_per_epoch
    # Number of passes over the data needed to emit num_batches batches.
    num_epochs = int(math.ceil(num_batches / num_batches_per_epoch))
    if shuffle:
        random_idxs = random.sample(self.valid_idxs, len(self.valid_idxs))
        if cluster:
            # Sort by example length so each batch holds similar lengths,
            # then shuffle the order of the batches (not their contents).
            sorted_idxs = sorted(random_idxs, key=self._sort_key)
            sorted_grouped = lambda: list(grouper(sorted_idxs, batch_size))
            grouped = lambda: random.sample(sorted_grouped(), num_batches_per_epoch)
        else:
            random_grouped = lambda: list(grouper(random_idxs, batch_size))
            grouped = random_grouped
    else:
        raw_grouped = lambda: list(grouper(self.valid_idxs, batch_size))
        grouped = raw_grouped
    batch_idx_tuples = itertools.chain.from_iterable(
        grouped() for _ in range(num_epochs))
    for _ in range(num_batches):
        # grouper pads the final group with None; strip the padding here.
        batch_idxs = tuple(i for i in next(batch_idx_tuples) if i is not None)
        batch_data = self.get_by_idxs(batch_idxs)
        shared_batch_data = {}
        for key, val in batch_data.items():
            if key.startswith('*'):
                # '*'-prefixed keys hold references into self.shared;
                # resolve them to the actual shared objects.
                assert self.shared is not None
                shared_key = key[1:]
                shared_batch_data[shared_key] = [
                    index(self.shared[shared_key], each) for each in val
                ]
        batch_data.update(shared_batch_data)
        batch_ds = DataSet(batch_data, self.data_type, shared=self.shared)
        yield batch_idxs, batch_ds
def get_batches(self, batch_size, num_batches=None, shuffle=False):
    """Yield ``(batch_idxs, DataSet)`` pairs of ``batch_size`` examples each.

    :param batch_size: number of examples per yielded batch.
    :param num_batches: total batches to yield; defaults to one epoch's worth.
    :param shuffle: reshuffle example order at every epoch boundary.
    :return: generator of ``(batch_idxs, DataSet)`` tuples.
    """
    per_epoch = int(math.ceil(self.num_examples / batch_size))
    if num_batches is None:
        num_batches = per_epoch
    epochs_needed = int(math.ceil(num_batches / per_epoch))

    def epoch_order():
        # A fresh random permutation per epoch when shuffling,
        # otherwise the fixed order of valid indices.
        if shuffle:
            return random.sample(self.valid_idxs, len(self.valid_idxs))
        return self.valid_idxs

    index_stream = itertools.chain.from_iterable(
        epoch_order() for _ in range(epochs_needed))
    for _ in range(num_batches):
        chosen = tuple(itertools.islice(index_stream, batch_size))
        assembled = {}
        for name, column in self.data.items():
            if name.startswith('*'):
                # '*'-prefixed columns hold references into the shared store.
                assert self.shared is not None
                target = name[1:]
                assembled[target] = [index(self.shared[target], column[i])
                                     for i in chosen]
            else:
                assembled[name] = [column[i] for i in chosen]
        yield chosen, DataSet(assembled, self.data_type, shared=self.shared)
def get_batches(self, batch_size, num_batches=None, shuffle=False, cluster=False):
    """
    Yield ``(batch_idxs, DataSet)`` pairs of ``batch_size`` examples each.

    :param batch_size: number of examples per yielded batch.
    :param num_batches: total batches to yield; defaults to one epoch's worth.
    :param shuffle: reshuffle example order at every epoch boundary.
    :param cluster: cluster examples by their lengths; this might give
        performance boost (i.e. faster training).
    :return: generator of ``(batch_idxs, DataSet)`` tuples.
    """
    num_batches_per_epoch = int(math.ceil(self.num_examples / batch_size))
    if num_batches is None:
        num_batches = num_batches_per_epoch
    # Number of passes over the data needed to emit num_batches batches.
    num_epochs = int(math.ceil(num_batches / num_batches_per_epoch))
    if shuffle:
        random_idxs = random.sample(self.valid_idxs, len(self.valid_idxs))
        if cluster:
            # Sort by example length so each batch holds similar lengths,
            # then shuffle the order of the batches (not their contents).
            sorted_idxs = sorted(random_idxs, key=self._sort_key)
            sorted_grouped = lambda: list(grouper(sorted_idxs, batch_size))
            grouped = lambda: random.sample(sorted_grouped(), num_batches_per_epoch)
        else:
            random_grouped = lambda: list(grouper(random_idxs, batch_size))
            grouped = random_grouped
    else:
        raw_grouped = lambda: list(grouper(self.valid_idxs, batch_size))
        grouped = raw_grouped
    batch_idx_tuples = itertools.chain.from_iterable(
        grouped() for _ in range(num_epochs))
    # chain.from_iterable(iterables): an alternate chain constructor whose
    # single argument is an iterable producing the sequences to be chained;
    # it lazily flattens them, equivalent to this generator snippet:
    # >>> from itertools import chain
    # >>> test = chain.from_iterable('ABCDEF')
    # >>> test.next()
    # 'A'
    for _ in range(num_batches):
        # grouper pads the final group with None; strip the padding here.
        batch_idxs = tuple(i for i in next(batch_idx_tuples) if i is not None)
        batch_data = self.get_by_idxs(batch_idxs)
        shared_batch_data = {}
        for key, val in batch_data.items():
            if key.startswith('*'):
                # '*'-prefixed keys hold references into self.shared;
                # resolve them to the actual shared objects.
                assert self.shared is not None
                shared_key = key[1:]
                shared_batch_data[shared_key] = [
                    index(self.shared[shared_key], each) for each in val
                ]
        batch_data.update(shared_batch_data)
        batch_ds = DataSet(batch_data, self.data_type, shared=self.shared)
        yield batch_idxs, batch_ds
def get_batches(self, batch_size, num_batches=None, shuffle=False, cluster=False):
    """Yield ``(batch_idxs, DataSet)`` pairs of ``batch_size`` examples each.

    :param batch_size: number of examples per yielded batch.
    :param num_batches: total batches to yield; defaults to one epoch's worth.
    :param shuffle: reshuffle example order at every epoch boundary.
    :param cluster: cluster examples by their lengths; this might give
        performance boost (i.e. faster training).
    :return: generator of ``(batch_idxs, DataSet)`` tuples.
    """
    per_epoch = int(math.ceil(self.num_examples / batch_size))
    if num_batches is None:
        num_batches = per_epoch
    epochs_needed = int(math.ceil(num_batches / per_epoch))

    if not shuffle:
        def grouped():
            return list(grouper(self.valid_idxs, batch_size))
    else:
        shuffled = random.sample(self.valid_idxs, len(self.valid_idxs))
        if cluster:
            # Batches hold similar-length examples; batch order is random.
            by_length = sorted(shuffled, key=self._sort_key)

            def grouped():
                return random.sample(list(grouper(by_length, batch_size)),
                                     per_epoch)
        else:
            def grouped():
                return list(grouper(shuffled, batch_size))

    tuple_stream = itertools.chain.from_iterable(
        grouped() for _ in range(epochs_needed))
    for _ in range(num_batches):
        # grouper pads the final group with None; strip the padding here.
        picked = tuple(i for i in next(tuple_stream) if i is not None)
        payload = self.get_by_idxs(picked)
        resolved = {}
        for key, values in payload.items():
            if not key.startswith('*'):
                continue
            # '*'-prefixed keys hold references into self.shared.
            assert self.shared is not None
            real_key = key[1:]
            resolved[real_key] = [index(self.shared[real_key], v)
                                  for v in values]
        payload.update(resolved)
        yield picked, DataSet(payload, self.data_type, shared=self.shared)
def get_batches(self, batch_size, num_batches=None, shuffle=False):
    """Yield ``(batch_idxs, DataSet)`` pairs drawn from ``self.data``.

    :param batch_size: number of examples per yielded batch.
    :param num_batches: total batches to yield; defaults to one epoch's worth.
    :param shuffle: reshuffle example order at every epoch boundary.
    :return: generator of ``(batch_idxs, DataSet)`` tuples.
    """
    batches_in_epoch = int(math.ceil(self.num_examples / batch_size))
    num_batches = batches_in_epoch if num_batches is None else num_batches
    epoch_count = int(math.ceil(num_batches / batches_in_epoch))
    ordering = itertools.chain.from_iterable(
        (random.sample(self.valid_idxs, len(self.valid_idxs))
         if shuffle else self.valid_idxs)
        for _ in range(epoch_count))
    emitted = 0
    while emitted < num_batches:
        emitted += 1
        batch_idxs = tuple(itertools.islice(ordering, batch_size))
        sliced = {}
        for name, column in self.data.items():
            if name.startswith('*'):
                # '*'-prefixed columns hold references into the shared store.
                assert self.shared is not None
                stripped = name[1:]
                sliced[stripped] = [index(self.shared[stripped], column[j])
                                    for j in batch_idxs]
            else:
                sliced[name] = [column[j] for j in batch_idxs]
        yield batch_idxs, DataSet(sliced, self.data_type, shared=self.shared)
def get_batches(self, batch_size, num_batches=None, shuffle=False, cluster=False):
    """
    Yield ``(batch_idxs, DataSet)`` pairs of ``batch_size`` examples each,
    optionally lazy-loading shared data from per-example JSON shards.

    :param batch_size: number of examples per yielded batch.
    :param num_batches: total batches to yield; defaults to one epoch's worth.
    :param shuffle: reshuffle example order at every epoch boundary.
    :param cluster: cluster examples by their lengths; this might give
        performance boost (i.e. faster training).
    :return: generator of ``(batch_idxs, DataSet)`` tuples.
    """
    num_batches_per_epoch = int(math.ceil(self.num_examples / batch_size))
    if num_batches is None:
        num_batches = num_batches_per_epoch
    # Number of passes over the data needed to emit num_batches batches.
    num_epochs = int(math.ceil(num_batches / num_batches_per_epoch))
    if shuffle:
        random_idxs = random.sample(self.valid_idxs, len(self.valid_idxs))
        if cluster:
            # Sort by example length so each batch holds similar lengths,
            # then shuffle the order of the batches (not their contents).
            sorted_idxs = sorted(random_idxs, key=self._sort_key)
            sorted_grouped = lambda: list(grouper(sorted_idxs, batch_size))
            grouped = lambda: random.sample(sorted_grouped(), num_batches_per_epoch)
        else:
            random_grouped = lambda: list(grouper(random_idxs, batch_size))
            grouped = random_grouped
    else:
        raw_grouped = lambda: list(grouper(self.valid_idxs, batch_size))
        grouped = raw_grouped
    batch_idx_tuples = itertools.chain.from_iterable(
        grouped() for _ in range(num_epochs))

    def get_shared(pos):
        # Load one shard of shared data from disk. Assumes self.shared_path
        # is a %-format template taking two zero-padded position components
        # — TODO(review): confirm the template shape against the writer side.
        with open(
                self.shared_path % (str(pos[0]).zfill(3),
                                    str(pos[1]).zfill(3) + ".json"),
                'r') as fd:
            return json.load(fd)

    for _ in range(num_batches):
        # grouper pads the final group with None; strip the padding here.
        batch_idxs = tuple(i for i in next(batch_idx_tuples) if i is not None)
        batch_data = self.get_by_idxs(batch_idxs)
        shared_batch_data = {}
        if self.load_shared:
            # Lazy mode: '*x' holds per-example positions into on-disk
            # shards rather than references into in-memory shared data —
            # presumably (file, entry) pairs; verify against the producer.
            pos = batch_data['*x']
            shared_list = [get_shared(each) for each in pos]
            for k in ['p', 'x', 'cx']:
                shared_batch_data[k] = [sh[k] for sh in shared_list]
        else:
            for key, val in batch_data.items():
                if key.startswith('*'):
                    # '*'-prefixed keys hold references into self.shared;
                    # resolve them to the actual shared objects.
                    assert self.shared is not None
                    shared_key = key[1:]
                    shared_batch_data[shared_key] = [
                        index(self.shared[shared_key], each) for each in val
                    ]
        batch_data.update(shared_batch_data)
        batch_ds = DataSet(batch_data, self.data_type, self.shared_path,
                           load_shared=self.load_shared, shared=self.shared)
        yield batch_idxs, batch_ds