def next(self):
    """
    The iterator over the dataset just performs in-memory sub-setting for each portion of the data.
    :return: piece of data for training.
    """
    super().next()
    total = 0
    count = 1
    for element in self._dataset:
        current_index = element['current_sample']
        total_samples = element['total_samples']
        if FileAccess.MULTI == self.file_access:
            num_sets = list(range(0, int(math.ceil(total_samples / self.batch_size))))
        else:
            total_samples_per_rank = int(total_samples / self.comm_size)
            part_start, part_end = (int(total_samples_per_rank * self.my_rank / self.batch_size),
                                    int(total_samples_per_rank * (self.my_rank + 1) / self.batch_size))
            num_sets = list(range(part_start, part_end))
        total += len(num_sets)
        if self.memory_shuffle != Shuffle.OFF:
            if self.memory_shuffle == Shuffle.SEED:
                random.seed(self.seed)
            random.shuffle(num_sets)
        for num_set in num_sets:
            progress(count, total, "Reading NPZ Data")
            count += 1
            # Slice ends are exclusive, so no "- 1" is needed to yield a full batch.
            yield element['dataset'][:][:][num_set * self.batch_size:(num_set + 1) * self.batch_size]

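# Illustrative sketch (not part of the reader class above): how the shared-file
# branch partitions global batch indices across ranks. The names and example
# values below are assumptions chosen only to make the arithmetic concrete.
def _example_shared_file_partition(total_samples=1024, comm_size=4, batch_size=32):
    for my_rank in range(comm_size):
        total_samples_per_rank = int(total_samples / comm_size)
        part_start = int(total_samples_per_rank * my_rank / batch_size)
        part_end = int(total_samples_per_rank * (my_rank + 1) / batch_size)
        # With these values: rank 0 -> batches 0..7, rank 1 -> 8..15,
        # rank 2 -> 16..23, rank 3 -> 24..31.
        print(my_rank, list(range(part_start, part_end)))
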
def generate(self):
    """
    Generate CSV data for training. It generates a 2-d dataset and writes it to file.
    """
    super().generate()
    record = random.random((self._dimension * self._dimension))
    records = [record] * self.num_samples
    record_label = 0
    prev_out_spec = ""
    count = 0
    for i in range(0, int(self.num_files)):
        if i % self.comm_size == self.my_rank:
            progress(i + 1, self.num_files, "Generating CSV Data")
            out_path_spec = "{}_{}_of_{}.csv".format(self._file_prefix, i, self.num_files)
            if count == 0:
                prev_out_spec = out_path_spec
                df = pd.DataFrame(data=records)
                compression = None
                if self.compression != Compression.NONE:
                    compression = {
                        "method": str(self.compression)
                    }
                    if self.compression == Compression.GZIP:
                        out_path_spec = out_path_spec + ".gz"
                    elif self.compression == Compression.BZIP2:
                        out_path_spec = out_path_spec + ".bz2"
                    elif self.compression == Compression.ZIP:
                        out_path_spec = out_path_spec + ".zip"
                    elif self.compression == Compression.XZ:
                        out_path_spec = out_path_spec + ".xz"
                df.to_csv(out_path_spec, compression=compression)
                count += 1
            else:
                copyfile(prev_out_spec, out_path_spec)

def next(self):
    """
    Iterator for the CSV dataset. In this case, we use the in-memory dataset by sub-setting.
    """
    super().next()
    total = 0
    count = 1
    for element in self._dataset:
        current_index = element['current_sample']
        total_samples = element['total_samples']
        if FileAccess.MULTI == self.file_access:
            num_sets = list(range(0, int(math.ceil(total_samples / self.batch_size))))
        else:
            total_samples_per_rank = int(total_samples / self.comm_size)
            part_start, part_end = (int(total_samples_per_rank * self.my_rank / self.batch_size),
                                    int(total_samples_per_rank * (self.my_rank + 1) / self.batch_size))
            num_sets = list(range(part_start, part_end))
        total += len(num_sets)
        if self.memory_shuffle != Shuffle.OFF:
            if self.memory_shuffle == Shuffle.SEED:
                random.seed(self.seed)
            random.shuffle(num_sets)
        for num_set in num_sets:
            progress(count, total, "Reading CSV Data")
            count += 1
            # Slice ends are exclusive, so no "- 1" is needed to yield a full batch.
            yield element['dataset'][num_set * self.batch_size:(num_set + 1) * self.batch_size]

def next(self):
    """
    This method is called during iteration, where a dataset is opened and different
    regions of the dataset are yielded to the training loop.
    :return: portion of the dataset to be used in a step.
    """
    super().next()
    total = 0
    count = 1
    for element in self._dataset:
        file_h5 = h5py.File(element['file'], 'r')
        dataset = file_h5['records']
        total_samples = dataset.shape[0]
        if FileAccess.MULTI == self.file_access:
            # For multiple-file access, the whole file is read by each process.
            num_sets = list(range(0, int(math.ceil(total_samples / self.batch_size))))
        else:
            # For shared-file access, a part of the file is read by each process.
            total_samples_per_rank = int(total_samples / self.comm_size)
            part_start, part_end = (int(total_samples_per_rank * self.my_rank / self.batch_size),
                                    int(total_samples_per_rank * (self.my_rank + 1) / self.batch_size))
            num_sets = list(range(part_start, part_end))
        total += len(num_sets)
        if self.memory_shuffle != Shuffle.OFF:
            if self.memory_shuffle == Shuffle.SEED:
                random.seed(self.seed)
            random.shuffle(num_sets)
        for num_set in num_sets:
            with tf.profiler.experimental.Trace('Read', step_num=num_set / self.batch_size, _r=1):
                progress(count, total, "Reading HDF5 Data")
                count += 1
                images = dataset[num_set * self.batch_size:(num_set + 1) * self.batch_size]
            resized_images = []
            with tf.profiler.experimental.Trace('Resize', step_num=num_set / self.batch_size, _r=1):
                for image in images:
                    resized_images.append(np.resize(image, (self._dimension, self._dimension)))
                sleep(.001)
            yield resized_images
        file_h5.close()

def next(self):
    """
    This method is called during iteration where a dataset is opened and different
    regions of the dataset are yielded to the training loop.
    :return: portion of dataset to be used in step.
    """
    super().next()
    a = iter(self._dataset)
    count = 1
    total = math.ceil(self.num_samples * self.num_files / self.batch_size / self.comm_size)
    for i in a:
        progress(count, total, "Reading HDF5 Optimized Data")
        count += 1
        yield i
        if count > total:
            break

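# Illustrative sketch (standalone, assumed example values): the per-rank batch
# count used as the stop condition in the surrounding iterators.
import math

def _example_batches_per_rank(num_samples=1024, num_files=8, batch_size=32, comm_size=4):
    # All samples across all files, grouped into batches, split evenly over ranks:
    # 1024 * 8 / 32 / 4 = 64 batches per rank for these example values.
    return math.ceil(num_samples * num_files / batch_size / comm_size)
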
def next(self):
    """
    Provides the iterator over the TFRecord data pipeline.
    :return: data to be processed by the training step.
    """
    super().next()
    a = iter(self._dataset)
    count = 1
    total = math.ceil(self.num_samples * self.num_files / self.batch_size / self.comm_size)
    for i in a:
        progress(count, total, "Reading TFRecord Data")
        count += 1
        # Yield exactly one element per iteration; a second next(a) here would
        # bypass the bookkeeping above and could exhaust the iterator mid-loop.
        yield i
        if count > total:
            break

def read(self, epoch_number):
    """
    Opens the CSV dataset and reads the rows in memory.
    :param epoch_number: current epoch number
    """
    super().read(epoch_number)
    packed_array = []
    count = 1
    for file in self._local_file_list:
        progress(count, len(self._local_file_list), "Opening CSV Data")
        count += 1
        rows = pd.read_csv(file, compression="infer").to_numpy()
        packed_array.append({
            'dataset': rows,
            'current_sample': 0,
            'total_samples': len(rows)
        })
    self._dataset = packed_array

def generate(self):
    """
    Generate HDF5 data for training. It generates a 3-d dataset and writes it to file.
    """
    super().generate()
    samples_per_iter = 1024 * 100
    records = random.random((samples_per_iter, self._dimension, self._dimension))
    record_labels = [0] * self.num_samples
    prev_out_spec = ""
    count = 0
    for i in range(0, int(self.num_files)):
        if i % self.comm_size == self.my_rank:
            progress(i + 1, self.num_files, "Generating HDF5 Data")
            out_path_spec = "{}_{}_of_{}.h5".format(self._file_prefix, i + 1, self.num_files)
            if count == 0:
                prev_out_spec = out_path_spec
                hf = h5py.File(out_path_spec, 'w')
                chunks = None
                if self.enable_chunking:
                    chunk_dimension = int(math.ceil(math.sqrt(self.chunk_size)))
                    if chunk_dimension > self._dimension:
                        chunk_dimension = self._dimension
                    chunks = (1, chunk_dimension, chunk_dimension)
                compression = None
                compression_level = None
                if self.compression != Compression.NONE:
                    compression = str(self.compression)
                    if self.compression == Compression.GZIP:
                        compression_level = self.compression_level
                dset = hf.create_dataset('records',
                                         (self.num_samples, self._dimension, self._dimension),
                                         chunks=chunks,
                                         compression=compression,
                                         compression_opts=compression_level)
                samples_written = 0
                while samples_written < self.num_samples:
                    if samples_per_iter < self.num_samples - samples_written:
                        samples_to_write = samples_per_iter
                    else:
                        samples_to_write = self.num_samples - samples_written
                    dset[samples_written:samples_written + samples_to_write] = records[:samples_to_write]
                    samples_written += samples_to_write
                hf.create_dataset('labels', data=record_labels)
                hf.close()
                count += 1
            else:
                copyfile(prev_out_spec, out_path_spec)

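# Illustrative sketch (standalone): how the chunk shape above follows from a
# target chunk size. Treating chunk_size as a number of elements per chunk is
# an assumption made here just to show the arithmetic.
import math

def _example_chunk_shape(chunk_size=4096, dimension=2048):
    chunk_dimension = int(math.ceil(math.sqrt(chunk_size)))
    # Never let a chunk exceed a single record in either spatial dimension.
    chunk_dimension = min(chunk_dimension, dimension)
    # chunk_size=4096 -> chunks of (1, 64, 64), one sample slice per chunk.
    return (1, chunk_dimension, chunk_dimension)
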
def generate(self):
    """
    Generator for creating data in TFRecord format of a 3-d dataset.
    """
    super().generate()
    record = random.random((self._dimension, self._dimension))
    record_label = 0
    prev_out_spec = ""
    count = 0
    for i in range(0, int(self.num_files)):
        if i % self.comm_size == self.my_rank:
            progress(i + 1, self.num_files, "Generating TFRecord Data")
            out_path_spec = "{}_{}_of_{}.tfrecords".format(self._file_prefix, i, self.num_files)
            # Open a TFRecordWriter for the output file.
            if count == 0:
                prev_out_spec = out_path_spec
                with tf.io.TFRecordWriter(out_path_spec) as writer:
                    # Use a distinct loop variable so the file index i is not shadowed.
                    for sample_index in range(0, self.num_samples):
                        img_bytes = record.tobytes()
                        data = {
                            'image': tf.train.Feature(bytes_list=tf.train.BytesList(value=[img_bytes])),
                            'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[record_label]))
                        }
                        # Wrap the data as TensorFlow Features.
                        feature = tf.train.Features(feature=data)
                        # Wrap again as a TensorFlow Example.
                        example = tf.train.Example(features=feature)
                        # Serialize the data.
                        serialized = example.SerializeToString()
                        # Write the serialized data to the TFRecords file.
                        writer.write(serialized)
                count += 1
            else:
                copyfile(prev_out_spec, out_path_spec)

def read(self, epoch_number):
    """
    Reads the HDF5 dataset. Here we just record the filenames; the files are opened during iteration.
    :param epoch_number: epoch number for training loop
    """
    super().read(epoch_number)
    packed_array = []
    count = 1
    for file in self._local_file_list:
        progress(count, len(self._local_file_list), "Opening HDF5 Data")
        count += 1
        file_h5 = h5py.File(file, 'r')
        dimension = int(math.sqrt(self.record_size))
        sample = (dimension, dimension)
        dataset_h = file_h5['records']
        current_sample = 0
        packed_array.append({
            'file': file,
            'sample': sample,
            'current_sample': current_sample
        })
        file_h5.close()
    self._dataset = packed_array

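# Illustrative sketch (hypothetical driver, not part of the benchmark code):
# how a training loop might consume one of the readers above, assuming an
# object exposing the read()/next() interface shown in this section.
def _example_training_loop(reader, epochs=1):
    for epoch in range(epochs):
        reader.read(epoch)           # open or index the epoch's files
        for batch in reader.next():  # next() is a generator yielding batches
            pass                     # a real loop would run one training step per batch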