Example #1
 def next(self):
     """
     The iterator over the dataset simply performs in-memory sub-setting to produce each portion of the data.
     :return: piece of data for training.
     """
     super().next()
     total = 0
     count = 1
     for element in self._dataset:
         current_index = element['current_sample']
         total_samples = element['total_samples']
         if FileAccess.MULTI == self.file_access:
             num_sets = list(
                 range(0, int(math.ceil(total_samples / self.batch_size))))
         else:
             total_samples_per_rank = int(total_samples / self.comm_size)
             part_start, part_end = (int(total_samples_per_rank *
                                         self.my_rank / self.batch_size),
                                     int(total_samples_per_rank *
                                         (self.my_rank + 1) /
                                         self.batch_size))
             num_sets = list(range(part_start, part_end))
         total += len(num_sets)
         if self.memory_shuffle != Shuffle.OFF:
             if self.memory_shuffle == Shuffle.SEED:
                 random.seed(self.seed)
             random.shuffle(num_sets)
         for num_set in num_sets:
             progress(count, total, "Reading NPZ Data")
             count += 1
             # yield one full batch of contiguous samples
             yield element['dataset'][num_set * self.batch_size:
                                      (num_set + 1) * self.batch_size]
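The partitioning arithmetic above decides which batch indices a rank reads: under multi-file access every rank reads all batches of its own file, while under shared-file access the sample range is split evenly across ranks. A minimal standalone sketch of the same logic; the function name and the literal values in the example call are illustrative, not part of the benchmark:

import math

def batch_indices(total_samples, batch_size, comm_size, my_rank, shared=True):
    """Return the batch indices a single rank should read."""
    if not shared:
        # multi-file access: every rank reads every batch of its own file
        return list(range(int(math.ceil(total_samples / batch_size))))
    # shared-file access: split the samples evenly across ranks, then map
    # this rank's sample range onto batch indices
    samples_per_rank = int(total_samples / comm_size)
    part_start = int(samples_per_rank * my_rank / batch_size)
    part_end = int(samples_per_rank * (my_rank + 1) / batch_size)
    return list(range(part_start, part_end))

# e.g. 1024 samples, batch size 8, 4 ranks: rank 1 reads batches 32..63
print(batch_indices(1024, 8, 4, 1))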
Example #2
 def generate(self):
     """
     Generate CSV data for training. It generates a 2D dataset and writes it to file.
     """
     super().generate()
     record = random.random((self._dimension * self._dimension))
     records = [record]*self.num_samples
     record_label = 0
     prev_out_spec = ""
     count = 0
     for i in range(0, int(self.num_files)):
         if i % self.comm_size == self.my_rank:
             progress(i+1, self.num_files, "Generating CSV Data")
             out_path_spec = "{}_{}_of_{}.csv".format(self._file_prefix, i, self.num_files)
             if count == 0:
                 prev_out_spec = out_path_spec
                 df = pd.DataFrame(data=records)
                 compression = None
                 if self.compression != Compression.NONE:
                     compression = {
                         "method": str(self.compression)
                     }
                     if self.compression == Compression.GZIP:
                         out_path_spec = out_path_spec + ".gz"
                     elif self.compression == Compression.BZIP2:
                         out_path_spec = out_path_spec + ".bz2"
                     elif self.compression == Compression.ZIP:
                         out_path_spec = out_path_spec + ".zip"
                     elif self.compression == Compression.XZ:
                         out_path_spec = out_path_spec + ".xz"
                 df.to_csv(out_path_spec, compression=compression)
                 count += 1
             else:
                 copyfile(prev_out_spec, out_path_spec)
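The generator above relies on pandas accepting a dict for the compression argument of DataFrame.to_csv, and on the file extension matching the chosen method. A minimal sketch of that pattern with made-up data and a literal "gzip" standing in for the benchmark's Compression enum:

import numpy as np
import pandas as pd

records = np.random.random((16, 64))      # 16 rows of 64 random values
df = pd.DataFrame(data=records)

# method name and file extension are kept in sync, as the generator does above
df.to_csv("sample_0_of_1.csv.gz", compression={"method": "gzip"})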
Example #3
 def next(self):
     """
     Iterator for the CSV dataset. In this case, we use the in-memory dataset and sub-set it into batches.
     """
     super().next()
     total = 0
     count = 1
     for element in self._dataset:
         current_index = element['current_sample']
         total_samples = element['total_samples']
         if FileAccess.MULTI == self.file_access:
             num_sets = list(
                 range(0, int(math.ceil(total_samples / self.batch_size))))
         else:
             total_samples_per_rank = int(total_samples / self.comm_size)
             part_start, part_end = (int(total_samples_per_rank *
                                         self.my_rank / self.batch_size),
                                     int(total_samples_per_rank *
                                         (self.my_rank + 1) /
                                         self.batch_size))
             num_sets = list(range(part_start, part_end))
         total += len(num_sets)
         if self.memory_shuffle != Shuffle.OFF:
             if self.memory_shuffle == Shuffle.SEED:
                 random.seed(self.seed)
             random.shuffle(num_sets)
         for num_set in num_sets:
             progress(count, total, "Reading CSV Data")
             count += 1
             # yield one full batch of contiguous rows
             yield element['dataset'][num_set * self.batch_size:
                                      (num_set + 1) * self.batch_size]
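When memory_shuffle is Shuffle.SEED, the RNG is re-seeded before every shuffle, so each epoch (and each rank) walks the batches in the same pseudo-random order. A tiny illustration with the standard-library random module; the seed value is arbitrary:

import random

num_sets = list(range(8))
random.seed(1234)
random.shuffle(num_sets)
first_order = list(num_sets)

num_sets = list(range(8))
random.seed(1234)          # same seed -> same permutation
random.shuffle(num_sets)
assert num_sets == first_order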
Example #4
 def next(self):
     """
     This method is called during iteration: a dataset is opened and different regions of it are
     yielded to the training loop.
     :return: portion of dataset to be used in step.
     """
     super().next()
     total = 0
     count = 1
     for element in self._dataset:
         file_h5 = h5py.File(element['file'], 'r')
         dataset = file_h5['records']
         total_samples = dataset.shape[0]
         if FileAccess.MULTI == self.file_access:
             # for multiple file access, the whole file is read by each process.
             num_sets = list(
                 range(0, int(math.ceil(total_samples / self.batch_size))))
         else:
             # for shared file access, a part of the file is read by each process.
             total_samples_per_rank = int(total_samples / self.comm_size)
             part_start, part_end = (int(total_samples_per_rank *
                                         self.my_rank / self.batch_size),
                                     int(total_samples_per_rank *
                                         (self.my_rank + 1) /
                                         self.batch_size))
             num_sets = list(range(part_start, part_end))
         total += len(num_sets)
         if self.memory_shuffle != Shuffle.OFF:
             if self.memory_shuffle == Shuffle.SEED:
                 random.seed(self.seed)
             random.shuffle(num_sets)
         for num_set in num_sets:
             with tf.profiler.experimental.Trace('Read',
                                                 step_num=num_set /
                                                 self.batch_size,
                                                 _r=1):
                 progress(count, total, "Reading HDF5 Data")
                 count += 1
                 images = dataset[num_set * self.batch_size:(num_set + 1) *
                                  self.batch_size]
             resized_images = []
             with tf.profiler.experimental.Trace('Resize',
                                                 step_num=num_set /
                                                 self.batch_size,
                                                 _r=1):
                 for image in images:
                     resized_images.append(
                         np.resize(image,
                                   (self._dimension, self._dimension)))
                 sleep(.001)
             yield resized_images
         file_h5.close()
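Slicing an h5py dataset only reads the requested rows from disk, which is what makes the region-by-region iteration above work without loading the whole file. A minimal sketch assuming a file laid out like the one the HDF5 generator writes (a 'records' dataset of shape (num_samples, dim, dim)); the file name and batch size are hypothetical:

import h5py

batch_size = 4
with h5py.File("data_1_of_1.h5", "r") as f:
    records = f["records"]                         # handle only; nothing read yet
    for start in range(0, records.shape[0], batch_size):
        batch = records[start:start + batch_size]  # reads just this slice
        # ... resize / hand the batch to the training step ...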
Example #5
 def next(self):
     """
     This method is called during iteration: a dataset is opened and different regions of it are
     yielded to the training loop.
     :return: portion of dataset to be used in step.
     """
     super().next()
     a = iter(self._dataset)
     count = 1
     total = math.ceil(self.num_samples * self.num_files / self.batch_size /
                       self.comm_size)
     for i in a:
         progress(count, total, "Reading HDF5 Optimized Data")
         count += 1
         yield i
         if count > total:
             break
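The total computed above is the number of steps each rank performs: every sample in every file, divided by the batch size and then by the number of ranks. With hypothetical numbers:

import math

num_samples, num_files = 1024, 8      # samples per file, number of files
batch_size, comm_size = 16, 4
total = math.ceil(num_samples * num_files / batch_size / comm_size)
print(total)                          # 128 steps for each of the 4 ranks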
Example #6
 def next(self):
     """
     Provides the iterator over tfrecord data pipeline.
     :return: data to be processed by the training step.
     """
     super().next()
     a = iter(self._dataset)
     count = 1
     total = math.ceil(self.num_samples * self.num_files / self.batch_size /
                       self.comm_size)
     for i in a:
         progress(count, total, "Reading TFRecord Data")
         count += 1
         yield i
         if count > total:
             break
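The iterator above only walks whatever pipeline read() stored in self._dataset, which is not shown here. For context, a typical tf.data pipeline over TFRecord files written like the TFRecord generator later in this listing looks roughly as follows; the feature names 'image' and 'label' match that writer, while the file name, batch size, and parsing details are assumptions:

import tensorflow as tf

feature_spec = {
    "image": tf.io.FixedLenFeature([], tf.string),
    "label": tf.io.FixedLenFeature([], tf.int64),
}

def parse(serialized):
    return tf.io.parse_single_example(serialized, feature_spec)

dataset = (tf.data.TFRecordDataset(["data_0_of_1.tfrecords"])
           .map(parse, num_parallel_calls=tf.data.AUTOTUNE)
           .batch(8)
           .prefetch(tf.data.AUTOTUNE))

for batch in dataset:
    pass  # each batch is a dict of tensors keyed by 'image' and 'label'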
Example #7
 def read(self, epoch_number):
     """
     Opens the CSV dataset and reads the rows into memory.
     :param epoch_number: current epoch number
     """
     super().read(epoch_number)
     packed_array = []
     count = 1
     for file in self._local_file_list:
         progress(count, len(self._local_file_list), "Opening CSV Data")
         count += 1
         rows = pd.read_csv(file, compression="infer").to_numpy()
         packed_array.append({
             'dataset': rows,
             'current_sample': 0,
             'total_samples': len(rows)
         })
     self._dataset = packed_array
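compression="infer" lets pandas pick the decompressor from the file extension, so the same call handles both the plain and the compressed files produced by the CSV generator. A quick sketch with hypothetical file names:

import pandas as pd

for path in ["data_0_of_2.csv", "data_1_of_2.csv.gz"]:
    rows = pd.read_csv(path, compression="infer").to_numpy()
    print(path, rows.shape)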
Example #8
 def generate(self):
     """
     Generate HDF5 data for training. It generates a 3D dataset and writes it to file.
     """
     super().generate()
     samples_per_iter = 1024 * 100  # generate at most this many samples at a time to bound memory use
     records = random.random((samples_per_iter, self._dimension, self._dimension))
     record_labels = [0] * self.num_samples
     prev_out_spec = ""
     count = 0
     for i in range(0, int(self.num_files)):
         if i % self.comm_size == self.my_rank:
             progress(i+1, self.num_files, "Generating HDF5 Data")
             out_path_spec = "{}_{}_of_{}.h5".format(self._file_prefix, i+1, self.num_files)
             if count == 0:
                 prev_out_spec = out_path_spec
                 hf = h5py.File(out_path_spec, 'w')
                 chunks = None
                 if self.enable_chunking:
                     chunk_dimension = int(math.ceil(math.sqrt(self.chunk_size)))
                     if chunk_dimension > self._dimension:
                         chunk_dimension = self._dimension
                     chunks = (1, chunk_dimension, chunk_dimension)
                 compression = None
                 compression_level = None
                 if self.compression != Compression.NONE:
                     compression = str(self.compression)
                     if self.compression == Compression.GZIP:
                         compression_level = self.compression_level
                 dset = hf.create_dataset('records',
                                          (self.num_samples, self._dimension, self._dimension),
                                          chunks=chunks, compression=compression,
                                          compression_opts=compression_level)
                 samples_written = 0
                 while samples_written < self.num_samples:
                     if samples_per_iter < self.num_samples-samples_written:
                         samples_to_write = samples_per_iter
                     else:
                         samples_to_write = self.num_samples-samples_written
                     dset[samples_written:samples_written+samples_to_write] = records[:samples_to_write]
                     samples_written += samples_to_write
                 hf.create_dataset('labels', data=record_labels)
                 hf.close()
                 count += 1
             else:
                 copyfile(prev_out_spec, out_path_spec)
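The chunking and compression switches above map directly onto h5py's create_dataset keyword arguments. A minimal sketch with literal values standing in for the benchmark's configuration attributes:

import h5py
import numpy as np

dim, num_samples = 64, 256
records = np.random.random((num_samples, dim, dim))

with h5py.File("example.h5", "w") as hf:
    dset = hf.create_dataset("records",
                             (num_samples, dim, dim),
                             chunks=(1, dim, dim),   # one sample per chunk
                             compression="gzip",
                             compression_opts=4)     # gzip level 0-9
    dset[:] = records
    hf.create_dataset("labels", data=np.zeros(num_samples, dtype=np.int64))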
Example #9
 def generate(self):
     """
     Generator for creating a 3D dataset in TFRecord format.
     """
     super().generate()
     record = random.random((self._dimension, self._dimension))
     record_label = 0
     prev_out_spec = ""
     count = 0
     for i in range(0, int(self.num_files)):
         if i % self.comm_size == self.my_rank:
             progress(i + 1, self.num_files, "Generating TFRecord Data")
             out_path_spec = "{}_{}_of_{}.tfrecords".format(
                 self._file_prefix, i, self.num_files)
             # Open a TFRecordWriter for the output-file.
             if count == 0:
                 prev_out_spec = out_path_spec
                 with tf.io.TFRecordWriter(out_path_spec) as writer:
                     for _ in range(0, self.num_samples):
                         img_bytes = record.tobytes()
                         data = {
                             'image':
                             tf.train.Feature(bytes_list=tf.train.BytesList(
                                 value=[img_bytes])),
                             'label':
                             tf.train.Feature(int64_list=tf.train.Int64List(
                                 value=[record_label]))
                         }
                         # Wrap the data as TensorFlow Features.
                         feature = tf.train.Features(feature=data)
                         # Wrap again as a TensorFlow Example.
                         example = tf.train.Example(features=feature)
                         # Serialize the data.
                         serialized = example.SerializeToString()
                         # Write the serialized data to the TFRecords file.
                         writer.write(serialized)
                 count += 1
             else:
                 copyfile(prev_out_spec, out_path_spec)
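Because each image is stored as raw bytes, reading it back requires decoding with the same dtype and reshaping to the original dimensions; the writer does not store the shape, so the reader has to know it. A sketch of the round trip, assuming float64 records of shape (64, 64) like numpy's default output in the generator:

import numpy as np
import tensorflow as tf

record = np.random.random((64, 64))                # float64 by default
img_bytes = record.tobytes()

# plain numpy round trip
restored = np.frombuffer(img_bytes, dtype=np.float64).reshape(64, 64)
assert np.array_equal(restored, record)

# the same decoding inside a tf.data map function
tensor = tf.reshape(tf.io.decode_raw(img_bytes, tf.float64), (64, 64))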
Example #10
 def read(self, epoch_number):
     """
     Reads the HDF5 dataset. Here we just record the filenames; the files are opened during iteration.
     :param epoch_number: epoch number for training loop
     """
     super().read(epoch_number)
     packed_array = []
     count = 1
     for file in self._local_file_list:
         progress(count, len(self._local_file_list), "Opening HDF5 Data")
         count += 1
         file_h5 = h5py.File(file, 'r')
         dimension = int(math.sqrt(self.record_size))
         sample = (dimension, dimension)
         dataset_h = file_h5['records']
         current_sample = 0
         packed_array.append({
             'file': file,
             'sample': sample,
             'current_sample': current_sample
         })
         file_h5.close()
     self._dataset = packed_array