class DataReader(Process):
    """DataReader is deployed to queue encoded strings from `LMDB`_.

    It adaptively partitions and shuffles records over all distributed nodes.

    """
    def __init__(self, **kwargs):
        """Construct a ``DataReader``.

        Parameters
        ----------
        source : str
            The path of the database.
        shuffle : boolean
            Whether to shuffle the data.
        node_step : boolean
            Whether to split the data for multiple parallel nodes.
        num_chunks : int
            The number of chunks to split. Default is ``2048``.
        chunk_size : int
            The size (MB) of each chunk. Default is ``-1`` (refer to ``num_chunks``).

        """
        super(DataReader, self).__init__()
        self._source = GetProperty(kwargs, 'source', '')
        self._use_shuffle = GetProperty(kwargs, 'shuffle', False)
        self._use_step = GetProperty(kwargs, 'node_step', False)
        self._num_chunks = GetProperty(kwargs, 'num_chunks', 2048)
        self._chunk_size = GetProperty(kwargs, 'chunk_size', -1)

        self._num_parts = 1
        self._part_idx = 0
        self._random_seed = config.GetRandomSeed()
        self._cur_idx = 0
        self._cur_chunk_idx = 0

        self.Q_out = None
        self.daemon = True

    def element(self):
        """Get the value of the current record.

        Returns
        -------
        str
            The encoded string.

        """
        return self._db.value()

    def redirect(self, target_idx):
        """Redirect to the target position.

        Parameters
        ----------
        target_idx : int
            The key of the instance in ``LMDB``.

        Returns
        -------
        None

        Notes
        -----
        The redirection reopens the ``LMDB``.

        You can drop caches by ``echo 3 > /proc/sys/vm/drop_caches``,
        which helps avoid getting stuck when ``Database Size`` >> ``RAM Size``.

        """
        self._db.close()
        self._db.open(self._source)
        self._cur_idx = target_idx
        self._db.set(str(self._cur_idx).zfill(self._db_zfill))

    def reset(self):
        """Reset the cursor and environment.

        Returns
        -------
        None

        """
        if self._use_shuffle or self._use_step:
            if self._use_shuffle:
                self._perm = npr.permutation(self._num_shuffle_parts)
            self._cur_chunk_idx = 0
            self._start_idx = int(self._part_idx * self._num_shuffle_parts +
                                  self._perm[self._cur_chunk_idx])
            self._start_idx = int(self._start_idx * self._chunk_size)
            if self._start_idx >= self._db_size:
                self.next_chunk()
            self._end_idx = self._start_idx + self._chunk_size
            self._end_idx = min(self._db_size, self._end_idx)
        else:
            self._start_idx = 0
            self._end_idx = self._db_size
        self.redirect(self._start_idx)

    def next_record(self):
        """Step the cursor of records.

        Returns
        -------
        None

        """
        self._cur_idx += 1
        self._db.next()

    def next_chunk(self):
        """Step the cursor of shuffling chunks.

        Returns
        -------
        None

        """
        self._cur_chunk_idx += 1
        if self._cur_chunk_idx >= self._num_shuffle_parts:
            self.reset()
        else:
            self._start_idx = self._part_idx * self._num_shuffle_parts + \
                              self._perm[self._cur_chunk_idx]
            self._start_idx = self._start_idx * self._chunk_size
            if self._start_idx >= self._db_size:
                self.next_chunk()
            else:
                self._end_idx = self._start_idx + self._chunk_size
                self._end_idx = min(self._db_size, self._end_idx)
                self.redirect(self._start_idx)

    def run(self):
        """Start the process.

        Returns
        -------
        None

        """
        # Fix the random seed
        npr.seed(self._random_seed)

        # Init the database
        self._db = LMDB()
        self._db.open(self._source)
        self._db_size = int(self._db.get('size'))
        self._db_zfill = int(self._db.get('zfill'))
        self._epoch_size = int(self._db_size / self._num_parts + 1)

        # Search an optimal chunk size by the number of chunks
        if self._chunk_size == -1:
            max_chunk_size = self._db._total_size / (self._num_chunks * (1 << 20))
            min_chunk_size = 1
            while min_chunk_size * 2 < max_chunk_size:
                min_chunk_size *= 2
            self._chunk_size = min_chunk_size
        self._num_shuffle_parts = int(math.ceil(
            self._db._total_size * 1.1 /
            (self._num_parts * self._chunk_size << 20)))
        self._chunk_size = int(self._db_size / self._num_shuffle_parts / self._num_parts + 1)
        self._perm = npr.permutation(self._num_shuffle_parts)

        # Init the environment
        self.reset()

        # Run
        while True:
            self.Q_out.put(self.element())
            self.next_record()
            if self._cur_idx >= self._end_idx:
                if self._use_shuffle or self._use_step:
                    self.next_chunk()
                else:
                    self.reset()
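# A minimal usage sketch for the reader above, assuming a queue-like object is
# bound to ``Q_out`` before ``start()`` is called; the database path below is
# hypothetical and the multiprocessing queue wiring is an illustrative assumption.
from multiprocessing import Queue

if __name__ == '__main__':
    reader = DataReader(source='/data/train_lmdb', shuffle=True)  # hypothetical path
    reader.Q_out = Queue(maxsize=1024)  # the reader process puts encoded records here
    reader.start()
    serialized_datum = reader.Q_out.get()  # one encoded record, e.g. a serialized Datum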
def make_db(image_path, label_path, database_path):
    if os.path.isfile(label_path) is False:
        raise ValueError('the input label path is empty or invalid.')
    if os.path.isdir(database_path) is True:
        raise ValueError('the database path already exists.')

    print('start time: ', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))

    db = LMDB(max_commit=10000)
    db.open(database_path, mode='w')
    total_line = sum(1 for line in open(label_path))
    count = 0
    zfill_flag = '{0:0%d}' % (ZFILL)
    encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 95]

    start_time = time.time()

    with open(label_path, 'r') as input_file:
        for record in input_file:
            count += 1
            if count % 10000 == 0:
                now_time = time.time()
                print('{0} / {1} in {2:.2f} sec'.format(
                    count, total_line, now_time - start_time))
                db.commit()

            record = record.split()
            path = record[0]
            label = record[1]

            img = cv2.imread(os.path.join(image_path, path))
            result, imgencode = cv2.imencode('.jpg', img, encode_param)

            datum = caffe_pb2.Datum()
            datum.height, datum.width, datum.channels = img.shape
            datum.label = int(label)
            datum.encoded = True
            datum.data = imgencode.tostring()
            db.put(zfill_flag.format(count - 1), datum.SerializeToString())

    now_time = time.time()
    print('{0} / {1} in {2:.2f} sec'.format(count, total_line, now_time - start_time))
    db.put('size', wrapper_str(str(count)))
    db.put('zfill', wrapper_str(str(ZFILL)))
    db.commit()
    db.close()

    shutil.copy(label_path, database_path + '/image_list.txt')

    end_time = time.time()
    print('{0} images have been stored in the database.'.format(total_line))
    print('This task finishes within {0:.2f} seconds.'.format(end_time - start_time))
    print('The size of database is {0} MB.'.format(
        float(os.path.getsize(database_path + '/data.mdb') / 1000 / 1000)))
def make_db(args):
    if os.path.isfile(args.list) is False:
        raise ValueError('the path of image list is invalid.')
    if os.path.isdir(args.database) is True:
        raise ValueError('the database already exists or the path is invalid.')

    print('start time: ', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))

    db = LMDB(max_commit=10000)
    db.open(args.database, mode='w')
    total_line = sum(1 for line in open(args.list))
    count = 0
    zfill_flag = '{0:0%d}' % (args.zfill)
    encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), args.quality]

    start_time = time.time()

    with open(args.list, 'r') as input_file:
        records = input_file.readlines()
        if args.shuffle:
            import random
            random.shuffle(records)
        for record in records:
            count += 1
            if count % 10000 == 0:
                now_time = time.time()
                print('{0} / {1} in {2:.2f} sec'.format(
                    count, total_line, now_time - start_time))
                db.commit()

            record = record.split()
            path = record[0]
            label = record[1]

            img = cv2.imread(os.path.join(args.root, path))
            if args.resize > 0:
                img = resize_image(img, args.resize)
            if args.pad > 0:
                pad_img = np.zeros((img.shape[0] + 2 * args.pad,
                                    img.shape[1] + 2 * args.pad, 3), dtype=img.dtype)
                pad_img[args.pad:args.pad + img.shape[0],
                        args.pad:args.pad + img.shape[1], :] = img
                img = pad_img
            result, imgencode = cv2.imencode('.jpg', img, encode_param)

            datum = caffe_pb2.Datum()
            datum.height, datum.width, datum.channels = img.shape
            datum.label = int(label)
            datum.encoded = True
            datum.data = imgencode.tostring()
            db.put(zfill_flag.format(count - 1), datum.SerializeToString())

    now_time = time.time()
    print('{0} / {1} in {2:.2f} sec'.format(count, total_line, now_time - start_time))
    db.put('size', str(count))
    db.put('zfill', str(args.zfill))
    db.commit()
    db.close()

    shutil.copy(args.list, args.database + '/image_list.txt')

    end_time = time.time()
    print('{0} images have been stored in the database.'.format(total_line))
    print('This task finishes within {0:.2f} seconds.'.format(end_time - start_time))
    print('The size of database is {0} MB.'.format(
        float(os.path.getsize(args.database + '/data.mdb') / 1000 / 1000)))
def make_db(args):
    """Make the sequential database for images.

    Parameters
    ----------
    database : str
        The path of the database.
    root : str
        The root folder of raw images.
    list : str
        The path of the image list file.
    resize : int
        The size of the shortest edge. Default is ``0`` (disabled).
    zfill : int
        The number of digits for zero-padding the keys.
    quality : int
        The JPEG quality for encoding, 1-100. Default is ``95``.
    shuffle : boolean
        Whether to randomize the order of the list file.

    """
    if os.path.isfile(args.list) is False:
        raise ValueError('the path of image list is invalid.')
    if os.path.isdir(args.database) is True:
        raise ValueError('the database already exists or the path is invalid.')

    print('start time: ', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))

    db = LMDB(max_commit=10000)
    db.open(args.database, mode='w')
    total_line = sum(1 for line in open(args.list))
    count = 0
    zfill_flag = '{0:0%d}' % (args.zfill)
    encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), args.quality]

    start_time = time.time()

    with open(args.list, 'r') as input_file:
        records = input_file.readlines()
        if args.shuffle:
            import random
            random.shuffle(records)
        for record in records:
            count += 1
            if count % 10000 == 0:
                now_time = time.time()
                print('{0} / {1} in {2:.2f} sec'.format(
                    count, total_line, now_time - start_time))
                db.commit()

            record = record.split()
            path = record[0]
            label = record[1]

            img = cv2.imread(os.path.join(args.root, path))
            if args.resize > 0:
                img = resize_image(img, args.resize)
            result, imgencode = cv2.imencode('.jpg', img, encode_param)

            datum = caffe_pb2.Datum()
            datum.height, datum.width, datum.channels = img.shape
            datum.label = int(label)
            datum.encoded = True
            datum.data = imgencode.tostring()
            db.put(zfill_flag.format(count - 1), datum.SerializeToString())

    now_time = time.time()
    print('{0} / {1} in {2:.2f} sec'.format(count, total_line, now_time - start_time))
    db.put('size', str(count))
    db.put('zfill', str(args.zfill))
    db.commit()
    db.close()

    shutil.copy(args.list, args.database + '/image_list.txt')

    end_time = time.time()
    print('{0} images have been stored in the database.'.format(total_line))
    print('This task finishes within {0:.2f} seconds.'.format(end_time - start_time))
    print('The size of database is {0} MB.'.format(
        float(os.path.getsize(args.database + '/data.mdb') / 1000 / 1000)))
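# A hypothetical command-line driver for ``make_db(args)``; the argument names
# mirror the docstring above, while the argparse wiring and default values are
# illustrative assumptions.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Make the image database.')
    parser.add_argument('--database', required=True, help='the path of database')
    parser.add_argument('--root', required=True, help='the root folder of raw images')
    parser.add_argument('--list', required=True, help='the path of image list file')
    parser.add_argument('--resize', type=int, default=0, help='the size of the shortest edge')
    parser.add_argument('--zfill', type=int, default=8, help='the number of digits for keys')
    parser.add_argument('--quality', type=int, default=95, help='JPEG quality, 1-100')
    parser.add_argument('--shuffle', action='store_true', help='randomize the list order')
    make_db(parser.parse_args())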
class DataReader(Process):
    """DataReader is deployed to queue encoded strings from `LMDB`_.

    It adaptively partitions and shuffles records over all distributed nodes.

    """
    def __init__(self, **kwargs):
        """Construct a ``DataReader``.

        Parameters
        ----------
        source : str
            The path of the database.
        multiple_nodes : boolean, optional, default=False
            Whether to split the data for multiple parallel nodes.
        shuffle : boolean, optional, default=False
            Whether to shuffle the data.
        num_chunks : int, optional, default=2048
            The number of chunks to split.
        chunk_size : int, optional, default=-1
            The size (MB) of each chunk.

        """
        super(DataReader, self).__init__()
        self._source = kwargs.get('source', '')
        self._multiple_nodes = kwargs.get('multiple_nodes', False)
        self._use_shuffle = kwargs.get('shuffle', False)
        self._use_instance_chunk = kwargs.get('instance_chunk', False)
        self._num_chunks = kwargs.get('num_chunks', 2048)
        self._chunk_size = kwargs.get('chunk_size', -1)
        self._part_idx, self._num_parts = 0, 1
        self._cur_idx, self._cur_chunk_idx = 0, 0
        self._random_seed = config.GetRandomSeed()
        self.Q_out = None
        self.daemon = True

    def element(self):
        """Get the value of the current record.

        Returns
        -------
        str
            The encoded string.

        """
        return self._db.value()

    def redirect(self, target_idx):
        """Redirect to the target position.

        Parameters
        ----------
        target_idx : int
            The key of the instance in ``LMDB``.

        Returns
        -------
        None

        Notes
        -----
        The redirection reopens the ``LMDB``.

        You can drop caches by ``echo 3 > /proc/sys/vm/drop_caches``,
        which helps avoid getting stuck when ``Database Size`` >> ``RAM Size``.

        """
        self._db.close()
        self._db.open(self._source)
        self._cur_idx = target_idx
        self._db.set(str(self._cur_idx).zfill(self._zfill))

    def reset(self):
        """Reset the cursor and environment.

        Returns
        -------
        None

        """
        if self._multiple_nodes or self._use_shuffle:
            if self._use_shuffle:
                self._perm = npr.permutation(self._num_shuffle_parts)
            self._cur_chunk_idx = 0
            self._start_idx = int(self._part_idx * self._num_shuffle_parts +
                                  self._perm[self._cur_chunk_idx])
            self._start_idx = int(self._start_idx * self._chunk_size)
            if self._start_idx >= self._num_entries:
                self.next_chunk()
            self._end_idx = self._start_idx + self._chunk_size
            self._end_idx = min(self._num_entries, self._end_idx)
        else:
            self._start_idx = 0
            self._end_idx = self._num_entries
        self.redirect(self._start_idx)

    def next_record(self):
        """Step the cursor of records.

        Returns
        -------
        None

        """
        self._cur_idx += 1
        self._db.next()

    def next_chunk(self):
        """Step the cursor of shuffling chunks.

        Returns
        -------
        None

        """
        self._cur_chunk_idx += 1
        if self._cur_chunk_idx >= self._num_shuffle_parts:
            self.reset()
        else:
            self._start_idx = self._part_idx * self._num_shuffle_parts + \
                              self._perm[self._cur_chunk_idx]
            self._start_idx = self._start_idx * self._chunk_size
            if self._start_idx >= self._num_entries:
                self.next_chunk()
            else:
                self._end_idx = self._start_idx + self._chunk_size
                self._end_idx = min(self._num_entries, self._end_idx)
                self.redirect(self._start_idx)

    def run(self):
        """Start the process.

        Returns
        -------
        None

        """
        # Fix the random seed
        npr.seed(self._random_seed)

        # Init the database
        self._db = LMDB()
        self._db.open(self._source)
        self._zfill = self._db.zfill()
        self._num_entries = self._db.num_entries()
        self._epoch_size = int(self._num_entries / self._num_parts + 1)

        if self._use_shuffle:
            if self._chunk_size == 1:
                # Each chunk has at most 1 record [For Fully Shuffle]
                self._chunk_size, self._num_shuffle_parts = \
                    1, int(self._num_entries / self._num_parts) + 1
            else:
                if self._use_shuffle and self._chunk_size == -1:
                    # Search an optimal chunk size by the number of chunks [For Chunk Shuffle]
                    max_chunk_size = self._db._total_size / (self._num_chunks * (1 << 20))
                    min_chunk_size = 1
                    while min_chunk_size * 2 < max_chunk_size:
                        min_chunk_size *= 2
                    self._chunk_size = min_chunk_size
                self._num_shuffle_parts = int(math.ceil(
                    self._db._total_size * 1.1 /
                    (self._num_parts * self._chunk_size << 20)))
                self._chunk_size = int(self._num_entries / self._num_shuffle_parts / self._num_parts + 1)
                limit = (self._num_parts - 0.5) * self._num_shuffle_parts * self._chunk_size
                if self._num_entries <= limit:
                    # Roll back to fully shuffle
                    self._chunk_size, self._num_shuffle_parts = \
                        1, int(self._num_entries / self._num_parts) + 1
        else:
            # Each chunk has at most K records [For Multiple Nodes]
            # Note that if ``shuffle`` and ``multiple_nodes`` are both ``False``,
            # ``chunk_size`` and ``num_shuffle_parts`` are meaningless
            self._chunk_size = int(self._num_entries / self._num_parts) + 1
            self._num_shuffle_parts = 1

        self._perm = np.arange(self._num_shuffle_parts)

        # Init the environment
        self.reset()

        # Run!
        while True:
            self.Q_out.put(self.element())
            self.next_record()
            if self._cur_idx >= self._end_idx:
                if self._multiple_nodes or self._use_shuffle:
                    self.next_chunk()
                else:
                    self.reset()
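# A minimal consumer sketch for the kwargs-based reader above; the source path,
# queue capacity, and number of records drawn are illustrative assumptions.
from multiprocessing import Queue

if __name__ == '__main__':
    reader = DataReader(source='/data/train_lmdb', shuffle=True, num_chunks=2048)
    reader.Q_out = Queue(maxsize=1024)  # bound by the owner before starting the process
    reader.start()
    for _ in range(8):
        encoded = reader.Q_out.get()  # blocks until the reader enqueues a record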
def get_db_size(self):
    self._db = LMDB()
    self._db.open(self._source)
    db_size = int(self._db.get('size'))
    self._db.close()
    return db_size
def make_db(images_list, database_path):
    if os.path.isdir(database_path) is True:
        raise ValueError('the database path already exists.')

    print('start time: ', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))

    db = LMDB(max_commit=10000)
    db.open(database_path, mode='w')
    total_line = len(images_list)
    count = 0
    zfill_flag = '{0:0%d}' % (ZFILL)

    start_time = time.time()

    for record in images_list:
        count += 1
        if count % 10000 == 0:
            now_time = time.time()
            print('{0} / {1} in {2:.2f} sec'.format(
                count, total_line, now_time - start_time))
            db.commit()

        img = record[0]
        label = record[1]

        datum = caffe_pb2.Datum()
        datum.height, datum.width, datum.channels = img.shape
        datum.label = int(label)
        datum.encoded = False
        datum.data = img.tostring()
        db.put(zfill_flag.format(count - 1), datum.SerializeToString())

    now_time = time.time()
    print('{0} / {1} in {2:.2f} sec'.format(count, total_line, now_time - start_time))
    db.put('size', wrapper_str(str(count)))
    db.put('zfill', wrapper_str(str(ZFILL)))
    db.commit()
    db.close()

    end_time = time.time()
    print('{0} images have been stored in the database.'.format(total_line))
    print('This task finishes within {0:.2f} seconds.'.format(end_time - start_time))
    print('The size of database is {0} MB.'.format(
        float(os.path.getsize(database_path + '/data.mdb') / 1000 / 1000)))
def make_db(images_list, database_path, pad=0):
    if os.path.isdir(database_path) is True:
        raise ValueError('the database path already exists.')

    print('start time: ', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))

    db = LMDB(max_commit=10000)
    db.open(database_path, mode='w')
    total_line = len(images_list)
    count = 0
    zfill_flag = '{0:0%d}' % (ZFILL)

    start_time = time.time()

    for record in images_list:
        count += 1
        if count % 10000 == 0:
            now_time = time.time()
            print('{0} / {1} in {2:.2f} sec'.format(
                count, total_line, now_time - start_time))
            db.commit()

        img = record[0]
        label = record[1]
        if pad > 0:
            pad_img = np.zeros((img.shape[0] + 2 * pad,
                                img.shape[1] + 2 * pad, 3), dtype=np.uint8)
            pad_img[pad:pad + img.shape[0], pad:pad + img.shape[1], :] = img
            img = pad_img

        datum = caffe_pb2.Datum()
        datum.height, datum.width, datum.channels = img.shape
        datum.label = int(label)
        datum.encoded = False
        datum.data = img.tostring()
        db.put(zfill_flag.format(count - 1), datum.SerializeToString())

    now_time = time.time()
    print('{0} / {1} in {2:.2f} sec'.format(count, total_line, now_time - start_time))
    db.put('size', wrapper_str(str(count)))
    db.put('zfill', wrapper_str(str(ZFILL)))
    db.commit()
    db.close()

    end_time = time.time()
    print('{0} images have been stored in the database.'.format(total_line))
    print('This task finishes within {0:.2f} seconds.'.format(end_time - start_time))
    print('The size of database is {0} MB.'.format(
        float(os.path.getsize(database_path + '/data.mdb') / 1000 / 1000)))
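# A hypothetical example of the expected input: ``images_list`` is a sequence of
# ``(image, label)`` pairs where each image is an HWC uint8 array (e.g. decoded
# by ``cv2.imread``); the toy data and the output path are assumptions.
import numpy as np

if __name__ == '__main__':
    toy_images = [
        (np.zeros((32, 32, 3), dtype=np.uint8), 0),      # a black image labeled 0
        (np.full((32, 32, 3), 255, dtype=np.uint8), 1),  # a white image labeled 1
    ]
    make_db(toy_images, '/data/toy_lmdb', pad=2)  # pads each image by 2 pixels per side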
class DataReader(Process):
    def __init__(self, **kwargs):
        super(DataReader, self).__init__()
        self._source = GetProperty(kwargs, 'source', '')
        self._use_shuffle = GetProperty(kwargs, 'shuffle', False)
        self._use_step = GetProperty(kwargs, 'node_step', False)
        self._num_chunks = GetProperty(kwargs, 'num_chunks', 2048)
        self._chunk_size = GetProperty(kwargs, 'chunk_size', -1)

        self._num_parts = 1
        self._part_idx = 0
        self._random_seed = config.GetRandomSeed()
        self._cur_idx = 0
        self._cur_chunk_idx = 0

        self.Q_out = None
        self.daemon = True

        def cleanup():
            logger.info('Terminating DataReader......')
            self.terminate()
            self.join()
        import atexit
        atexit.register(cleanup)

    def element(self):
        return self._db.value()

    def reset(self):
        if self._use_shuffle:
            self._cur_chunk_idx = 0
            self._perm = npr.permutation(self._num_shuffle_parts)
            self._start_idx = int(self._part_idx * self._num_shuffle_parts +
                                  self._perm[self._cur_chunk_idx])
            self._start_idx = int(self._start_idx * self._chunk_size)
            if self._start_idx >= self._db_size:
                self.next_chunk()
            self._end_idx = self._start_idx + self._chunk_size
            self._end_idx = min(self._db_size, self._end_idx)
            # self._part_idx = (self._part_idx + 1) % self._num_parts  # a fast hard disk drive is required
        elif self._use_step:
            self._start_idx = self._part_idx * self._epoch_size
            self._end_idx = self._start_idx + self._epoch_size
            self._end_idx = min(self._db_size, self._end_idx)
            # self._part_idx = (self._part_idx + 1) % self._num_parts  # a fast hard disk drive is required
        else:
            self._start_idx = 0
            self._end_idx = self._db_size

        self._cur_idx = self._start_idx
        self._db.set(str(self._cur_idx).zfill(self._db_zfill))

    def next_record(self):
        self._cur_idx += 1
        self._db.next()

    def next_chunk(self):
        self._cur_chunk_idx += 1
        if self._cur_chunk_idx >= self._num_shuffle_parts:
            self.reset()
        else:
            self._start_idx = self._part_idx * self._num_shuffle_parts + \
                              self._perm[self._cur_chunk_idx]
            self._start_idx = self._start_idx * self._chunk_size
            if self._start_idx >= self._db_size:
                self.next_chunk()
            else:
                self._end_idx = self._start_idx + self._chunk_size
                self._end_idx = min(self._db_size, self._end_idx)
                self._cur_idx = self._start_idx
                self._db.set(str(self._cur_idx).zfill(self._db_zfill))

    def run(self):
        # Fix the random seed
        npr.seed(self._random_seed)

        # Init the database
        self._db = LMDB()
        self._db.open(self._source)
        self._db_size = int(self._db.get('size'))
        self._db_zfill = int(self._db.get('zfill'))
        self._epoch_size = int(self._db_size / self._num_parts + 1)

        # Search an optimal chunk size by the number of chunks
        if self._chunk_size == -1:
            max_chunk_size = self._db._total_size / (self._num_chunks * (1 << 20))
            min_chunk_size = 1
            while min_chunk_size * 2 < max_chunk_size:
                min_chunk_size *= 2
            self._chunk_size = min_chunk_size
        self._num_shuffle_parts = int(math.ceil(
            self._db._total_size * 1.1 /
            (self._num_parts * self._chunk_size << 20)))
        self._chunk_size = int(self._db_size / self._num_shuffle_parts / self._num_parts + 1)

        # Init the environment
        self.reset()

        # Run!
        while True:
            self.Q_out.put(self.element())
            self.next_record()
            if self._cur_idx >= self._end_idx:
                if self._use_shuffle:
                    self.next_chunk()
                else:
                    self.reset()