Example #1
    def run(self):
        """Start the process.

        Returns
        -------
        None

        """
        # fix seed
        npr.seed(self._random_seed)

        # init db
        self._db = LMDB()
        self._db.open(self._source)
        self._db_size = int(self._db.get('size'))
        self._db_zfill = int(self._db.get('zfill'))
        self._epoch_size = int(self._db_size / self._num_parts + 1)
        # search an optimal chunk size from the number of chunks
        if self._chunk_size == -1:
            max_chunk_size = self._db._total_size / ((self._num_chunks * (1 << 20)))
            min_chunk_size = 1
            while min_chunk_size * 2 < max_chunk_size: min_chunk_size *= 2
            self._chunk_size = min_chunk_size
        self._num_shuffle_parts = int(math.ceil(self._db._total_size * 1.1 /
                                               (self._num_parts * self._chunk_size << 20)))
        self._chunk_size = int(self._db_size / self._num_shuffle_parts / self._num_parts + 1)
        self._perm = npr.permutation(self._num_shuffle_parts)

        # init env
        self.reset()

        # run
        while True:
            self.Q_out.put(self.element())
            self.next_record()
            if self._cur_idx >= self._end_idx:
                if self._use_shuffle or self._use_step: self.next_chunk()
                else: self.reset()
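
The chunk-size search above budgets the database size across ``num_chunks`` chunks and rounds the per-chunk megabyte budget down to a power of two. A standalone sketch of the same heuristic (the function name and the 100 GB figure are illustrative, not from the source):

def pick_chunk_size(total_size_bytes, num_chunks=2048):
    # Per-chunk budget in MB, then round down to a power of two.
    max_chunk_size = total_size_bytes / (num_chunks * (1 << 20))
    chunk_size = 1
    while chunk_size * 2 < max_chunk_size:
        chunk_size *= 2
    return chunk_size

# A 100 GB database over 2048 chunks has a 50 MB budget per chunk,
# which rounds down to 32 MB:
print(pick_chunk_size(100 * (1 << 30)))  # 32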
Example #2
class DataReader(Process):
    """
    DataReader is deployed to queue encoded str from `LMDB`_.

    It adaptively partitions and shuffles records across all distributed nodes.
    """
    def __init__(self, **kwargs):
        """Construct a ``DataReader``.

        Parameters
        ----------
        source : str
            The path of database.
        shuffle : boolean
            Whether to shuffle the data.
        node_step : boolean
            Whether to split data for multiple parallel nodes.
        num_chunks : int
            The number of chunks to split. Default is ``2048``.
        chunk_size : int
            The size (MB) of each chunk. Default is ``-1`` (refer to ``num_chunks``).

        """
        super(DataReader, self).__init__()
        self._source = GetProperty(kwargs, 'source', '')
        self._use_shuffle = GetProperty(kwargs, 'shuffle', False)
        self._use_step = GetProperty(kwargs, 'node_step', False)
        self._num_chunks = GetProperty(kwargs, 'num_chunks', 2048)
        self._chunk_size = GetProperty(kwargs, 'chunk_size', -1)

        self._num_parts = 1
        self._part_idx = 0
        self._random_seed = config.GetRandomSeed()

        self._cur_idx = 0
        self._cur_chunk_idx = 0

        self.Q_out = None
        self.daemon = True

    def element(self):
        """Get the value of current record.

        Returns
        -------
        str
            The encoded str.

        """
        return self._db.value()

    def redirect(self, target_idx):
        """Redirect to the target position.

        Parameters
        ----------
        target_idx : int
            The key of instance in ``LMDB``.

        Returns
        -------
        None

        Notes
        -----
        The redirection reopens the ``LMDB``.

        You can drop caches by ``echo 3 > /proc/sys/vm/drop_caches``.

        This helps avoid getting stuck when ``Database Size`` >> ``RAM Size``.

        """
        self._db.close()
        self._db.open(self._source)
        self._cur_idx = target_idx
        self._db.set(str(self._cur_idx).zfill(self._db_zfill))

    def reset(self):
        """Reset the cursor and environment.

        Returns
        -------
        None

        """
        if self._use_shuffle or self._use_step:
            if self._use_shuffle:
                self._perm = npr.permutation(self._num_shuffle_parts)
            self._cur_chunk_idx = 0
            self._start_idx = int(self._part_idx * self._num_shuffle_parts +
                                  self._perm[self._cur_chunk_idx])
            self._start_idx = int(self._start_idx * self._chunk_size)
            if self._start_idx >= self._db_size: self.next_chunk()
            self._end_idx = self._start_idx + self._chunk_size
            self._end_idx = min(self._db_size, self._end_idx)
        else:
            self._start_idx = 0
            self._end_idx = self._db_size

        self.redirect(self._start_idx)

    def next_record(self):
        """Step the cursor of records.

        Returns
        -------
        None

        """
        self._cur_idx += 1
        self._db.next()

    def next_chunk(self):
        """Step the cursor of shuffling chunks.

        Returns
        -------
        None

        """
        self._cur_chunk_idx += 1
        if self._cur_chunk_idx >= self._num_shuffle_parts: self.reset()
        else:
            self._start_idx = self._part_idx * self._num_shuffle_parts + self._perm[
                self._cur_chunk_idx]
            self._start_idx = self._start_idx * self._chunk_size
            if self._start_idx >= self._db_size: self.next_chunk()
            else:
                self._end_idx = self._start_idx + self._chunk_size
                self._end_idx = min(self._db_size, self._end_idx)
            self.redirect(self._start_idx)

    def run(self):
        """Start the process.

        Returns
        -------
        None

        """
        # fix seed
        npr.seed(self._random_seed)

        # init db
        self._db = LMDB()
        self._db.open(self._source)
        self._db_size = int(self._db.get('size'))
        self._db_zfill = int(self._db.get('zfill'))
        self._epoch_size = int(self._db_size / self._num_parts + 1)
        # search an optimal chunk size from the number of chunks
        if self._chunk_size == -1:
            max_chunk_size = self._db._total_size / ((self._num_chunks *
                                                      (1 << 20)))
            min_chunk_size = 1
            while min_chunk_size * 2 < max_chunk_size:
                min_chunk_size *= 2
            self._chunk_size = min_chunk_size
        self._num_shuffle_parts = int(
            math.ceil(self._db._total_size * 1.1 /
                      (self._num_parts * self._chunk_size << 20)))
        self._chunk_size = int(self._db_size / self._num_shuffle_parts /
                               self._num_parts + 1)
        self._perm = npr.permutation(self._num_shuffle_parts)

        # init env
        self.reset()

        # run
        while True:
            self.Q_out.put(self.element())
            self.next_record()
            if self._cur_idx >= self._end_idx:
                if self._use_shuffle or self._use_step: self.next_chunk()
                else: self.reset()
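
Since ``Q_out`` is left as ``None`` in the constructor, the caller is expected to attach a queue before starting the process. A minimal usage sketch, assuming a ``multiprocessing.Queue`` and an illustrative database path:

from multiprocessing import Queue

reader = DataReader(source='/data/train_lmdb', shuffle=True)
reader.Q_out = Queue(maxsize=1024)  # capacity is an arbitrary choice here
reader.start()                      # launches the daemon process running run()

for _ in range(8):
    encoded = reader.Q_out.get()    # blocks until a record is queued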
Example #3
def make_db(image_path, label_path, database_path):
    if not os.path.isfile(label_path):
        raise ValueError('the input path is empty or invalid.')
    if os.path.isdir(database_path):
        raise ValueError('the database path already exists.')

    print('start time: ', time.strftime("%a, %d %b %Y %H:%M:%S",
                                        time.gmtime()))

    db = LMDB(max_commit=10000)
    db.open(database_path, mode='w')

    total_line = sum(1 for line in open(label_path))
    count = 0
    zfill_flag = '{0:0%d}' % (ZFILL)

    encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 95]

    start_time = time.time()

    with open(label_path, 'r') as input_file:
        for record in input_file:
            count += 1
            if count % 10000 == 0:
                now_time = time.time()
                print('{0} / {1} in {2:.2f} sec'.format(
                    count, total_line, now_time - start_time))
                db.commit()

            record = record.split()
            path = record[0]
            label = record[1]

            img = cv2.imread(os.path.join(image_path, path))
            result, imgencode = cv2.imencode('.jpg', img, encode_param)

            datum = caffe_pb2.Datum()
            datum.height, datum.width, datum.channels = img.shape
            datum.label = int(label)
            datum.encoded = True
            datum.data = imgencode.tostring()
            db.put(zfill_flag.format(count - 1), datum.SerializeToString())

    now_time = time.time()
    print('{0} / {1} in {2:.2f} sec'.format(count, total_line,
                                            now_time - start_time))
    db.put('size', wrapper_str(str(count)))
    db.put('zfill', wrapper_str(str(ZFILL)))
    db.commit()
    db.close()

    shutil.copy(label_path, database_path + '/image_list.txt')
    end_time = time.time()
    print('{0} images have been stored in the database.'.format(total_line))
    print('This task finishes within {0:.2f} seconds.'.format(end_time -
                                                              start_time))
    print('The size of database is {0} MB.'.format(
        float(os.path.getsize(database_path + '/data.mdb') / 1000 / 1000)))
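
Each line of the label file is expected to hold a relative image path followed by an integer label, since the loop splits on whitespace and takes the first two fields. A hypothetical invocation (all paths are illustrative):

# train_list.txt, one record per line:
#   cats/001.jpg 0
#   dogs/017.jpg 1
make_db(image_path='/data/images',
        label_path='/data/train_list.txt',
        database_path='/data/train_lmdb')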
Example #4
def make_db(args):
    if not os.path.isfile(args.list):
        raise ValueError('the path of the image list is invalid.')
    if os.path.isdir(args.database):
        raise ValueError('the database already exists or is invalid.')

    print('start time: ', time.strftime("%a, %d %b %Y %H:%M:%S",
                                        time.gmtime()))

    db = LMDB(max_commit=10000)
    db.open(args.database, mode='w')

    total_line = sum(1 for line in open(args.list))
    count = 0
    zfill_flag = '{0:0%d}' % (args.zfill)

    encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), args.quality]

    start_time = time.time()

    with open(args.list, 'r') as input_file:
        records = input_file.readlines()
        if args.shuffle:
            import random
            random.shuffle(records)

        for record in records:
            count += 1
            if count % 10000 == 0:
                now_time = time.time()
                print('{0} / {1} in {2:.2f} sec'.format(
                    count, total_line, now_time - start_time))
                db.commit()

            record = record.split()
            path = record[0]
            label = record[1]

            img = cv2.imread(os.path.join(args.root, path))
            if args.resize > 0:
                img = resize_image(img, args.resize)
            if args.pad > 0:
                pad_img = np.zeros((img.shape[0] + 2 * args.pad,
                                    img.shape[1] + 2 * args.pad, 3),
                                   dtype=img.dtype)
                pad_img[args.pad:args.pad + img.shape[0],
                        args.pad:args.pad + img.shape[1], :] = img
                img = pad_img
            result, imgencode = cv2.imencode('.jpg', img, encode_param)

            datum = caffe_pb2.Datum()
            datum.height, datum.width, datum.channels = img.shape
            datum.label = int(label)
            datum.encoded = True
            datum.data = imgencode.tostring()
            db.put(zfill_flag.format(count - 1), datum.SerializeToString())

    now_time = time.time()
    print('{0} / {1} in {2:.2f} sec'.format(count, total_line,
                                            now_time - start_time))
    db.put('size', str(count))
    db.put('zfill', str(args.zfill))
    db.commit()
    db.close()

    shutil.copy(args.list, args.database + '/image_list.txt')
    end_time = time.time()
    print('{0} images have been stored in the database.'.format(total_line))
    print('This task finishes within {0:.2f} seconds.'.format(end_time -
                                                              start_time))
    print('The size of database is {0} MB.'.format(
        float(os.path.getsize(args.database + '/data.mdb') / 1000 / 1000)))
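
This variant pulls every option from an ``args`` namespace. A minimal argparse wiring that supplies each attribute the function reads (the flag names mirror the attributes; the defaults are assumptions, not a published CLI):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--root', required=True, help='root folder of raw images')
parser.add_argument('--list', required=True, help='path of the image list file')
parser.add_argument('--database', required=True, help='output database path')
parser.add_argument('--zfill', type=int, default=8, help='zeros used to encode keys')
parser.add_argument('--quality', type=int, default=95, help='JPEG quality, 1-100')
parser.add_argument('--resize', type=int, default=0, help='shortest edge; 0 disables')
parser.add_argument('--pad', type=int, default=0, help='zero padding on each side')
parser.add_argument('--shuffle', action='store_true', help='shuffle the image list')

make_db(parser.parse_args())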
Example #5
def make_db(args):
    """Make the sequential database for images.

    Parameters
    ----------
    database : str
        The path of database.
    root : str
        The root folder of raw images.
    list : str
        The path of image list file.
    resize : int
        The size of the shortest edge. Default is ``0`` (Disabled).
    zfill : int
        The number of zeros for encoding keys.
    quality : int
        JPEG quality for encoding, 1-100. Default is ``95``.
    shuffle : boolean
        Whether to randomize the record order in the list file.

    """
    if not os.path.isfile(args.list):
        raise ValueError('the path of the image list is invalid.')
    if os.path.isdir(args.database):
        raise ValueError('the database already exists or is invalid.')

    print('start time: ', time.strftime("%a, %d %b %Y %H:%M:%S",
                                        time.gmtime()))

    db = LMDB(max_commit=10000)
    db.open(args.database, mode='w')

    total_line = sum(1 for line in open(args.list))
    count = 0
    zfill_flag = '{0:0%d}' % (args.zfill)

    encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), args.quality]

    start_time = time.time()

    with open(args.list, 'r') as input_file:
        records = input_file.readlines()
        if args.shuffle:
            import random
            random.shuffle(records)

        for record in records:
            count += 1
            if count % 10000 == 0:
                now_time = time.time()
                print('{0} / {1} in {2:.2f} sec'.format(
                    count, total_line, now_time - start_time))
                db.commit()

            record = record.split()
            path = record[0]
            label = record[1]

            img = cv2.imread(os.path.join(args.root, path))
            if args.resize > 0:
                img = resize_image(img, args.resize)
            result, imgencode = cv2.imencode('.jpg', img, encode_param)

            datum = caffe_pb2.Datum()
            datum.height, datum.width, datum.channels = img.shape
            datum.label = int(label)
            datum.encoded = True
            datum.data = imgencode.tostring()
            db.put(zfill_flag.format(count - 1), datum.SerializeToString())

    now_time = time.time()
    print('{0} / {1} in {2:.2f} sec'.format(count, total_line,
                                            now_time - start_time))
    db.put('size', str(count))
    db.put('zfill', str(args.zfill))
    db.commit()
    db.close()

    shutil.copy(args.list, args.database + '/image_list.txt')
    end_time = time.time()
    print('{0} images have been stored in the database.'.format(total_line))
    print('This task finishes within {0:.2f} seconds.'.format(end_time -
                                                              start_time))
    print('The size of database is {0} MB.'.format(
        float(os.path.getsize(args.database + '/data.mdb') / 1000 / 1000)))
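
Records are keyed by their zero-based index, left-padded to ``zfill`` digits, so LMDB's lexicographic key order matches insertion order. For example, with ``zfill=8``:

zfill_flag = '{0:0%d}' % 8      # the same template as above
print(zfill_flag.format(42))    # '00000042'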
Example #6
class DataReader(Process):
    """DataReader is deployed to queue encoded str from `LMDB`_.

    It adaptively partitions and shuffles records across all distributed nodes.

    """
    def __init__(self, **kwargs):
        """Construct a ``DataReader``.

        Parameters
        ----------
        source : str
            The path of database.
        multiple_nodes : boolean, optional, default=False
            Whether to split data for multiple parallel nodes.
        shuffle : bool, optional, default=False
            Whether to shuffle the data.
        num_chunks : int, optional, default=2048
            The number of chunks to split.
        chunk_size : int, optional, default=-1
            The size (MB) of each chunk.

        """
        super(DataReader, self).__init__()
        self._source = kwargs.get('source', '')
        self._multiple_nodes = kwargs.get('multiple_nodes', False)
        self._use_shuffle = kwargs.get('shuffle', False)
        self._use_instance_chunk = kwargs.get('instance_chunk', False)
        self._num_chunks = kwargs.get('num_chunks', 2048)
        self._chunk_size = kwargs.get('chunk_size', -1)

        self._part_idx, self._num_parts = 0, 1
        self._cur_idx, self._cur_chunk_idx = 0, 0
        self._random_seed = config.GetRandomSeed()

        self.Q_out = None
        self.daemon = True

    def element(self):
        """Get the value of current record.

        Returns
        -------
        str
            The encoded str.

        """
        return self._db.value()

    def redirect(self, target_idx):
        """Redirect to the target position.

        Parameters
        ----------
        target_idx : int
            The key of instance in ``LMDB``.

        Returns
        -------
        None

        Notes
        -----
        The redirection reopens the ``LMDB``.

        You can drop caches by ``echo 3 > /proc/sys/vm/drop_caches``.

        This helps avoid getting stuck when ``Database Size`` >> ``RAM Size``.

        """
        self._db.close()
        self._db.open(self._source)
        self._cur_idx = target_idx
        self._db.set(str(self._cur_idx).zfill(self._zfill))

    def reset(self):
        """Reset the cursor and environment.

        Returns
        -------
        None

        """
        if self._multiple_nodes or self._use_shuffle:
            if self._use_shuffle:
                self._perm = npr.permutation(self._num_shuffle_parts)
            self._cur_chunk_idx = 0
            self._start_idx = int(self._part_idx * self._num_shuffle_parts +
                                  self._perm[self._cur_chunk_idx])
            self._start_idx = int(self._start_idx * self._chunk_size)
            if self._start_idx >= self._num_entries: self.next_chunk()
            self._end_idx = self._start_idx + self._chunk_size
            self._end_idx = min(self._num_entries, self._end_idx)
        else:
            self._start_idx = 0
            self._end_idx = self._num_entries

        self.redirect(self._start_idx)

    def next_record(self):
        """Step the cursor of records.

        Returns
        -------
        None

        """
        self._cur_idx += 1
        self._db.next()

    def next_chunk(self):
        """Step the cursor of shuffling chunks.

        Returns
        -------
        None

        """
        self._cur_chunk_idx += 1
        if self._cur_chunk_idx >= self._num_shuffle_parts: self.reset()
        else:
            self._start_idx = self._part_idx * self._num_shuffle_parts + self._perm[
                self._cur_chunk_idx]
            self._start_idx = self._start_idx * self._chunk_size
            if self._start_idx >= self._num_entries: self.next_chunk()
            else:
                self._end_idx = self._start_idx + self._chunk_size
                self._end_idx = min(self._num_entries, self._end_idx)
            self.redirect(self._start_idx)

    def run(self):
        """Start the process.

        Returns
        -------
        None

        """
        # fix seed
        npr.seed(self._random_seed)

        # init db
        self._db = LMDB()
        self._db.open(self._source)
        self._zfill = self._db.zfill()
        self._num_entries = self._db.num_entries()
        self._epoch_size = int(self._num_entries / self._num_parts + 1)

        if self._use_shuffle:
            if self._chunk_size == 1:
                # Each chunk has at most 1 record [For Fully Shuffle]
                self._chunk_size, self._num_shuffle_parts = \
                    1, int(self._num_entries / self._num_parts) + 1
            else:
                if self._chunk_size == -1:
                    # Search an optimal chunk size from the number of chunks [For Chunk Shuffle]
                    max_chunk_size = self._db._total_size / (
                        (self._num_chunks * (1 << 20)))
                    min_chunk_size = 1
                    while min_chunk_size * 2 < max_chunk_size:
                        min_chunk_size *= 2
                    self._chunk_size = min_chunk_size
                    self._num_shuffle_parts = int(
                        math.ceil(self._db._total_size * 1.1 /
                                  (self._num_parts * self._chunk_size << 20)))
                    self._chunk_size = int(self._num_entries /
                                           self._num_shuffle_parts /
                                           self._num_parts + 1)
                    limit = (self._num_parts -
                             0.5) * self._num_shuffle_parts * self._chunk_size
                    if self._num_entries <= limit:
                        # Roll back to fully shuffle
                        self._chunk_size, self._num_shuffle_parts = \
                            1, int(self._num_entries / self._num_parts) + 1
        else:
            # Each chunk has at most K records [For Multiple Nodes]
            # Note that if ``shuffle`` and ``multiple_nodes`` are both ``False``,
            # ``chunk_size`` and ``num_shuffle_parts`` are meaningless
            self._chunk_size = int(self._num_entries / self._num_parts) + 1
            self._num_shuffle_parts = 1

        self._perm = np.arange(self._num_shuffle_parts)

        # Init env
        self.reset()

        # Run!
        while True:
            self.Q_out.put(self.element())
            self.next_record()
            if self._cur_idx >= self._end_idx:
                if self._multiple_nodes or self._use_shuffle:
                    self.next_chunk()
                else:
                    self.reset()
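
The branches in ``run`` pick a shuffling granularity from the constructor arguments. A hedged sketch of the three modes (the source path is an assumption):

# Fully shuffle: every chunk holds a single record.
reader = DataReader(source='/data/train_lmdb', shuffle=True, chunk_size=1)

# Chunk shuffle: the chunk size is searched from num_chunks.
reader = DataReader(source='/data/train_lmdb', shuffle=True, num_chunks=2048)

# Sequential reading, partitioned across multiple parallel nodes.
reader = DataReader(source='/data/train_lmdb', multiple_nodes=True)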
Example #7
    def run(self):
        """Start the process.

        Returns
        -------
        None

        """
        # fix seed
        npr.seed(self._random_seed)

        # init db
        self._db = LMDB()
        self._db.open(self._source)
        self._zfill = self._db.zfill()
        self._num_entries = self._db.num_entries()
        self._epoch_size = int(self._num_entries / self._num_parts + 1)

        if self._use_shuffle:
            if self._chunk_size == 1:
                # Each chunk has at most 1 record [For Fully Shuffle]
                self._chunk_size, self._num_shuffle_parts = \
                    1, int(self._num_entries / self._num_parts) + 1
            else:
                if self._chunk_size == -1:
                    # Search an optimal chunk size from the number of chunks [For Chunk Shuffle]
                    max_chunk_size = self._db._total_size / (
                        (self._num_chunks * (1 << 20)))
                    min_chunk_size = 1
                    while min_chunk_size * 2 < max_chunk_size:
                        min_chunk_size *= 2
                    self._chunk_size = min_chunk_size
                    self._num_shuffle_parts = int(
                        math.ceil(self._db._total_size * 1.1 /
                                  (self._num_parts * self._chunk_size << 20)))
                    self._chunk_size = int(self._num_entries /
                                           self._num_shuffle_parts /
                                           self._num_parts + 1)
                    limit = (self._num_parts -
                             0.5) * self._num_shuffle_parts * self._chunk_size
                    if self._num_entries <= limit:
                        # Roll back to fully shuffle
                        self._chunk_size, self._num_shuffle_parts = \
                            1, int(self._num_entries / self._num_parts) + 1
        else:
            # Each chunk has at most K records [For Multiple Nodes]
            # Note that if ``shuffle`` and ``multiple_nodes`` are both ``False``,
            # ``chunk_size`` and ``num_shuffle_parts`` are meaningless
            self._chunk_size = int(self._num_entries / self._num_parts) + 1
            self._num_shuffle_parts = 1

        self._perm = np.arange(self._num_shuffle_parts)

        # Init env
        self.reset()

        # Run!
        while True:
            self.Q_out.put(self.element())
            self.next_record()
            if self._cur_idx >= self._end_idx:
                if self._multiple_nodes or self._use_shuffle:
                    self.next_chunk()
                else:
                    self.reset()
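
A worked trace of the chunk-shuffle arithmetic above, under assumed figures (a 2 GiB database holding 1,000,000 records on a single node, with the default 2048 chunks):

import math

total_size, num_entries, num_parts = 2 * (1 << 30), 1000000, 1
max_chunk_size = total_size / (2048 * (1 << 20))   # 1.0 MB budget per chunk
chunk_mb = 1                                       # the power-of-two search stays at 1
parts = int(math.ceil(total_size * 1.1 / (num_parts * chunk_mb << 20)))  # 2253
chunk_records = int(num_entries / parts / num_parts + 1)                 # 444
limit = (num_parts - 0.5) * parts * chunk_records                        # 500166.0
assert num_entries > limit  # above the limit: chunk shuffle is kept, no rollback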
Example #8
    def get_db_size(self):
        self._db = LMDB()
        self._db.open(self._source)
        db_size = int(self._db.get('size'))
        self._db.close()
        return db_size
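
The ``'size'`` entry read here is one of the metadata keys written by the ``make_db`` examples above. A sketch of reading both metadata keys back with the same ``LMDB`` wrapper (the path is illustrative):

db = LMDB()
db.open('/data/train_lmdb')
num_records = int(db.get('size'))   # total record count
key_width = int(db.get('zfill'))    # digit width of the record keys
db.close()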
Example #9
def make_db(images_list, database_path):
    if os.path.isdir(database_path):
        raise ValueError('the database path already exists.')

    print('start time: ', time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()))

    db = LMDB(max_commit=10000)
    db.open(database_path, mode='w')

    total_line = len(images_list)
    count = 0
    zfill_flag = '{0:0%d}' % (ZFILL)

    start_time = time.time()

    for record in images_list:
        count += 1
        if count % 10000 == 0:
            now_time = time.time()
            print('{0} / {1} in {2:.2f} sec'.format(
                count, total_line, now_time - start_time))
            db.commit()

        img = record[0]
        label = record[1]

        datum = caffe_pb2.Datum()
        datum.height, datum.width, datum.channels = img.shape
        datum.label = int(label)
        datum.encoded = False
        datum.data = img.tostring()
        db.put(zfill_flag.format(count - 1), datum.SerializeToString())

    now_time = time.time()
    print('{0} / {1} in {2:.2f} sec'.format(count, total_line, now_time - start_time))
    db.put('size', wrapper_str(str(count)))
    db.put('zfill', wrapper_str(str(ZFILL)))
    db.commit()
    db.close()

    end_time = time.time()
    print('{0} images have been stored in the database.'.format(total_line))
    print('This task finishes within {0:.2f} seconds.'.format(
        end_time - start_time))
    print('The size of database is {0} MB.'.format(
        float(os.path.getsize(database_path + '/data.mdb') / 1000 / 1000)))
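
Because ``encoded`` is ``False`` here, ``datum.data`` holds raw pixel bytes rather than a JPEG stream. A sketch of recovering the array from such a record (``serialized`` stands for a value read back from the database; ``uint8`` pixels are assumed, matching what ``cv2.imread`` produces):

datum = caffe_pb2.Datum()
datum.ParseFromString(serialized)  # deserialize the stored record
img = np.frombuffer(datum.data, dtype=np.uint8).reshape(
    datum.height, datum.width, datum.channels)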
Example #10
def make_db(images_list, database_path, pad=0):
    if os.path.isdir(database_path):
        raise ValueError('the database path already exists.')

    print('start time: ', time.strftime("%a, %d %b %Y %H:%M:%S",
                                        time.gmtime()))

    db = LMDB(max_commit=10000)
    db.open(database_path, mode='w')

    total_line = len(images_list)
    count = 0
    zfill_flag = '{0:0%d}' % (ZFILL)

    start_time = time.time()

    for record in images_list:
        count += 1
        if count % 10000 == 0:
            now_time = time.time()
            print('{0} / {1} in {2:.2f} sec'.format(count, total_line,
                                                    now_time - start_time))
            db.commit()

        img = record[0]
        label = record[1]
        if pad > 0:
            pad_img = np.zeros(
                (img.shape[0] + 2 * pad, img.shape[1] + 2 * pad, 3),
                dtype=np.uint8)
            pad_img[pad:pad + img.shape[0], pad:pad + img.shape[1], :] = img
            img = pad_img

        datum = caffe_pb2.Datum()
        datum.height, datum.width, datum.channels = img.shape
        datum.label = int(label)
        datum.encoded = False
        datum.data = img.tostring()
        db.put(zfill_flag.format(count - 1), datum.SerializeToString())

    now_time = time.time()
    print('{0} / {1} in {2:.2f} sec'.format(count, total_line,
                                            now_time - start_time))
    db.put('size', wrapper_str(str(count)))
    db.put('zfill', wrapper_str(str(ZFILL)))
    db.commit()
    db.close()

    end_time = time.time()
    print('{0} images have been stored in the database.'.format(total_line))
    print('This task finishes within {0:.2f} seconds.'.format(end_time -
                                                              start_time))
    print('The size of database is {0} MB.'.format(
        float(os.path.getsize(database_path + '/data.mdb') / 1000 / 1000)))
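
The zero-padding grows each spatial edge by ``2 * pad`` before the record is serialized, e.g. for later random cropping. With ``pad=4``, a 32x32x3 image is stored as 40x40x3:

import numpy as np

img, pad = np.ones((32, 32, 3), dtype=np.uint8), 4
pad_img = np.zeros((img.shape[0] + 2 * pad, img.shape[1] + 2 * pad, 3),
                   dtype=np.uint8)
pad_img[pad:pad + img.shape[0], pad:pad + img.shape[1], :] = img
print(pad_img.shape)  # (40, 40, 3)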
Example #11
class DataReader(Process):
    def __init__(self, **kwargs):
        super(DataReader, self).__init__()
        self._source = GetProperty(kwargs, 'source', '')
        self._use_shuffle = GetProperty(kwargs, 'shuffle', False)
        self._use_step = GetProperty(kwargs, 'node_step', False)
        self._num_chunks = GetProperty(kwargs, 'num_chunks', 2048)
        self._chunk_size = GetProperty(kwargs, 'chunk_size', -1)

        self._num_parts = 1
        self._part_idx = 0
        self._random_seed = config.GetRandomSeed()

        self._cur_idx = 0
        self._cur_chunk_idx = 0

        self.Q_out = None
        self.daemon = True

        def cleanup():
            logger.info('Terminating DataReader......')
            self.terminate()
            self.join()

        import atexit
        atexit.register(cleanup)

    def element(self):
        return self._db.value()

    def reset(self):
        if self._use_shuffle:
            self._cur_chunk_idx = 0
            self._perm = npr.permutation(self._num_shuffle_parts)
            self._start_idx = int(self._part_idx * self._num_shuffle_parts +
                                  self._perm[self._cur_chunk_idx])
            self._start_idx = int(self._start_idx * self._chunk_size)
            if self._start_idx >= self._db_size: self.next_chunk()
            self._end_idx = self._start_idx + self._chunk_size
            self._end_idx = min(self._db_size, self._end_idx)
            # self._part_idx = (self._part_idx + 1) % self._num_parts  # a fast hard disk drive is required

        elif self._use_step:
            self._start_idx = self._part_idx * self._epoch_size
            self._end_idx = self._start_idx + self._epoch_size
            self._end_idx = min(self._db_size, self._end_idx)
            # self._part_idx = (self._part_idx + 1) % self._num_parts  # a fast hard disk drive is required
        else:
            self._start_idx = 0
            self._end_idx = self._db_size

        self._cur_idx = self._start_idx
        self._db.set(str(self._cur_idx).zfill(self._db_zfill))

    def next_record(self):
        self._cur_idx += 1
        self._db.next()

    def next_chunk(self):
        self._cur_chunk_idx += 1
        if self._cur_chunk_idx >= self._num_shuffle_parts: self.reset()
        else:
            self._start_idx = self._part_idx * self._num_shuffle_parts + self._perm[
                self._cur_chunk_idx]
            self._start_idx = self._start_idx * self._chunk_size
            if self._start_idx >= self._db_size: self.next_chunk()
            else:
                self._end_idx = self._start_idx + self._chunk_size
                self._end_idx = min(self._db_size, self._end_idx)
            self._cur_idx = self._start_idx
            self._db.set(str(self._cur_idx).zfill(self._db_zfill))

    def run(self):
        # fix seed
        npr.seed(self._random_seed)

        # init db
        self._db = LMDB()
        self._db.open(self._source)
        self._db_size = int(self._db.get('size'))
        self._db_zfill = int(self._db.get('zfill'))
        self._epoch_size = int(self._db_size / self._num_parts + 1)
        # search an optimal chunk size from the number of chunks
        if self._chunk_size == -1:
            max_chunk_size = self._db._total_size / ((self._num_chunks *
                                                      (1 << 20)))
            min_chunk_size = 1
            while min_chunk_size * 2 < max_chunk_size:
                min_chunk_size *= 2
            self._chunk_size = min_chunk_size
        self._num_shuffle_parts = int(
            math.ceil(self._db._total_size * 1.1 /
                      (self._num_parts * self._chunk_size << 20)))
        self._chunk_size = int(self._db_size / self._num_shuffle_parts /
                               self._num_parts + 1)

        # init env
        self.reset()

        # run !
        while True:
            self.Q_out.put(self.element())
            self.next_record()
            if self._cur_idx >= self._end_idx:
                if self._use_shuffle: self.next_chunk()
                else: self.reset()
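
Under ``node_step``, each node reads a contiguous slice of roughly ``db_size / num_parts`` records, as computed in ``reset``. A worked example with assumed figures (1000 records split over 4 nodes):

db_size, num_parts = 1000, 4
epoch_size = int(db_size / num_parts + 1)  # 251
for part_idx in range(num_parts):
    start = part_idx * epoch_size
    end = min(db_size, start + epoch_size)
    print(part_idx, start, end)  # slices [0,251), [251,502), [502,753), [753,1000)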