Exemplo n.º 1
0
    def run(self):
        """Process entry point: stream records from the LMDB source into Q_out.

        Seeds numpy's RNG, opens the source database, derives the per-part
        epoch size and a chunk size (searched as a power of two when unset),
        then loops forever putting records on ``self.Q_out``.  Never returns.
        """
        # Fix the RNG seed so shuffling is reproducible per worker.
        npr.seed(self._random_seed)

        # Open the source database and read its bookkeeping entries.
        self._db = LMDB()
        self._db.open(self._source)
        self._db_size = int(self._db.get('size'))    # total record count
        self._db_zfill = int(self._db.get('zfill'))  # zero-pad width of keys
        self._epoch_size = int(self._db_size / self._num_parts + 1)
        # When chunk_size is unset (-1), pick the largest power of two not
        # exceeding total_size / (num_chunks MB).
        if self._chunk_size == -1:
            max_chunk_size = self._db._total_size / ((self._num_chunks *
                                                      (1 << 20)))
            min_chunk_size = 1
            while min_chunk_size * 2 < max_chunk_size:
                min_chunk_size *= 2
            self._chunk_size = min_chunk_size
        # Size the shuffle partition to cover ~110% of the database bytes,
        # then re-derive chunk_size as records per (part, shuffle-part).
        self._num_shuffle_parts = int(
            math.ceil(self._db._total_size * 1.1 /
                      (self._num_parts * self._chunk_size << 20)))
        self._chunk_size = int(self._db_size / self._num_shuffle_parts /
                               self._num_parts + 1)

        # Initialize cursor state before streaming.
        self.reset()

        # Main loop: emit one record per iteration; when the current index
        # range is exhausted, advance to the next chunk (shuffled mode) or
        # rewind to the beginning.
        while True:
            self.Q_out.put(self.element())
            self.next_record()
            if self._cur_idx >= self._end_idx:
                if self._use_shuffle: self.next_chunk()
                else: self.reset()
Exemplo n.º 2
0
def make_db(image_path, label_path, database_path):
    """Build an LMDB database of JPEG-encoded images from a list file.

    Parameters
    ----------
    image_path : str
        Root folder containing the raw images.
    label_path : str
        Path of a whitespace-separated ``<relative_path> <label>`` list file.
    database_path : str
        Destination directory for the new database (must not exist yet).

    Raises
    ------
    ValueError
        If the list file is missing, the database already exists, or an
        image in the list cannot be read.
    """
    if not os.path.isfile(label_path):
        raise ValueError('input path is empty or wrong.')
    if os.path.isdir(database_path):
        raise ValueError('the database path is already exist.')

    print('start time: ', time.strftime("%a, %d %b %Y %H:%M:%S",
                                        time.gmtime()))

    db = LMDB(max_commit=10000)
    db.open(database_path, mode='w')

    # Count lines via a context manager so the file handle is not leaked.
    with open(label_path) as f:
        total_line = sum(1 for _ in f)
    count = 0
    zfill_flag = '{0:0%d}' % (ZFILL)

    encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 95]

    start_time = time.time()

    with open(label_path, 'r') as input_file:
        for record in input_file:
            count += 1
            if count % 10000 == 0:
                now_time = time.time()
                print('{0} / {1} in {2:.2f} sec'.format(
                    count, total_line, now_time - start_time))
                db.commit()  # periodically flush pending puts

            record = record.split()
            path = record[0]
            label = record[1]

            img = cv2.imread(os.path.join(image_path, path))
            if img is None:
                # cv2.imread returns None on failure; fail with a clear
                # message instead of an opaque AttributeError below.
                raise ValueError('could not read image: ' +
                                 os.path.join(image_path, path))
            result, imgencode = cv2.imencode('.jpg', img, encode_param)

            datum = caffe_pb2.Datum()
            datum.height, datum.width, datum.channels = img.shape
            datum.label = int(label)
            datum.encoded = True
            datum.data = imgencode.tobytes()  # tostring() is deprecated
            db.put(zfill_flag.format(count - 1), datum.SerializeToString())

    now_time = time.time()
    print('{0} / {1} in {2:.2f} sec'.format(count, total_line,
                                            now_time - start_time))
    # Record bookkeeping entries used by readers ('size', 'zfill').
    db.put('size', wrapper_str(str(count)))
    db.put('zfill', wrapper_str(str(ZFILL)))
    db.commit()
    db.close()

    # Keep a copy of the source list alongside the database for provenance.
    shutil.copy(label_path, database_path + '/image_list.txt')
    end_time = time.time()
    print('{0} images have been stored in the database.'.format(total_line))
    print('This task finishes within {0:.2f} seconds.'.format(end_time -
                                                              start_time))
    print('The size of database is {0} MB.'.format(
        float(os.path.getsize(database_path + '/data.mdb') / 1000 / 1000)))
Exemplo n.º 3
0
def make_db(images_list, database_path, pad=0):
    """Build an LMDB database from in-memory ``(image, label)`` pairs.

    Parameters
    ----------
    images_list : sequence of (numpy.ndarray, int)
        Each record is a raw HxWxC image array and its integer label.
    database_path : str
        Destination directory for the new database (must not exist yet).
    pad : int, optional
        Width of a zero border added on every side of each image.
        Default is ``0`` (disabled).

    Raises
    ------
    ValueError
        If the database path already exists.
    """
    if os.path.isdir(database_path):
        raise ValueError('the database path is already exist.')

    print('start time: ', time.strftime("%a, %d %b %Y %H:%M:%S",
                                        time.gmtime()))

    db = LMDB(max_commit=10000)
    db.open(database_path, mode='w')

    total_line = len(images_list)
    count = 0
    zfill_flag = '{0:0%d}' % (ZFILL)

    start_time = time.time()

    for record in images_list:
        count += 1
        if count % 10000 == 0:
            now_time = time.time()
            print('{0} / {1} in {2:.2f} sec'.format(count, total_line,
                                                    now_time - start_time))
            db.commit()  # periodically flush pending puts

        img = record[0]
        label = record[1]
        if pad > 0:
            # Preserve the input's dtype and channel count rather than
            # hard-coding uint8/3 channels, so non-uint8 images are not
            # silently cast during padding.
            pad_img = np.zeros(
                (img.shape[0] + 2 * pad, img.shape[1] + 2 * pad,
                 img.shape[2]),
                dtype=img.dtype)
            pad_img[pad:pad + img.shape[0], pad:pad + img.shape[1], :] = img
            img = pad_img

        datum = caffe_pb2.Datum()
        datum.height, datum.width, datum.channels = img.shape
        datum.label = int(label)
        datum.encoded = False          # raw pixels, not JPEG
        datum.data = img.tobytes()     # tostring() is deprecated
        db.put(zfill_flag.format(count - 1), datum.SerializeToString())

    now_time = time.time()
    print('{0} / {1} in {2:.2f} sec'.format(count, total_line,
                                            now_time - start_time))
    # Record bookkeeping entries used by readers ('size', 'zfill').
    db.put('size', wrapper_str(str(count)))
    db.put('zfill', wrapper_str(str(ZFILL)))
    db.commit()
    db.close()

    end_time = time.time()
    print('{0} images have been stored in the database.'.format(total_line))
    print('This task finishes within {0:.2f} seconds.'.format(end_time -
                                                              start_time))
    print('The size of database is {0} MB.'.format(
        float(os.path.getsize(database_path + '/data.mdb') / 1000 / 1000)))
Exemplo n.º 4
0
def make_db(args):
    """Build an LMDB database of JPEG-encoded, optionally resized/padded images.

    Parameters
    ----------
    args : argparse.Namespace
        Expected attributes: ``list`` (image list file), ``database``
        (output path), ``root`` (image root folder), ``zfill`` (key width),
        ``quality`` (JPEG quality), ``resize`` (shortest edge, 0 disables),
        ``pad`` (border width, 0 disables), ``shuffle`` (randomize order).

    Raises
    ------
    ValueError
        If the list file is missing, the database already exists, or an
        image in the list cannot be read.
    """
    if not os.path.isfile(args.list):
        raise ValueError('the path of image list is invalid.')
    if os.path.isdir(args.database):
        raise ValueError('the database is already exist or invalid.')

    print('start time: ', time.strftime("%a, %d %b %Y %H:%M:%S",
                                        time.gmtime()))

    db = LMDB(max_commit=10000)
    db.open(args.database, mode='w')

    # Count lines via a context manager so the file handle is not leaked.
    with open(args.list) as f:
        total_line = sum(1 for _ in f)
    count = 0
    zfill_flag = '{0:0%d}' % (args.zfill)

    encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), args.quality]

    start_time = time.time()

    with open(args.list, 'r') as input_file:
        records = input_file.readlines()
        if args.shuffle:
            import random
            random.shuffle(records)

        for record in records:
            count += 1
            if count % 10000 == 0:
                now_time = time.time()
                print('{0} / {1} in {2:.2f} sec'.format(
                    count, total_line, now_time - start_time))
                db.commit()  # periodically flush pending puts

            record = record.split()
            path = record[0]
            label = record[1]

            img = cv2.imread(os.path.join(args.root, path))
            if img is None:
                # cv2.imread returns None on failure; fail with a clear
                # message instead of an opaque AttributeError below.
                raise ValueError('could not read image: ' +
                                 os.path.join(args.root, path))
            if args.resize > 0:
                img = resize_image(img, args.resize)
            if args.pad > 0:
                # Zero-pad a border of ``args.pad`` pixels on every side.
                pad_img = np.zeros((img.shape[0] + 2 * args.pad,
                                    img.shape[1] + 2 * args.pad, 3),
                                   dtype=img.dtype)
                pad_img[args.pad:args.pad + img.shape[0],
                        args.pad:args.pad + img.shape[1], :] = img
                img = pad_img
            result, imgencode = cv2.imencode('.jpg', img, encode_param)

            datum = caffe_pb2.Datum()
            datum.height, datum.width, datum.channels = img.shape
            datum.label = int(label)
            datum.encoded = True
            datum.data = imgencode.tobytes()  # tostring() is deprecated
            db.put(zfill_flag.format(count - 1), datum.SerializeToString())

    now_time = time.time()
    print('{0} / {1} in {2:.2f} sec'.format(count, total_line,
                                            now_time - start_time))
    # Record bookkeeping entries used by readers ('size', 'zfill').
    db.put('size', str(count))
    db.put('zfill', str(args.zfill))
    db.commit()
    db.close()

    # Keep a copy of the source list alongside the database for provenance.
    shutil.copy(args.list, args.database + '/image_list.txt')
    end_time = time.time()
    print('{0} images have been stored in the database.'.format(total_line))
    print('This task finishes within {0:.2f} seconds.'.format(end_time -
                                                              start_time))
    print('The size of database is {0} MB.'.format(
        float(os.path.getsize(args.database + '/data.mdb') / 1000 / 1000)))
Exemplo n.º 5
0
def make_db(args):
    """Make the sequential database for images.

    Parameters
    ----------
    args : argparse.Namespace
        Expected attributes:

        database : str
            The path of database.
        root : str
            The root folder of raw images.
        list : str
            The path of image list file.
        resize : int
            The size of the shortest edge. Default is ``0`` (Disabled).
        zfill : int
            The number of zeros for encoding keys.
        quality : int
            JPEG quality for encoding, 1-100. Default is ``95``.
        shuffle : boolean
            Whether to randomize the order in list file.

    Raises
    ------
    ValueError
        If the list file is missing, the database already exists, or an
        image in the list cannot be read.

    """
    if not os.path.isfile(args.list):
        raise ValueError('the path of image list is invalid.')
    if os.path.isdir(args.database):
        raise ValueError('the database is already exist or invalid.')

    print('start time: ', time.strftime("%a, %d %b %Y %H:%M:%S",
                                        time.gmtime()))

    db = LMDB(max_commit=10000)
    db.open(args.database, mode='w')

    # Count lines via a context manager so the file handle is not leaked.
    with open(args.list) as f:
        total_line = sum(1 for _ in f)
    count = 0
    zfill_flag = '{0:0%d}' % (args.zfill)

    encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), args.quality]

    start_time = time.time()

    with open(args.list, 'r') as input_file:
        records = input_file.readlines()
        if args.shuffle:
            import random
            random.shuffle(records)

        for record in records:
            count += 1
            if count % 10000 == 0:
                now_time = time.time()
                print('{0} / {1} in {2:.2f} sec'.format(
                    count, total_line, now_time - start_time))
                db.commit()  # periodically flush pending puts

            record = record.split()
            path = record[0]
            label = record[1]

            img = cv2.imread(os.path.join(args.root, path))
            if img is None:
                # cv2.imread returns None on failure; fail with a clear
                # message instead of an opaque AttributeError below.
                raise ValueError('could not read image: ' +
                                 os.path.join(args.root, path))
            if args.resize > 0:
                img = resize_image(img, args.resize)
            result, imgencode = cv2.imencode('.jpg', img, encode_param)

            datum = caffe_pb2.Datum()
            datum.height, datum.width, datum.channels = img.shape
            datum.label = int(label)
            datum.encoded = True
            datum.data = imgencode.tobytes()  # tostring() is deprecated
            db.put(zfill_flag.format(count - 1), datum.SerializeToString())

    now_time = time.time()
    print('{0} / {1} in {2:.2f} sec'.format(count, total_line,
                                            now_time - start_time))
    # Record bookkeeping entries used by readers ('size', 'zfill').
    db.put('size', str(count))
    db.put('zfill', str(args.zfill))
    db.commit()
    db.close()

    # Keep a copy of the source list alongside the database for provenance.
    shutil.copy(args.list, args.database + '/image_list.txt')
    end_time = time.time()
    print('{0} images have been stored in the database.'.format(total_line))
    print('This task finishes within {0:.2f} seconds.'.format(end_time -
                                                              start_time))
    print('The size of database is {0} MB.'.format(
        float(os.path.getsize(args.database + '/data.mdb') / 1000 / 1000)))
Exemplo n.º 6
0
    def run(self):
        """Start the process.

        Streams records from the LMDB source into ``self.Q_out`` forever.
        Chooses between three chunking strategies: fully shuffled
        (chunk_size == 1), chunk-shuffled (chunk_size searched when -1, with
        a rollback to fully shuffled when the database is too small), and
        unshuffled multi-node partitioning.

        Returns
        -------
        None

        """
        # Fix the RNG seed so shuffling is reproducible per worker.
        npr.seed(self._random_seed)

        # Open the source database and read its bookkeeping entries.
        self._db = LMDB()
        self._db.open(self._source)
        self._zfill = self._db.zfill()
        self._num_entries = self._db.num_entries()
        self._epoch_size = int(self._num_entries / self._num_parts + 1)

        if self._use_shuffle:
            if self._chunk_size == 1:
                # Each chunk has at most 1 record [For Fully Shuffle]
                self._chunk_size, self._num_shuffle_parts = \
                    1, int(self._num_entries / self._num_parts) + 1
            else:
                if self._use_shuffle and self._chunk_size == -1:
                    # Search a optimal chunk size by chunks [For Chunk Shuffle]:
                    # largest power of two <= total_size / (num_chunks MB).
                    max_chunk_size = self._db._total_size / (
                        (self._num_chunks * (1 << 20)))
                    min_chunk_size = 1
                    while min_chunk_size * 2 < max_chunk_size:
                        min_chunk_size *= 2
                    self._chunk_size = min_chunk_size
                    # Cover ~110% of the database bytes, then re-derive the
                    # chunk size as records per (part, shuffle-part).
                    self._num_shuffle_parts = int(
                        math.ceil(self._db._total_size * 1.1 /
                                  (self._num_parts * self._chunk_size << 20)))
                    self._chunk_size = int(self._num_entries /
                                           self._num_shuffle_parts /
                                           self._num_parts + 1)
                    limit = (self._num_parts -
                             0.5) * self._num_shuffle_parts * self._chunk_size
                    if self._num_entries <= limit:
                        # Roll back to fully shuffle
                        self._chunk_size, self._num_shuffle_parts = \
                            1, int(self._num_entries / self._num_parts) + 1
        else:
            # Each chunk has at most K records [For Multiple Nodes]
            # Note that if ``shuffle`` and ``multiple_nodes`` are all ``False``,
            # ``chunk_size`` and ``num_shuffle_parts`` are meaningless
            self._chunk_size = int(self._num_entries / self._num_parts) + 1
            self._num_shuffle_parts = 1

        # Identity permutation over shuffle parts; reshuffled by next_chunk().
        self._perm = np.arange(self._num_shuffle_parts)

        # Init env
        self.reset()

        # Run!  Emit one record per iteration; when the current index range
        # is exhausted, advance to the next chunk or rewind.
        while True:
            self.Q_out.put(self.element())
            self.next_record()
            if self._cur_idx >= self._end_idx:
                if self._multiple_nodes or \
                    self._use_shuffle:
                    self.next_chunk()
                else:
                    self.reset()
Exemplo n.º 7
0
 def get_db_size(self):
     """Open the source database and return its 'size' entry as an int.

     Returns
     -------
     int
         The number of records stored under the ``'size'`` key.

     """
     self._db = LMDB()
     self._db.open(self._source)
     try:
         # Read inside try/finally so the handle is closed even when the
         # 'size' entry is missing or malformed (original leaked it).
         db_size = int(self._db.get('size'))
     finally:
         self._db.close()
     return db_size