Example No. 1
def _generate_leveldb(file_path, image_paths, targets, width, height):
    """
    Caffe uses the LevelDB format to efficiently load its training and validation data; this method
    writes paired out faces in an efficient way into this format.
    """
    print "\t\tGenerating LevelDB file at %s..." % file_path
    shutil.rmtree(file_path, ignore_errors=True)
    db = plyvel.DB(file_path, create_if_missing=True)
    wb = db.write_batch()
    commit_every = 10000
    start_time = int(round(time.time() * 1000))
    for idx in range(len(image_paths)):
        # Each image is a top level key with a keyname like 00000000011, in increasing
        # order starting from 00000000000.
        key = utils.get_key(idx)

        # Do common normalization that might happen across both testing and validation.
        try:
            image = _preprocess_data(
                _load_numpy_image(image_paths[idx], width, height))
        except Exception:
            print "\t\t\tWarning: Unable to process leveldb image %s" % image_paths[
                idx]
            continue

        # Each entry in the leveldb is a Caffe protobuffer "Datum" object containing details.
        datum = Datum()
        datum.channels = 3  # RGB
        datum.height = height
        datum.width = width
        datum.data = image.tostring()
        datum.label = targets[idx]
        value = datum.SerializeToString()
        wb.put(key, value)

        if (idx + 1) % commit_every == 0:
            wb.write()
            del wb
            wb = db.write_batch()
            end_time = int(round(time.time() * 1000))
            total_time = end_time - start_time
            print "\t\t\tWrote batch, key: %s, time for batch: %d ms" % (
                key, total_time)
            start_time = int(round(time.time() * 1000))

    end_time = int(round(time.time() * 1000))
    total_time = end_time - start_time
    print "\t\t\tWriting final batch, time for batch: %d ms" % total_time
    wb.write()
    db.close()
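The utils.get_key helper is not included in this snippet; a minimal sketch of a helper producing the zero-padded keys described in the comments (e.g. 00000000011), assuming an 11-character key width, could look like:

def get_key(idx):
    # Zero-pad the index to a fixed width so LevelDB's lexicographic key
    # ordering matches the numeric insertion order.
    return "%011d" % idx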
Example No. 2
    def _generate_leveldb(self, file_path, pairs, target, single_data):
        """
    Caffe uses the LevelDB format to efficiently load its training and validation data; this method
    writes paired out faces in an efficient way into this format.
    """
        print "\tGenerating LevelDB file at %s..." % file_path
        shutil.rmtree(file_path, ignore_errors=True)
        db = plyvel.DB(file_path, create_if_missing=True)
        wb = db.write_batch()
        commit_every = 250000
        start_time = int(round(time.time() * 1000))
        for idx in range(len(pairs)):
            # Each image pair is a top level key with a keyname like 00000000011, in increasing
            # order starting from 00000000000.
            key = siamese_utils.get_key(idx)

            # Actually expand our images now, taking the index reference and turning it into real
            # image pairs; we delay doing this until now for efficiency reasons, as we will probably
            # have more pairs of images than actual computer memory.
            image_1 = single_data[pairs[idx][0]]
            image_2 = single_data[pairs[idx][1]]
            paired_image = np.concatenate([image_1, image_2])

            # Do things like mean normalize, etc. that happen across both testing and validation.
            paired_image = self._preprocess_data(paired_image)

            # Each entry in the leveldb is a Caffe protobuffer "Datum" object containing details.
            datum = Datum()
            # One channel for each image in the pair.
            datum.channels = 2
            datum.height = constants.HEIGHT
            datum.width = constants.WIDTH
            datum.data = paired_image.tostring()
            datum.label = target[idx]
            value = datum.SerializeToString()
            wb.put(key, value)

            if (idx + 1) % commit_every == 0:
                wb.write()
                del wb
                wb = db.write_batch()
                end_time = int(round(time.time() * 1000))
                total_time = end_time - start_time
                print "Wrote batch, key: %s, time for batch: %d ms" % (
                    key, total_time)
                start_time = int(round(time.time() * 1000))

        wb.write()
        db.close()
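For reference, a hedged sketch of reading one of these paired entries back and splitting it into its two images; the caffe.proto.caffe_pb2 import path and the uint8 pixel dtype are assumptions, not shown in the snippet above.

import numpy as np
import plyvel
from caffe.proto.caffe_pb2 import Datum

def read_first_pair(file_path):
    # Decode the first LevelDB entry back into the two stacked single-channel
    # face images that were concatenated before being written above.
    db = plyvel.DB(file_path)
    try:
        key, value = next(db.iterator())
        datum = Datum()
        datum.ParseFromString(value)
        flat = np.fromstring(datum.data, dtype=np.uint8)  # assumes uint8 pixels
        pair = flat.reshape(datum.channels, datum.height, datum.width)
        return pair[0], pair[1], datum.label
    finally:
        db.close()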
Example No. 3
def make_data(param):
    for phase in ['train', 'valid', 'test']:
        print 'Starting %s' % phase
        db_name = './examples/language_model/lm_%s_db' % phase
        subprocess.call(['rm', '-rf', db_name])
        env = lmdb.open(db_name, map_size=2147483648 * 8)

        def vocab_transform(target_input):
            def t_foo(x):
                # Map out-of-vocabulary ids onto the unknown symbol.
                return x if x < param['unknown_symbol'] else param['unknown_symbol']

            # Convert the sentence to ints, truncate it to maximum_length, then
            # pad it out to maximum_length with the zero symbol.
            target_line = [t_foo(int(x))
                           for x in target_input.split(' ')[:param['maximum_length']]]
            target_line += [param['zero_symbol']] * (param['maximum_length'] - len(target_line))
            assert len(target_line) == param['maximum_length']
            return target_line

        allX = []
        with open('./data/language_model/%s_indices.txt' % phase, 'r') as f1:
            for en in f1.readlines():
                allX.append(vocab_transform(en))

        print 'Writing %s sentences' % len(allX)

        with env.begin(write=True) as txn:
            for i, target_line in enumerate(allX):
                datum = Datum()
                datum.channels = 2 * param['maximum_length']
                datum.width = 1
                datum.height = 1
                # First maximum_length values: the sentence shifted right by one
                # and prefixed with the start symbol (the model's input).
                for j in range(param['maximum_length']):
                    if j == 0:
                        datum.float_data.append(param['start_symbol'])
                    else:
                        datum.float_data.append(target_line[j - 1])
                # Second maximum_length values: the unshifted sentence (the target).
                for j in range(param['maximum_length']):
                    datum.float_data.append(target_line[j])
                key = str(i)
                txn.put(key, datum.SerializeToString())
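A hedged sketch of reading one of these sentences back out of the LMDB and splitting float_data into the two halves written above; the caffe.proto.caffe_pb2 import path is an assumption.

import lmdb
from caffe.proto.caffe_pb2 import Datum

def read_sentence(db_name, index, maximum_length):
    # Fetch the entry keyed by str(index) and split its 2 * maximum_length
    # float values into the shifted decoder input and the target sentence.
    env = lmdb.open(db_name, readonly=True)
    with env.begin() as txn:
        datum = Datum()
        datum.ParseFromString(txn.get(str(index)))
        values = list(datum.float_data)
        return values[:maximum_length], values[maximum_length:]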
Example No. 4
def _generate_leveldb(self, file_path, image, target, single_data):
    """
    Caffe uses the LevelDB format to efficiently load its training and validation data; this method
    writes paired out faces in an efficient way into this format.
    """
    print "\tGenerating LevelDB file at %s..." % file_path
    shutil.rmtree(file_path, ignore_errors=True)
    db = plyvel.DB(file_path, create_if_missing=True)
    wb = db.write_batch()
    commit_every = 250000
    start_time = int(round(time.time() * 1000))
    for idx in range(len(image)):
      # Each image is a top level key with a keyname like 00000000011, in increasing
      # order starting from 00000000000.
      key = utils.get_key(idx)

      # Do things like mean normalize, etc. that happen across both testing and validation.
      preprocessed_image = self._preprocess_data(image[idx])

      # Each entry in the leveldb is a Caffe protobuffer "Datum" object containing details.
      datum = Datum()
      # TODO(neuberg): Confirm that this is the correct way to set up RGB images for
      # Caffe for our dataset.
      datum.channels = 3
      datum.height = constants.HEIGHT
      datum.width = constants.WIDTH
      datum.data = preprocessed_image.tostring()
      datum.label = target[idx]
      value = datum.SerializeToString()
      wb.put(key, value)

      if (idx + 1) % commit_every == 0:
        wb.write()
        del wb
        wb = db.write_batch()
        end_time = int(round(time.time() * 1000))
        total_time = end_time - start_time
        print "Wrote batch, key: %s, time for batch: %d ms" % (key, total_time)
        start_time = int(round(time.time() * 1000))

    wb.write()
    db.close()
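The _preprocess_data method is not part of this snippet; a minimal mean-normalization sketch of what such a step might do (the uint8 pixel dtype and the recentering constant are assumptions):

import numpy as np

def _preprocess_data(self, data):
    # Hypothetical preprocessing step: mean-subtract and recenter the pixels,
    # then cast back to unsigned bytes before serializing into the Datum.
    data = data.astype(np.float32)
    data -= data.mean()
    data += 128.0
    return np.clip(data, 0, 255).astype(np.uint8)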
Example No. 5
            datum.height, datum.width = submap.shape
            datum.channels = 1

        datum.float_data.extend(list(submap.flatten()))
        if mean_blob is None:
            mean_blob = BlobProto()
            mean_blob.height = datum.height
            mean_blob.width = datum.width
            mean_blob.channels = datum.channels
            mean_blob.num = 1
            img_mean = submap
        else:
            img_mean += submap

        datum.label = 0
        if not txn.put(key, datum.SerializeToString(), dupdata=False):
            print 'Key {}: failed.'.format(key)

        n += 1
        if n % 1000 == 0:
            txn.commit()
            print "Proccessed {} samples.".format(n)
            txn = db.begin(write=True)

    # commit last batch
    if n % 1000 != 0:
        txn.commit()
        print "Proccessed {} samples.".format(n)
    img_mean /= len(maps)
    print "Totally proccessed {} samples.".format(n)