def CreateTestData(uri):
  writer = py_pert.StringTableWriter()
  writer.Open(uri, 4)  # 4 output shards
  for i in range(10000):
    d = '%05d' % i
    writer.Add(d, d)  # key and value are the same zero-padded string
  return
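# A minimal read-back sketch for the table written above. `CheckTestData` is a
# hypothetical name, and it assumes the same py_pert bindings used elsewhere in
# this file (StringTableShardSetReader supports Open(uri) and iteration over
# (key, value) pairs, as in the Run method below):
def CheckTestData(uri):
  reader = py_pert.StringTableShardSetReader()
  reader.Open(uri)
  count = 0
  for k, v in reader:
    assert k == v  # CreateTestData writes identical keys and values
    count += 1
  assert count == 10000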
def Run(self):
  reader = py_pert.StringTableShardSetReader()
  reader.Open(self.GetInput('images').GetUri())
  image_ids = []
  for i, (k, v) in enumerate(reader):
    image_ids.append(ParseUint64Key(k))
  LOG(INFO, 'creating match groups')
  match_groups = []  # a list of tuples (primary_id, secondary id list)
  widgets = [Percentage(), ' ', Bar(), ' ', ETA()]
  pbar = ProgressBar(widgets=widgets, maxval=len(image_ids)).start()
  for i in range(len(image_ids)):
    primary_id = image_ids[i]
    # pair every image against all others, split into bounded-size batches
    secondary_ids = list(image_ids)
    secondary_ids.remove(primary_id)
    for secondary_id_chunk in chunks(secondary_ids, self.max_batch_size):
      match_groups.append((primary_id, secondary_id_chunk))
    pbar.update(i)
  # write out the match plan (must later be sorted by key for the future join stage)
  writer = py_pert.StringTableWriter()
  options = py_pert.WriterOptions()
  options.SetUnsorted()
  LOG(INFO, 'writing match groups')
  CHECK(writer.Open(self.GetOutput('unsorted_match_batches').GetUri(), 1, options))
  pbar = ProgressBar(widgets=widgets, maxval=len(match_groups)).start()
  for batch_id, (batch_primary_image, batch_image_ids) in enumerate(match_groups):
    if len(batch_image_ids) == 0:
      continue
    batch_name = py_base.Uint64ToKey(batch_id)
    # one metadata record for the batch's primary image...
    metadata = iw_pb2.MatchBatchMetadata()
    metadata.image_id = batch_primary_image
    metadata.batch_name = batch_name
    metadata.is_primary = True
    writer.Add(py_base.Uint64ToKey(metadata.image_id), metadata.SerializeToString())
    # ...followed by one record per secondary image in the batch
    for image_id in batch_image_ids:
      metadata.image_id = image_id
      metadata.batch_name = batch_name
      metadata.is_primary = False
      writer.Add(py_base.Uint64ToKey(metadata.image_id), metadata.SerializeToString())
    pbar.update(batch_id)
  return
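# `chunks` is referenced above but not defined in this snippet. A minimal
# sketch, assuming it simply splits a list into consecutive pieces of at most
# `n` elements (the last piece may be shorter):
def chunks(items, n):
  for i in range(0, len(items), n):
    yield items[i:i + n]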
def test_string_table():
  filename = "local:///home/ubuntu/Desktop/test_string_table"
  person = test_pb2.Person()
  person.first_name = 'foo'
  person.last_name = 'bar'
  writer = pert.StringTableWriter()
  writer.Open(filename, 1)  # single shard
  writer.Add('key1', person.SerializeToString())
  writer.Add('key2', person.SerializeToString())
  writer.Close()
  # read the records back and parse the serialized protos
  reader = pert.StringTableReader()
  reader.Open(filename)
  for k, v in reader:
    my_person = test_pb2.Person()
    my_person.ParseFromString(v)
    print "key %s value %s" % (k, my_person)
  return
def Run(self):
  uri = self.outputs['mg'].GetUri()
  writer = py_pert.StringTableWriter()
  writer.Open(uri, py_pert.WriterOptions("memcmp"))
  return
def PackImagesDirectoryToPert(src_path, output_uri):
  CHECK(os.path.isdir(src_path), 'expected dir: %s' % src_path)
  # generate the filename cache if it doesn't yet exist
  images_filename_cache = '%s/filename_cache.txt' % (src_path)
  # force regen
  #if os.path.exists(images_filename_cache):
  #  os.remove(images_filename_cache)
  if not os.path.exists(images_filename_cache):
    print 'creating filename cache'
    # get a sorted list of jpeg files
    filenames = glob.glob('%s/*.jpg' % src_path)
    filenames.sort()
    filenames_file = open(images_filename_cache, 'w')
    for filename in filenames:
      filebase = os.path.basename(filename)
      filenames_file.write('%s\n' % filebase)
    filenames_file.close()
  else:
    print 'using existing filename cache'
  num_files = NumLinesInFile(images_filename_cache)
  filenames_file = open(images_filename_cache, 'r')
  key_size_bytes = 32
  block_size_mb = 0.5
  num_shards = 10
  desired_bloom_error_rate = 0.005
  num_files_per_shard = long(float(num_files) / num_shards)
  num_blocks_per_shard = 4000
  index_bytes_per_block = key_size_bytes + 8 * 3
  index_bytes_per_shard = index_bytes_per_block * num_blocks_per_shard
  # estimate the memory cost of active blocks, indices, and bloom filters
  sample_pos_keys, sample_neg_keys = GetSampleShardKeys(images_filename_cache, num_shards)
  num_bits_tuned = py_pert.TuneRequiredBits(sample_pos_keys, sample_neg_keys, desired_bloom_error_rate) * num_shards
  num_megabytes_tuned = BitsToMegabytes(num_bits_tuned)
  print 'num_megabytes_tuned: %f' % num_megabytes_tuned
  num_megabytes_active_blocks = block_size_mb * num_shards
  num_megabytes_indices = BytesToMegabytes(index_bytes_per_shard * num_shards)
  num_megabytes_bloom_filters = num_megabytes_tuned
  print 'num_megabytes_active_blocks: %f' % num_megabytes_active_blocks
  print 'num_megabytes_indices: %f' % num_megabytes_indices
  print 'num_megabytes_bloom_filters: %f' % num_megabytes_bloom_filters
  options = py_pert.WriterOptions()
  options.SetBlockSize(long(1048576 * block_size_mb))  # bytes per block
  options.SetSorted('memcmp')
  options.SetBloomFilterBitsPerShard(num_bits_tuned)
  writer = py_pert.StringTableWriter()
  writer.Open(output_uri, num_shards, options)
  widgets = ['Exporting to %s: ' % output_uri, Percentage(), ' ', Bar(), ' ', ETA(), ' ']
  pbar = ProgressBar(widgets=widgets, maxval=num_files).start()
  for i, filename in enumerate(filenames_file):
    filename = filename.strip()
    if len(filename) != 36:  # expect a 32-char hash plus '.jpg'
      LOG(WARNING, 'skipping invalid hash format file: %s' % filename)
      continue
    # remove the '.jpg' bit to recover the hash key
    hash_key = filename[:-4]
    data = open(src_path + "/" + filename).read()
    writer.Add(hash_key, data)
    pbar.update(i)
  pbar.finish()
  writer.Close()
  return
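# The helpers NumLinesInFile, BitsToMegabytes, and BytesToMegabytes are used
# above but not defined in this snippet. Minimal sketches, assuming they are
# the obvious counting and unit conversions (GetSampleShardKeys is omitted
# because its sampling strategy is not recoverable from this code alone):
def NumLinesInFile(path):
  count = 0
  for line in open(path):
    count += 1
  return count

def BitsToMegabytes(num_bits):
  return num_bits / 8.0 / 1048576.0

def BytesToMegabytes(num_bytes):
  return num_bytes / 1048576.0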