def LoadImageRegionGraph(uri):
  irg = iw_pb2.ImageRegionGraph()
  reader = py_pert.StringTableReader()
  CHECK(reader.Open(uri))
  ok, tmp = reader.GetMetadata("num_edges")
  CHECK(ok, "this doesn't appear to be an irg uri: %s" % (uri))
  num_edges = long(tmp)
  ok, tmp = reader.GetMetadata("num_vertices")
  CHECK(ok, "this doesn't appear to be an irg uri: %s" % (uri))
  num_vertices = long(tmp)
  CHECK_EQ(reader.Entries(), num_edges + num_vertices)
  # load edges
  progress = iwutil.MakeProgressBar(num_edges)
  for i in range(num_edges):
    ok, key, value = reader.Next()
    CHECK(ok)
    CHECK_EQ(key[0], 'e')
    irg.edge.add().ParseFromString(value)
    progress.update(i)
  # load vertices
  progress = iwutil.MakeProgressBar(num_vertices)
  for i in range(num_vertices):
    ok, key, value = reader.Next()
    CHECK(ok)
    CHECK_EQ(key[0], 'v')
    irg.vertex.add().ParseFromString(value)
    progress.update(i)
  return irg
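# Minimal usage sketch for LoadImageRegionGraph; the pert URI is hypothetical,
# not a path used elsewhere in this codebase.
def DemoLoadImageRegionGraph():
  irg = LoadImageRegionGraph('local://tmp/image_region_graph.pert')  # hypothetical URI
  print 'loaded %d vertices and %d edges' % (len(irg.vertex), len(irg.edge))
  return irg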
def __ImportImageArchive(self, extract_dir, dataset_root):
  image_filenames = []
  print 'Searching for image files...'
  for root, dirnames, filenames in os.walk(extract_dir):
    for filename in filenames:
      if fnmatch.fnmatch(filename, '*.jpg') or fnmatch.fnmatch(filename, '*.jpeg'):
        image_filenames.append(os.path.join(root, filename))
  if len(image_filenames) <= 2:
    return False
  fingerprinted_path = '%s/fingerprinted/' % (extract_dir)
  os.mkdir(fingerprinted_path)
  print 'Fingerprinting image files...'
  progress = util.MakeProgressBar(len(image_filenames))
  # rename all files according to fingerprint
  for i, filename in enumerate(image_filenames):
    data = open(filename, 'rb').read()
    fp = py_base.FingerprintString(data)
    # zero-pad so that lexical sort = numeric sort = key sort
    dst = '%s/%064d.jpg' % (fingerprinted_path, fp)
    os.rename(filename, dst)
    progress.update(i)
  filenames = glob.glob('%s/*.jpg' % fingerprinted_path)
  filenames.sort()
  output_uri = 'local://%s/photoid_to_image.pert' % (dataset_root)
  # write to pert in sorted order
  print 'Generating image PERT file...'
  writer = py_pert.StringTableWriter()
  CHECK(writer.Open(output_uri, 1))
  progress = util.MakeProgressBar(len(filenames))
  for i, filename in enumerate(filenames):
    data = open(filename, 'rb').read()
    key = py_base.Uint64ToKey(py_base.FingerprintString(data))
    try:
      im = Image.open(StringIO.StringIO(data))
    except IOError as e:
      LOG(INFO, 'Error opening %s - %s' % (filename, e))
      continue
    width, height = im.size
    jpeg = iw_pb2.JpegImage()
    jpeg.data = data
    jpeg.width = width
    jpeg.height = height
    CHECK(jpeg.IsInitialized())
    writer.Add(key, jpeg.SerializeToString())
    progress.update(i)
  writer.Close()
  return True
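# Why the '%064d' zero-padding above gives "lexical sort = numeric sort = key
# sort": fixed-width decimal strings compare character by character in the same
# order as their numeric values, so glob() + sort() returns the fingerprinted
# files in key order. A self-contained sanity check of that assumption:
def _CheckPaddedSortMatchesNumericSort():
  fingerprints = [987654321, 42, 5000000000000000000]
  padded = ['%064d' % fp for fp in fingerprints]
  assert sorted(padded) == ['%064d' % fp for fp in sorted(fingerprints)]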
def ExportGraphJson(self):
  base_path = '%s/graph/' % self.output_path
  if os.path.exists(base_path):
    return
  LOG(INFO, 'exporting graph json...')
  os.mkdir(base_path)
  g = self.image_graph.graph
  progress = iwutil.MakeProgressBar(g.vcount())
  for i, vertex in enumerate(g.vs):
    vertex_index = vertex.index
    image_id = vertex['image_id']
    # need str because js can't handle 64 bit ints
    data = {'image_id': JSPad(image_id)}
    neighbors = [JSPad(n['image_id']) for n in vertex.neighbors()]
    weights = [g.es[g.get_eid(vertex_index, n.index)]['weight_as_similarity']
               for n in vertex.neighbors()]
    data['neighbors'] = neighbors
    tide_label = None
    try:
      tide_label = self.image_graph.tide.imageid_to_label[image_id]
    except KeyError:
      pass
    if tide_label:
      data['tide_object_id'] = tide_label.object_id
      data['tide_object_label'] = tide_label.label
    data['weights'] = weights
    filename = '%s/%s.json' % (base_path, JSPad(image_id))
    open(filename, 'w').write(json.dumps(data))
    progress.update(i)
  return
def ExportMatchesJson(self):
  base_path = '%s/matches/' % (self.output_path)
  if os.path.exists(base_path):
    return
  LOG(INFO, 'exporting match json...')
  os.mkdir(base_path)
  reader = py_pert.StringTableReader()
  CHECK(reader.Open(self.matches_uri))
  progress = iwutil.MakeProgressBar(reader.Entries())
  match_result = iw_pb2.GeometricMatchResult()
  for i, (k, v) in enumerate(reader):
    image_a_id, image_b_id = iwutil.ParseUint64KeyPair(k)
    match_result.ParseFromString(v)
    if not match_result.matches:
      continue
    filename = '%s/%s_%s.json' % (base_path, JSPad(image_a_id), JSPad(image_b_id))
    f = open(filename, 'w')
    data = {}
    CHECK_LT(image_a_id, image_b_id)
    # use strings because js can't handle 64 bit ints
    data['image_a'] = JSPad(image_a_id)
    data['image_b'] = JSPad(image_b_id)
    data['image_a_size'] = self.imageid_to_size[image_a_id]
    data['image_b_size'] = self.imageid_to_size[image_b_id]
    matches = []
    for match in match_result.matches:
      for c in match.correspondences:
        match_info = [c.a.pos.x, c.a.pos.y, c.a.radius,
                      c.b.pos.x, c.b.pos.y, c.b.radius]
        matches.append(match_info)
    data['matches'] = matches
    f.write(json.dumps(data))
    f.close()
    progress.update(i)
  return
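# Shape of each file written by ExportMatchesJson (field values illustrative;
# JSPad is assumed to render a uint64 id as a zero-padded decimal string):
# {
#   "image_a": "<padded id>", "image_b": "<padded id>",
#   "image_a_size": [width, height], "image_b_size": [width, height],
#   "matches": [[a_x, a_y, a_radius, b_x, b_y, b_radius], ...]
# }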
def __init__(self, param_data_uri):
  self.num_configs = None
  self.rows = []
  self.configs_per_page = 10
  self.num_pages = None
  self.exclude_rows = ['corresondence_filter', 'type', 'dataset_name',
                       'visual_vocabulary_uri', 'max_image_replication_factor',
                       'max_match_batch_size', 'max_vertex_degree',
                       'dataset name']
  reader = py_pert.StringTableReader()
  CHECK(reader.Open(param_data_uri))
  configs = []
  progress = iwutil.MakeProgressBar(reader.Entries())
  result = lpbench_pb2.ConfigurationResult()
  for i, (k, v) in enumerate(reader):
    result.ParseFromString(v)
    config = itergraph_pb2.IterGraphParams()
    config.CopyFrom(result.config.iter_graph)
    configs.append(config)
    progress.update(i)
    #if i > 0:
    #  break
  self.__BuildTable(configs)
  return
def __init__(self, image_graph_uri, tide_uri):
  self.tide = None
  if tide_uri:
    #self.tide = tide.GetCachedTideDataset(tide_uri)
    self.tide = tide.TideDataset(tide_uri)
  ok, image_graph_data = py_imagegraph.LoadImageGraph(image_graph_uri)
  CHECK(ok)
  num_images = len(image_graph_data.vertices)
  # load proto data into an igraph graph object
  LOG(INFO, 'loading vertices')
  g = igraph.Graph(num_images, directed=False)
  self.vertexid_to_imageid = [v.image_id for v in image_graph_data.vertices]
  self.imageid_to_vertexid = {}
  for vertex_id, image_id in enumerate(self.vertexid_to_imageid):
    self.imageid_to_vertexid[image_id] = vertex_id
  for node_id, node in enumerate(image_graph_data.vertices):
    image_id = self.vertexid_to_imageid[node_id]
    node = g.vs[node_id]
    node['image_id'] = image_id
    if self.tide:
      label_data = self.tide.GetLabel(image_id)
      if label_data is not None:
        node['object_id'] = label_data.object_id
        node['label'] = label_data.label
  num_edges = len(image_graph_data.edges)
  progress = iwutil.MakeProgressBar(num_edges)
  prev_edges = set()
  LOG(INFO, 'loading edges')
  edge_list = []
  edge_weight_values = []
  edge_nfa_values = []
  for edge_id, edge in enumerate(image_graph_data.edges):
    # skip duplicate edges and edges below the similarity threshold
    if (edge.src, edge.dst) in prev_edges:
      continue
    if edge.weight < 0.15:
      continue
    new_edge = (edge.src, edge.dst)
    prev_edges.add(new_edge)
    edge_list.append(new_edge)
    edge_weight_values.append(edge.weight)
    edge_nfa_values.append(edge.nfa)
    progress.update(edge_id)
  LOG(INFO, 'adding edges')
  g.add_edges(edge_list)
  LOG(INFO, 'adding edge properties')
  g.es['weight_as_similarity'] = edge_weight_values
  g.es['nfa'] = edge_nfa_values
  self.graph = g
  return
def ComputeCbirPerformanceStats(tide_uri, query_results_uri):
  tide = TideDataset(tide_uri)
  reader = py_pert.StringTableShardSetReader()
  CHECK(reader.Open(query_results_uri))
  results = cbir_pb2.QueryResults()
  imageid_to_ap = {}
  progress = iwutil.MakeProgressBar(reader.Entries())
  for i, (k, v) in enumerate(reader):
    progress.update(i)
    query_image_id = iwutil.KeyToUint64(k)
    query_label = tide.GetLabel(query_image_id)
    if query_label is None or query_label.object_id is None:
      LOG(FATAL, 'unknown class for image: %d' % (query_image_id))
    query_object_id = query_label.object_id
    results.ParseFromString(v)
    query_object_num_pos = tide.NumPosImages(query_object_id)
    prev_score = float('inf')
    sum_precision = 0.0
    num_correct = 0.0
    rank = 0.0
    # drop "junk" results: images from the query's own class labeled neg/none
    non_junk_results = []
    num_junk = 0
    for result in results.entries:
      label = tide.GetLabel(result.image_id)
      if label and label.object_id == query_object_id and (label.label == 'neg' or label.label == 'none'):
        num_junk += 1
      else:
        non_junk_results.append(result)
    for result in non_junk_results:
      rank += 1
      # check invariant: results are ordered by decreasing score
      CHECK_LE(result.score, prev_score)
      prev_score = result.score
      result_label = tide.GetLabel(result.image_id)
      if not result_label or result_label.object_id is None:
        LOG(FATAL, 'unknown class: %d' % (result.image_id))
      if result_label.object_id == query_object_id:
        num_correct += 1.0
        precision = num_correct / rank
        sum_precision += precision
    average_precision = sum_precision / query_object_num_pos
    imageid_to_ap[query_image_id] = average_precision
  # compute mean average precision, overall and per object class
  mean_average_precision = np.mean(imageid_to_ap.values())
  object_mean_average_precision = {}
  for tide_object in tide.objectid_to_object.itervalues():
    object_mean_average_precision[tide_object.name] = np.mean(
        [imageid_to_ap[i] for i in tide_object.GetImageIds()])
  return mean_average_precision, object_mean_average_precision
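# Toy illustration of the average-precision loop above: precision is sampled
# only at ranks where a relevant result appears, and the sum is normalized by
# the total number of positives for the class, not by the number retrieved.
def _ToyAveragePrecision(ranked_is_relevant, num_positives):
  num_correct = 0.0
  sum_precision = 0.0
  for rank, is_relevant in enumerate(ranked_is_relevant, 1):
    if is_relevant:
      num_correct += 1.0
      sum_precision += num_correct / rank
  return sum_precision / num_positives

# relevant results at ranks 1 and 3, with 2 positives total: (1/1 + 2/3) / 2 = 5/6
assert abs(_ToyAveragePrecision([True, False, True], 2) - 5.0 / 6.0) < 1e-9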
def GetImageSizes(uri):
  imageid_to_size = {}
  reader = py_pert.StringTableShardSetReader()
  CHECK(reader.Open(uri))
  jpeg_image = iw_pb2.JpegImage()
  progress = iwutil.MakeProgressBar(reader.Entries())
  for i, (k, v) in enumerate(reader):
    image_id = py_base.KeyToUint64(k)
    jpeg_image.ParseFromString(v)
    imageid_to_size[image_id] = (jpeg_image.width, jpeg_image.height)
    progress.update(i)
  return imageid_to_size
def GetUriSplitInfo(uri):
  # TODO(heathkh): Can you make this faster by collecting max block size and
  # entries without fully opening the reader?
  uris = py_pert.GetShardUris(uri)
  num_entries = 0
  max_block_size = 0
  progress = iwutil.MakeProgressBar(len(uris))
  for i, shard_uri in enumerate(uris):
    reader = py_pert.StringTableShardReader()
    CHECK(reader.Open(shard_uri))
    max_block_size = max(max_block_size, reader.MaxBlockSize())
    num_entries += reader.Entries()
    progress.update(i)
  return num_entries, max_block_size
def CreateIndex(features_uri, index_uri):
  reader = py_pert.StringTableShardSetReader()
  CHECK(reader.Open(features_uri))
  features = iw_pb2.ImageFeatures()
  include_keypoints = True
  index = py_cbir.FeatureIndex(include_keypoints)
  progress = iwutil.MakeProgressBar(reader.Entries())
  for i, (k, v) in enumerate(reader):
    image_id = iwutil.KeyToUint64(k)
    features.ParseFromString(v)
    index.Add(image_id, features)
    progress.update(i)
  progress.finish()
  index.Build()
  return index.Save(index_uri)
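# Usage sketch for CreateIndex with hypothetical URIs; the return value is
# whatever py_cbir.FeatureIndex.Save reports, assumed truthy on success.
def DemoCreateIndex():
  features_uri = 'local://tmp/features.pert'  # hypothetical
  index_uri = 'local://tmp/cbir_index'        # hypothetical
  CHECK(CreateIndex(features_uri, index_uri))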
def main():
  crop_fraction = 0.05
  base_uri = 'local://home/ubuntu/Desktop/vol-7f209e0c/itergraph/tide_v08_distractors/'
  orig_image_uri = '%s/photoid_to_image.pert' % (base_uri)
  tide_uri = '%s/objectid_to_object.pert' % (base_uri)
  new_tide_uri = '%s/cropped_objectid_to_object.pert' % (base_uri)
  #cropped_image_uri = '%s/cropped_scaled_photoid_to_image.pert' % (base_uri)
  orig_sizes_dict = GetCachedImageSizes(orig_image_uri)
  reader = py_pert.StringTableReader()
  writer = py_pert.ProtoTableWriter()
  tide_object = tide_pb2.Object()
  CHECK(writer.Open(tide_object, new_tide_uri, 1))
  CHECK(reader.Open(tide_uri))
  progress = iwutil.MakeProgressBar(reader.Entries())
  for i, (k, v) in enumerate(reader):
    tide_object.ParseFromString(v)
    CHECK(tide_object.IsInitialized())
    # adjust the bounding boxes of all the photos
    for photo in tide_object.photos:
      CHECK(photo.id in orig_sizes_dict)
      width, height = orig_sizes_dict[photo.id]
      try:
        crop_rect = crop.CropRect(width, height, crop_fraction)
        for region in photo.regions:
          bb1_x, bb1_y = region.x1, region.y1
          bb1_w, bb1_h = region.x2 - region.x1, region.y2 - region.y1
          bb2_x, bb2_y, bb2_w, bb2_h = crop_rect.ApplyCropToRect(bb1_x, bb1_y, bb1_w, bb1_h)
          region.x1 = bb2_x
          region.y1 = bb2_y
          region.x2 = bb2_x + bb2_w
          region.y2 = bb2_y + bb2_h
      except ValueError:
        print 'crop failed, not adjusting bb'
    # write adjusted proto to output
    writer.Add(k, tide_object.SerializeToString())
    progress.update(i)
  writer.Close()
  return
def main():
  dataset_root_uri = 'local://media/vol-0449ca74/itergraph/'
  input_dataset_name = 'tide_v08'
  output_dataset_name = 'tide_v08_distractors'
  distractor_images_uri = 'local://media/vol-0449ca74/oxc1_100k/photoid_to_image.pert'
  input_tide_uri = '%s/%s/objectid_to_object.pert' % (dataset_root_uri, input_dataset_name)
  output_tide_uri = '%s/%s/objectid_to_object.pert' % (dataset_root_uri, output_dataset_name)
  input_images_uri = '%s/%s/photoid_to_image.pert' % (dataset_root_uri, input_dataset_name)
  output_images_uri = '%s/%s/photoid_to_image.pert' % (dataset_root_uri, output_dataset_name)
  # merge images
  CHECK(py_pert.MergeTables([input_images_uri, distractor_images_uri], output_images_uri))
  objectid_to_object = LoadTide(input_tide_uri)
  distractors = tide.ImageSequenceReader(distractor_images_uri)
  num_distractors = distractors.GetNumImages()
  num_objects = len(objectid_to_object)
  num_distractors_per_object = int(float(num_distractors) / num_objects)
  # augment each object with an equal share of distractor images
  progress = iwutil.MakeProgressBar(num_distractors)
  count = 0
  for object_id, obj in objectid_to_object.iteritems():
    for i in range(num_distractors_per_object):
      image_id, jpeg = distractors.GetNextImage()
      new_photo = obj.photos.add()
      new_photo.id = image_id
      new_photo.label = tide_pb2.NEGATIVE
      count += 1
      progress.update(count)
  # save the tide pert in the output directory
  SaveTide(objectid_to_object, output_tide_uri)
  return
def main():
  dataset_name = 'tide_v08'
  sizes = {}
  sizes['thumbnail'] = 100 * 100
  sizes['small'] = 640 * 480
  reset_bucket = False
  #dataset_base_uri = 'local://home/ubuntu/Desktop/vol-0449ca74/itergraph/%s/' % (dataset_name)
  #images_uri = '%s/cropped_scaled_photoid_to_image.pert' % (dataset_base_uri)
  images_uri = 'local://home/ubuntu/Desktop/vol-0449ca74/itergraph/tide_v14/cropped_scaled_photoid_to_image_randomaccess.pert'
  bucket_name = 'tide_image_cache'
  s3 = boto.connect_s3()
  bucket = s3.create_bucket(bucket_name)
  if reset_bucket:
    LOG(INFO, 'listing contents of bucket...')
    all_keys = [key.name for key in bucket.list()]
    LOG(INFO, 'deleting contents of bucket...')
    bucket.delete_keys(all_keys)
    s3.delete_bucket(bucket_name)
    bucket = s3.create_bucket(bucket_name)
  bucket.set_acl('public-read')
  reader = py_pert.StringTableReader()
  CHECK(reader.Open(images_uri))
  progress = iwutil.MakeProgressBar(reader.Entries())
  num_workers = 200
  max_queue_size = 200
  job_queue = JobQueue(num_workers, max_queue_size)
  for i, (key, value) in enumerate(reader):
    image_id = py_base.KeyToUint64(key)
    jpeg_image = iw_pb2.JpegImage()
    jpeg_image.ParseFromString(value)
    job_queue.AddJob(ResizeAndUploadImageJob(bucket, sizes, image_id, jpeg_image.data))
    progress.update(i)
  job_queue.WaitForJobsDone()
  return
def ExportImages(self):
  image_size_cache_filename = '%s/images/size_cache.pickle' % self.output_path
  if os.path.exists(image_size_cache_filename):
    self.imageid_to_size = iwutil.LoadObject(image_size_cache_filename)
    return
  base_path = '%s/images/' % (self.output_path)
  os.mkdir(base_path)
  LOG(INFO, 'exporting images...')
  reader = py_pert.StringTableReader()
  CHECK(reader.Open(self.images_uri))
  jpeg_image = iw_pb2.JpegImage()
  progress = iwutil.MakeProgressBar(reader.Entries())
  for i, (k, v) in enumerate(reader):
    image_id = py_base.KeyToUint64(k)
    jpeg_image.ParseFromString(v)
    filename = '%s/%s.jpg' % (base_path, JSPad(image_id))
    f = open(filename, 'wb')
    f.write(jpeg_image.data)
    f.close()
    self.imageid_to_size[image_id] = (jpeg_image.width, jpeg_image.height)
    progress.update(i)
  iwutil.SaveObject(self.imageid_to_size, image_size_cache_filename)
  return
def Run(self):
  bow_uri = self.GetInput('bow').GetUri()
  reader = py_pert.StringTableReader()
  CHECK(reader.Open(bow_uri))
  visual_vocab_size = self.cbir_bow_params.visual_vocab_size
  num_docs = reader.Entries()
  index = None
  if self.cbir_bow_params.implementation == 'inria':
    index = py_inria.InriaIndex()
  elif self.cbir_bow_params.implementation == 'ctis':
    index = py_ctis.CtisIndex()
    index.StartCreate(visual_vocab_size, num_docs)
  else:
    LOG(FATAL, 'unexpected implementation: %s' % (self.cbir_bow_params.implementation))
  #vv_uri = self.GetInput('visual_vocab').GetUri()
  temp_ivf_filepath = tempfile.mkdtemp()
  bag_of_words = bow_pb2.BagOfWords()
  progress = iwutil.MakeProgressBar(reader.Entries())
  for i, (key, value) in enumerate(reader):
    image_id = iwutil.KeyToUint64(key)
    bag_of_words.ParseFromString(value)
    index.Add(image_id, bag_of_words)
    progress.update(i)
  index.Save(temp_ivf_filepath)
  py_pert.Remove(self.index_base_uri)
  mr.CopyUri('local://' + temp_ivf_filepath, self.index_base_uri)
  CHECK(py_pert.Exists(self.index_base_uri + '/index.ivf'))
  CHECK(py_pert.Exists(self.index_base_uri + '/index.ivfids'))
  shutil.rmtree(temp_ivf_filepath, ignore_errors=True)
  return True
def main():
  # images_uri = 'local://media/vol-0449ca74/itergraph/tide_v14/cropped_scaled_photoid_to_image_randomaccess.pert'
  # tide_uri = 'local://media/vol-0449ca74/itergraph/tide_v14/objectid_to_object.pert'
  # distractor_images_uri = 'local://media/vol-0449ca74/oxc1_100k/photoid_to_image.pert'
  # output_base_uri = 'local://media/vol-0449ca74/itergraph/tide_v14_mixed_v2/'

  # images_uri = 'local://media/vol-0449ca74/itergraph/tide_v16/photoid_to_image.pert'
  # tide_uri = 'local://media/vol-0449ca74/itergraph/tide_v16/objectid_to_object.pert'
  # distractor_images_uri = 'local://media/vol-0449ca74/oxc1_100k/photoid_to_image.pert'
  # output_base_uri = 'local://media/vol-0449ca74/itergraph/tide_v16_mixed/'

  images_uri = 'local://media/vol-0449ca74/itergraph/tide_v18/photoid_to_image.pert'
  tide_uri = 'local://media/vol-0449ca74/itergraph/tide_v18/objectid_to_object.pert'
  distractor_images_uri = 'local://media/vol-0449ca74/oxc1_100k/photoid_to_image.pert'
  output_base_uri = 'local://media/vol-0449ca74/itergraph/tide_v18_mixed/'
  output_tide_uri = '%s/objectid_to_object.pert' % (output_base_uri)
  output_images_uri = '%s/photoid_to_image.pert' % (output_base_uri)
  image_loader = visutil.BatchImageLoader(images_uri)
  tide_objects = OpenTideDataset(tide_uri)
  distractor_image_loader = DistractorImageLoader(distractor_images_uri)
  CHECK_EQ(len(tide_objects), 2)
  object_a = tide_objects[0]
  object_b = tide_objects[1]
  new_object_a = InitNewObject(object_a)
  new_object_b = InitNewObject(object_b)
  a_none_image_ids = [photo.id for photo in object_a.photos if photo.label == tide_pb2.NONE]
  b_none_image_ids = [photo.id for photo in object_b.photos if photo.label == tide_pb2.NONE]
  mixed_aux_images = {}
  for i, (imageid_a, imageid_b) in enumerate(zip(a_none_image_ids, b_none_image_ids)):
    mixed_aux_images[imageid_a] = (imageid_a, imageid_b)
  mixed_image_ids = mixed_aux_images.keys()
  # split the mixed ids: first half gets NONE labels in object a, second half in object b
  n = int(len(mixed_image_ids) / 2.0)
  InitNoneLabels(new_object_a, mixed_image_ids[0:n])
  InitNoneLabels(new_object_b, mixed_image_ids[n:])
  new_objects = [new_object_a, new_object_b]
  image_ids = []
  for obj in new_objects:
    for photo in obj.photos:
      image_ids.append(photo.id)
  image_ids.sort()
  # write new tide pert
  tide_writer = py_pert.ProtoTableWriter()
  CHECK(tide_writer.Open(tide_pb2.Object(), output_tide_uri, 1))
  for obj in new_objects:
    tide_writer.Add(iwutil.Uint64ToKey(obj.id), obj.SerializeToString())
  tide_writer.Close()
  # write new image pert
  image_writer = py_pert.ProtoTableWriter()
  CHECK(image_writer.Open(iw_pb2.JpegImage(), output_images_uri, 1))
  used_image_ids = set()
  progress = iwutil.MakeProgressBar(len(image_ids))
  for i, image_id in enumerate(image_ids):
    jpeg = None
    if image_id in mixed_aux_images:
      imageid_a, imageid_b = mixed_aux_images[image_id]
      jpeg_a = image_loader.GetImage(imageid_a)
      jpeg_b = image_loader.GetImage(imageid_b)
      if jpeg_a is None or jpeg_b is None:
        LOG(INFO, 'skipping missing jpeg')
        continue
      jpeg = CreateMixedJpeg(jpeg_a, jpeg_b)
    else:
      # pair the image with the next landscape-orientation distractor
      distractor = None
      while True:
        distractor = distractor_image_loader.GetNextImage()
        if distractor.width > distractor.height:
          break
      CHECK(distractor)
      jpeg = CreateMixedJpeg(image_loader.GetImage(image_id), distractor)
    CHECK(image_id not in used_image_ids)
    CHECK(jpeg)
    image_writer.Add(iwutil.Uint64ToKey(image_id), jpeg.SerializeToString())
    used_image_ids.add(image_id)
    progress.update(i)
  image_writer.Close()
  return
def Run(self):
  LOG(INFO, 'waiting to let running processes give up memory... I need a lot and may not get enough if we rush things...')
  time.sleep(30)
  itergraph_state = LoadObjectFromUri(self.GetInput('prev_state').GetUri())
  reader = py_pert.StringTableReader()
  CHECK(reader.Open(self.GetInput('candidates').GetUri()))
  self.match_groups = {}
  num_selected_candidates = 0
  pbar = iwutil.MakeProgressBar(self.max_candidates_per_phase)
  num_edges_skipped_max_degree_constraint = 0
  num_edges_skipped_max_replication_constraint = 0
  prev_score = -float('inf')
  for ordering_key, candidate_pair_data in reader:
    image_a_id, image_b_id = iwutil.ParseUint64KeyPair(candidate_pair_data)
    if itergraph_state.PreviouslyAttempted(image_a_id, image_b_id):
      continue
    # check precondition: candidates pert is sorted (increasing by rank or by negative cbir score)
    score = iwutil.KeyToDouble(ordering_key)
    CHECK_GE(score, prev_score)
    prev_score = score
    if image_a_id not in self.match_groups:
      self.match_groups[image_a_id] = []
    match_group_size = len(self.match_groups[image_a_id])
    if match_group_size < self.max_batch_size:
      # test vertex degree condition
      degree_a = itergraph_state.GetDegree(image_a_id)
      degree_b = itergraph_state.GetDegree(image_b_id)
      # version 1: skip candidate edge if either of the vertices has many edges
      #if degree_a < self.max_vertex_degree and degree_b < self.max_vertex_degree:
      # version 2: skip candidate edge only if both of the vertices have many edges
      if degree_a < self.max_vertex_degree or degree_b < self.max_vertex_degree:
        # test max replication condition
        num_replications = self._GetNumReplications(image_b_id)
        if num_replications < self.max_replication_factor:
          self._IncrementReplications(image_b_id)
          self.match_groups[image_a_id].append(image_b_id)
          num_selected_candidates += 1
          pbar.update(num_selected_candidates)
        else:
          num_edges_skipped_max_replication_constraint += 1
      else:
        num_edges_skipped_max_degree_constraint += 1
    if num_selected_candidates >= self.max_candidates_per_phase:
      break
  pbar.finish()
  print ''
  print 'num_edges_skipped_max_replication_constraint: %d' % (num_edges_skipped_max_replication_constraint)
  print 'num_edges_skipped_max_degree_constraint: %d' % (num_edges_skipped_max_degree_constraint)
  print ''
  # write out the match plan (must be sorted by key for future join stage)
  metadata_entries = []
  for batch_id, (batch_primary_image, batch_image_ids) in enumerate(self.match_groups.iteritems()):
    if len(batch_image_ids) == 0:
      continue
    batch_name = iwutil.Uint64ToKey(batch_id)
    CHECK(batch_name)
    CHECK(len(batch_name))
    match_batch_metadata = iw_pb2.MatchBatchMetadata()
    match_batch_metadata.image_id = batch_primary_image
    match_batch_metadata.batch_name = batch_name
    match_batch_metadata.is_primary = True
    metadata_entries.append(match_batch_metadata)
    for image_id in batch_image_ids:
      next_metadata = iw_pb2.MatchBatchMetadata()
      next_metadata.image_id = image_id
      next_metadata.batch_name = batch_name
      next_metadata.is_primary = False
      metadata_entries.append(next_metadata)
  # image_id will be the key of the output, so sort by image_id
  metadata_entries.sort(key=lambda m: m.image_id)
  match_batches_uri = self.GetOutput('sorted_match_batches').GetUri()
  # TODO(heathkh): "closing" doesn't flush to disk... this is a bug!
  # match_plan_writer = py_pert.ProtoTableWriter()
  # num_shards_features = py_pert.GetNumShards(self.features.GetUri())
  # CHECK(match_plan_writer.Open(iw_pb2.MatchBatchMetadata(), match_batches_uri, num_shards_features))
  # for metadata in metadata_entries:
  #   CHECK(metadata.IsInitialized())
  #   key = iwutil.Uint64ToKey(metadata.image_id)
  #   CHECK(match_plan_writer.Add(key, metadata.SerializeToString()))
  # match_plan_writer.Close()
  # TODO(kheath): Work around for above bug is to run a MR stage to reshard
  tmp_match_batches_uri = self.GetOutput('sorted_match_batches').GetUri() + '_to_be_sharded'
  match_plan_writer = py_pert.ProtoTableWriter()
  num_shards_features = py_pert.GetNumShards(self.features.GetUri())
  CHECK(match_plan_writer.Open(iw_pb2.MatchBatchMetadata(), tmp_match_batches_uri, 1))
  for metadata in metadata_entries:
    CHECK(metadata.IsInitialized())
    CHECK(match_plan_writer.Add(iwutil.Uint64ToKey(metadata.image_id), metadata.SerializeToString()))
  match_plan_writer.Close()
  # manually reshard
  pertedit_bin = 'pertedit'
  cmd = '%s --input %s --output %s --new_block_size_mb=10 --num_output_shards=%d' % (
      pertedit_bin, tmp_match_batches_uri, match_batches_uri, num_shards_features)
  print cmd
  CHECK_EQ(ExecuteCmd(cmd), 0)
  CHECK(py_pert.Exists(match_batches_uri))
  ok, fp = py_pert.GetShardSetFingerprint(match_batches_uri)
  CHECK(ok)
  CHECK_EQ(len(fp), 32)
  CHECK_NE(fp, 'd41d8cd98f00b204e9800998ecf8427e', 'invalid hash of empty string')
  return
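# Self-contained toy version of the greedy candidate selection in Run above,
# reduced to the two capacity constraints (batch size and replication factor);
# all names here are illustrative, not part of the real pipeline.
def _ToySelectCandidates(candidates, max_batch_size, max_replication_factor):
  match_groups = {}   # primary image id -> list of partner image ids
  replications = {}   # image id -> times already used as a partner
  for image_a, image_b in candidates:  # assumed sorted best-candidate-first
    group = match_groups.setdefault(image_a, [])
    if len(group) >= max_batch_size:
      continue
    if replications.get(image_b, 0) >= max_replication_factor:
      continue
    replications[image_b] = replications.get(image_b, 0) + 1
    group.append(image_b)
  return match_groups

# With max_replication_factor=1, image 30 is granted only to the first group
# that requests it:
assert _ToySelectCandidates([(1, 30), (2, 30), (1, 31)], 2, 1) == {1: [30, 31], 2: []}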
def Run(self):
  itergraph_state = LoadObjectFromUri(self.GetInput('prev_state').GetUri())
  if True:  # hack: put this block in its own scope to force release of memory resources
    reader = py_pert.StringTableReader()
    CHECK(reader.Open(self.GetInput('match_results').GetUri()))
    match_result = iw_pb2.GeometricMatchResult()
    num_entries = reader.Entries()
    if num_entries:
      pbar = iwutil.MakeProgressBar(num_entries)
      for i, (k, v) in enumerate(reader):
        pbar.update(i)
        match_result.ParseFromString(v)
        success = False
        for match in match_result.matches:
          if match.nfa < -20:
            success = True
        if success:
          itergraph_state.AddSuccesfulEdge(match_result.image_a_id,
                                           match_result.image_b_id,
                                           match_result.properties.score,
                                           self.phase)
        else:
          itergraph_state.AddFailedEdge(match_result.image_a_id,
                                        match_result.image_b_id)
  print 'edges: %d' % (len(itergraph_state.edges))
  SaveObjectToUri(itergraph_state, self.GetOutput('itergraph_state').GetUri())
  return


#def EvalImageGraph(itergraph_state_uri, tide_uri):
#  # compute edge stats
#  num_within_cluster, num_cross_cluster = CountCorrectIncorrectEdges(itergraph_state_uri, tide_uri)
#
#  # compute label prop performance
#  itergraph_state = LoadObjectFromUri(itergraph_state_uri)
#  eval_graph_uri = 'local://tmp/eval3_graph.pert'
#  itergraph_state.SaveAsEval3Graph(eval_graph_uri)
#
#  num_training_images = 100
#  num_trials = 4
#  evaluation = py_eval3.EvaluationRunner(eval_graph_uri, tide_uri, num_training_images, num_trials)
#  ok, result = evaluation.Run()
#  CHECK(ok)
#
#  return num_within_cluster, num_cross_cluster, result


#class EvalImageGraphFlow(core.Flow):
#  def __init__(self, base_uri, itergraph_state, tide):
#    super(EvalImageGraphFlow, self).__init__()
#    self.base_uri = base_uri
#    self.AddInput('itergraph_state', itergraph_state)
#    self.AddInput('tide', tide)
#    self.AddOutput('eval', core.FileResource(self, '%s/eval.txt' % (base_uri)))
#    return
#
#  def Run(self):
#    # compute edge stats
#    itergraph_state_uri = self.GetInput('itergraph_state').GetUri()
#    tide_uri = self.GetInput('tide').GetUri()
#    num_within_cluster, num_cross_cluster, result = EvalImageGraph(itergraph_state_uri, tide_uri)
#    lines = []
#    lines.append('num_within_cluster: %d' % (num_within_cluster))
#    lines.append('num_cross_cluster: %d' % (num_cross_cluster))
#    lines.append('fraction within cluster: %f' % (float(num_within_cluster) / (num_within_cluster + num_cross_cluster)))
#    lines.append(str(result))
#    report = '\n'.join(lines)
#    print report
#    path = mr.UriToNfsPath(self.GetOutput('eval').GetUri())
#    f = open(path, 'w')
#    f.write(report)
#    return


#class BasicMatchBatchPlanningFlow(core.Flow):
#  def __init__(self, base_uri, candidates, max_batch_size, max_replication_factor, num_shards_features):
#    super(BasicMatchBatchPlanningFlow, self).__init__()
#    self.AddInput('candidates', candidates)
#    self.AddOutput('sorted_match_batches', core.PertResource(self, '%s/sorted_match_batches.pert' % (base_uri)))
#    self.max_batch_size = max_batch_size
#    self.max_replication_factor = max_replication_factor
#    self.num_shards_features = num_shards_features
#    self.match_groups = {}
#    self.num_replications = {}
#    return
#
#  def _GetNumReplications(self, image_id):
#    replications = 0
#    if image_id in self.num_replications:
#      replications = self.num_replications[image_id]
#    return replications
#
#  def _IncrementReplications(self, image_id):
#    if image_id not in self.num_replications:
#      self.num_replications[image_id] = 0
#    self.num_replications[image_id] += 1
#    return
#
#  def Run(self):
#    reader = py_pert.StringTableReader()
#    reader.Open(self.GetInput('candidates').GetUri())
#    self.match_groups = {}
#    num_selected_candidates = 0
#    prev_score = -1e6
#    widgets = [Percentage(), ' ', Bar(), ' ', ETA()]
#    pbar = ProgressBar(widgets=widgets, maxval=reader.Entries()).start()
#    num_edges_skipped_max_replication_constraint = 0
#    for i, (k, v) in enumerate(reader):
#      image_a_id, image_b_id = ParseUint64KeyPair(v)
#      # check precondition... pert is sorted by scores
#      score = iwutil.KeyToDouble(k)
#      CHECK_GE(score, prev_score)
#      prev_score = score
#      if image_a_id not in self.match_groups:
#        self.match_groups[image_a_id] = []
#      match_group_size = len(self.match_groups[image_a_id])
#      if match_group_size < self.max_batch_size:
#        # test max replication condition
#        num_replications = self._GetNumReplications(image_b_id)
#        if num_replications < self.max_replication_factor:
#          self._IncrementReplications(image_b_id)
#          self.match_groups[image_a_id].append(image_b_id)
#          num_selected_candidates += 1
#          pbar.update(num_selected_candidates)
#        else:
#          num_edges_skipped_max_replication_constraint += 1
#    print 'num_edges_skipped_max_replication_constraint: %d' % (num_edges_skipped_max_replication_constraint)
#
#    # write out the match plan (must be sorted by key for future join stage)
#    metadata_entries = []
#    for batch_id, (batch_primary_image, batch_image_ids) in enumerate(self.match_groups.iteritems()):
#      if not batch_image_ids:
#        continue
#      batch_name = iwutil.Uint64ToKey(batch_id)
#      match_batch_metadata = iw_pb2.MatchBatchMetadata()
#      match_batch_metadata.image_id = batch_primary_image
#      match_batch_metadata.batch_name = batch_name
#      match_batch_metadata.is_primary = True
#      metadata_entries.append(match_batch_metadata)
#      for image_id in batch_image_ids:
#        next_metadata = iw_pb2.MatchBatchMetadata()
#        next_metadata.image_id = image_id
#        next_metadata.batch_name = batch_name
#        next_metadata.is_primary = False
#        metadata_entries.append(next_metadata)
#    # image_id will be the key of output (since we are about to join by image_id), so need to sort by image_id
#    metadata_entries.sort(key=lambda m: iwutil.Uint64ToKey(m.image_id))
#    match_plan_writer = py_pert.ProtoTableWriter()
#    uri = self.GetOutput('sorted_match_batches').GetUri()
#    # to do join with features, must be sharded same way as features
#    CHECK(match_plan_writer.Open(iw_pb2.MatchBatchMetadata(), uri, self.num_shards_features), 'failed to open %s' % (uri))
#    for metadata in metadata_entries:
#      match_plan_writer.Add(iwutil.Uint64ToKey(metadata.image_id), metadata.SerializeToString())
#    match_plan_writer.Close()
#    return


#def EnsureParentPathExists(f):
#  d = os.path.dirname(f)
#  if not os.path.exists(d):
#    os.makedirs(d)
#  return


#def ParseUint64KeyPair(key):
#  CHECK_EQ(len(key), 16)
#  id_a = iwutil.KeyToUint64(key[0:8])
#  id_b = iwutil.KeyToUint64(key[8:16])
#  return id_a, id_b


#class TideObject(object):
#  def __init__(self):
#    self.id = None
#    self.name = None
#    self.image_ids = []
#    return
#
#  def LoadFromProto(self, id, tide_object_proto):
#    self.id = id
#    self.name = tide_object_proto.name
#    for photo in tide_object_proto.photos:
#      self.image_ids.append(photo.id)
#    return


#class TideDataset():
#  def __init__(self, tide_uri):
#    self.tideid_to_tideobject = {}
#    self.imageid_to_objectid = {}
#    # load list of images that belong to each tide object
#    tide_reader = py_pert.StringTableReader()
#    tide_reader.Open(tide_uri)
#    for index, (k, v) in enumerate(tide_reader):
#      tide_object = tide_pb2.Object()
#      tide_object.ParseFromString(v)
#      obj = TideObject()
#      obj.LoadFromProto(index, tide_object)
#      self.tideid_to_tideobject[obj.id] = obj
#    for tideid, tideobject in self.tideid_to_tideobject.iteritems():
#      object_id = tideobject.id
#      for image_id in tideobject.image_ids:
#        self.imageid_to_objectid[image_id] = object_id
#    return
#
#  def KnownImages(self, image_a_id, image_b_id):
#    return image_a_id in self.imageid_to_objectid and image_b_id in self.imageid_to_objectid
#
#  def EdgeWithinCluster(self, image_a_id, image_b_id):
#    CHECK(self.KnownImages(image_a_id, image_b_id))
#    object_a = self.imageid_to_objectid[image_a_id]
#    object_b = self.imageid_to_objectid[image_b_id]
#    return object_a == object_b


#def CountCorrectIncorrectEdges(itergraph_state_uri, tide_uri):
#  itergraph_state = LoadObjectFromUri(itergraph_state_uri)
#  tide = TideDataset(tide_uri)
#  num_within_cluster = 0
#  num_cross_cluster = 0
#  object_to_num_within_cluster = {}
#  for tide_id in tide.tideid_to_tideobject.iterkeys():
#    object_to_num_within_cluster[tide_id] = 0
#  for edge in itergraph_state.edges:
#    if tide.EdgeWithinCluster(edge.image_a_id, edge.image_b_id):
#      num_within_cluster += 1
#      object_to_num_within_cluster[tide.imageid_to_objectid[edge.image_a_id]] += 1
#    else:
#      num_cross_cluster += 1
#  for tide_id, num in object_to_num_within_cluster.iteritems():
#    print '%s: %d' % (tide.tideid_to_tideobject[tide_id].name, num)
#  return num_within_cluster, num_cross_cluster


#def GetPhaseBaseUri(base_uri, phase):
#  return base_uri + '/phase%03d/' % (phase)