Example #1
def LoadImageRegionGraph(uri):
    irg = iw_pb2.ImageRegionGraph()
    reader = py_pert.StringTableReader()
    CHECK(reader.Open(uri))
    ok, tmp = reader.GetMetadata("num_edges")
    num_edges = long(tmp)
    ok, tmp = reader.GetMetadata("num_vertices")
    CHECK(ok, "this doesn't appear to be a irg uri: %s" % (uri))
    num_vertices = long(tmp)
    CHECK_EQ(reader.Entries(), num_edges + num_vertices)

    progress = iwutil.MakeProgressBar(num_edges)
    # load edges
    for i in range(num_edges):
        ok, key, value = reader.Next()
        CHECK(ok)
        CHECK_EQ(key[0], 'e')
        irg.edge.add().ParseFromString(value)
        progress.update(i)

    # load vertices
    progress = iwutil.MakeProgressBar(num_vertices)
    for i in range(num_vertices):
        ok, key, value = reader.Next()
        CHECK(ok)
        CHECK_EQ(key[0], 'v')
        irg.vertex.add().ParseFromString(value)
        progress.update(i)

    return irg
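
Every example on this page drives a progress bar built by iwutil.MakeProgressBar(total) and advanced with update(i) (and occasionally finish()). A minimal sketch of such a helper, assuming the progressbar package; the real iwutil implementation may differ:

def MakeProgressBar(max_value):
    # Hypothetical sketch: wrap the progressbar package's ProgressBar with
    # Percentage/Bar/ETA widgets and return it already started.
    from progressbar import Bar, ETA, Percentage, ProgressBar
    widgets = [Percentage(), ' ', Bar(), ' ', ETA()]
    return ProgressBar(widgets=widgets, maxval=max_value).start()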
Example #2
    def __ImportImageArchive(self, extract_dir, dataset_root):
        image_filenames = []
        print 'Searching for image files...'
        for root, dirnames, filenames in os.walk(extract_dir):
            for filename in filenames:
                if fnmatch.fnmatch(filename, '*.jpg') or fnmatch.fnmatch(
                        filename, '*.jpeg'):
                    image_filenames.append(os.path.join(root, filename))

        if len(image_filenames) <= 2:
            return False

        fingerprinted_path = '%s/fingerprinted/' % (extract_dir)
        os.mkdir(fingerprinted_path)

        print 'Fingerprinting image files...'
        progress = util.MakeProgressBar(len(image_filenames))
        # rename all files according to fingerprint
        for i, filename in enumerate(image_filenames):
            data = open(filename, 'rb').read()  # images are binary
            fp = py_base.FingerprintString(data)
            dst = '%s/%064d.jpg' % (
                fingerprinted_path, fp
            )  # ensure lexical sort = numeric sort = key sort
            os.rename(filename, dst)
            progress.update(i)

        filenames = glob.glob('%s/*.jpg' % fingerprinted_path)
        filenames.sort()
        output_uri = 'local://%s/photoid_to_image.pert' % (dataset_root)

        # write to pert in sorted order
        print 'Generating image PERT file...'
        writer = py_pert.StringTableWriter()
        CHECK(writer.Open(output_uri, 1))
        progress = util.MakeProgressBar(len(filenames))
        for i, filename in enumerate(filenames):
        data = open(filename, 'rb').read()  # images are binary
            key = py_base.Uint64ToKey(py_base.FingerprintString(data))
            try:
                im = Image.open(StringIO.StringIO(data))
            except IOError as e:
                LOG(INFO, 'Error opening %s - %s' % (filename, e))
                continue
            width, height = im.size
            jpeg = iw_pb2.JpegImage()
            jpeg.data = data
            jpeg.width = width
            jpeg.height = height
            CHECK(jpeg.IsInitialized())
            writer.Add(key, jpeg.SerializeToString())
            progress.update(i)
        writer.Close()
        return True
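
The rename step above pads fingerprints to a fixed width so that lexical filename order matches numeric fingerprint order, which is what lets the PERT be written "in sorted order" by key, as the comments note. A quick illustration of that invariant with hypothetical fingerprint values:

# Hypothetical fingerprints: fixed-width zero padding makes lexical sort
# order agree with numeric sort order, as the comments above require.
fps = [987654, 42, 31337]
names = ['%064d.jpg' % fp for fp in fps]
assert sorted(names) == ['%064d.jpg' % fp for fp in sorted(fps)]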
Example #3
File: exporter.py Project: heathkh/iwct
 def ExportGraphJson(self):
   base_path = '%s/graph/' % self.output_path
   if os.path.exists(base_path):
     return
   LOG(INFO, 'exporting graph json...')
   os.mkdir(base_path)
   g = self.image_graph.graph
   progress = iwutil.MakeProgressBar(g.vcount())
   for i, vertex in enumerate(g.vs):
     vertex_index = vertex.index
     image_id = vertex['image_id']
     
     
     data = {'image_id': JSPad(image_id)}  # use str because js can't handle 64-bit ints
     neighbors = [JSPad(n['image_id']) for n in vertex.neighbors()]
     weights = [g.es[g.get_eid(vertex_index, n.index)]['weight_as_similarity']
                for n in vertex.neighbors()]
     
     data['neighbors'] = neighbors
     
     tide_label = None
     try:
       tide_label = self.image_graph.tide.imageid_to_label[image_id]
     except (KeyError, AttributeError):  # unlabeled image or missing tide dataset
       pass
     if tide_label:
       data['tide_object_id'] = tide_label.object_id
       data['tide_object_label'] = tide_label.label
               
     data['weights'] = weights      
     filename = '%s/%s.json' % (base_path, JSPad(image_id))
     open(filename, 'w').write(json.dumps(data))
     progress.update(i)
   return
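
For reference, the dict serialized for each vertex looks roughly like this (values are hypothetical; JSPad presumably zero-pads ids into strings because JavaScript cannot represent 64-bit ints exactly):

# Hypothetical shape of one exported vertex record.
data = {
    'image_id': '000000000001234',
    'neighbors': ['000000000005678', '000000000009012'],
    'weights': [0.42, 0.17],
    'tide_object_id': 3,         # only present for labeled images
    'tide_object_label': 'pos',  # only present for labeled images
}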
Example #4
File: exporter.py Project: heathkh/iwct
 def ExportMatchesJson(self):
   base_path = '%s/matches/' % (self.output_path)
   if os.path.exists(base_path):
     return
   LOG(INFO, 'exporting match json...')
   os.mkdir(base_path)
   reader = py_pert.StringTableReader()
   CHECK(reader.Open(self.matches_uri))    
   progress = iwutil.MakeProgressBar(reader.Entries())
   match_result = iw_pb2.GeometricMatchResult()
   for i, (k,v) in enumerate(reader):
     image_a_id, image_b_id = iwutil.ParseUint64KeyPair(k)      
     match_result.ParseFromString(v)
     if not match_result.matches:
       continue
     filename = '%s/%s_%s.json' % (base_path, JSPad(image_a_id), JSPad(image_b_id))
     f = open(filename, 'w')
     data = {}
     
     CHECK_LT(image_a_id, image_b_id)
     
     data['image_a'] = JSPad(image_a_id) # use string because js can't handle 64bit int
     data['image_b'] = JSPad(image_b_id) # use string because js can't handle 64bit int
     data['image_a_size'] = self.imageid_to_size[image_a_id]
     data['image_b_size'] = self.imageid_to_size[image_b_id]
     
     matches = []
     for match in match_result.matches:
       for c in match.correspondences:
         match_info = [c.a.pos.x, c.a.pos.y, c.a.radius, c.b.pos.x, c.b.pos.y, c.b.radius]
         matches.append(match_info)
     data['matches'] = matches    
      f.write(json.dumps(data))
      f.close()
     progress.update(i)           
   return
Example #5
    def __init__(self, param_data_uri):
        self.num_configs = None
        self.rows = []
        self.configs_per_page = 10
        self.num_pages = None

        self.exclude_rows = [
            'corresondence_filter', 'type', 'dataset_name',
            'visual_vocabulary_uri', 'max_image_replication_factor',
            'max_match_batch_size', 'max_vertex_degree', 'dataset name'
        ]
        reader = py_pert.StringTableReader()
        CHECK(reader.Open(param_data_uri))
        configs = []
        progress = iwutil.MakeProgressBar(reader.Entries())
        result = lpbench_pb2.ConfigurationResult()
        for i, (k, v) in enumerate(reader):
            result.ParseFromString(v)
            config = itergraph_pb2.IterGraphParams()
            config.CopyFrom(result.config.iter_graph)
            configs.append(config)
            progress.update(i)
            #if i > 0:
            #  break

        self.__BuildTable(configs)
        return
Example #6
    def __init__(self, image_graph_uri, tide_uri):

        self.tide = None
        if tide_uri:
            #self.tide = tide.GetCachedTideDataset(tide_uri)
            self.tide = tide.TideDataset(tide_uri)

        ok, image_graph_data = py_imagegraph.LoadImageGraph(image_graph_uri)
        CHECK(ok)
        num_images = len(image_graph_data.vertices)

        # load proto data into an igraph graph object
        LOG(INFO, 'loading verts')
        g = igraph.Graph(num_images, directed=False)
        self.vertexid_to_imageid = [
            v.image_id for v in image_graph_data.vertices
        ]
        self.imageid_to_vertexid = {}
        for vertex_id, image_id in enumerate(self.vertexid_to_imageid):
            self.imageid_to_vertexid[image_id] = vertex_id

        for node_id, image_id in enumerate(self.vertexid_to_imageid):
            node = g.vs[node_id]
            node['image_id'] = image_id

            if self.tide:
                label_data = self.tide.GetLabel(image_id)
                if label_data is not None:
                    node['object_id'] = label_data.object_id
                    node['label'] = label_data.label

        num_edges = len(image_graph_data.edges)
        progress = iwutil.MakeProgressBar(num_edges)
        prev_edges = set()
        LOG(INFO, 'loading edges')
        edge_list = []
        edge_weight_values = []
        edge_nfa_values = []
        for edge_id, edge in enumerate(image_graph_data.edges):
            if (edge.src, edge.dst) in prev_edges:
                continue
            if edge.weight < 0.15:
                continue
            new_edge = (edge.src, edge.dst)
            prev_edges.add(new_edge)
            edge_list.append(new_edge)
            edge_weight_values.append(edge.weight)
            edge_nfa_values.append(edge.nfa)
            progress.update(edge_id)

        LOG(INFO, 'adding edges')
        g.add_edges(edge_list)

        LOG(INFO, 'adding edge properties')
        g.es["weight_as_similarity"] = edge_weight_values
        g.es["nfa"] = edge_nfa_values

        self.graph = g
        return
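
The edge loop above deliberately collects edge_list and the attribute values first and then makes a single g.add_edges() call; batching edge insertion this way is much faster in igraph than adding edges one at a time. A self-contained sketch of the pattern with hypothetical ids and weights:

import igraph

g = igraph.Graph(3, directed=False)        # three vertices, no edges yet
g.vs['image_id'] = [101, 102, 103]         # hypothetical image ids
g.add_edges([(0, 1), (1, 2)])              # one batched call
g.es['weight_as_similarity'] = [0.9, 0.3]  # parallel attribute list
print g.degree()                           # [1, 2, 1]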
Example #7
File: tide.py Project: heathkh/iwct
def ComputeCbirPerformanceStats(tide_uri, query_results_uri):
    tide = TideDataset(tide_uri)
    reader = py_pert.StringTableShardSetReader()
    CHECK(reader.Open(query_results_uri))
    results = cbir_pb2.QueryResults()
    imageid_to_ap = {}
    progress = iwutil.MakeProgressBar(reader.Entries())
    for i, (k, v) in enumerate(reader):
        progress.update(i)
        query_image_id = iwutil.KeyToUint64(k)
        query_label = tide.GetLabel(query_image_id)
        if query_label is None or query_label.object_id is None:
            LOG(FATAL, 'unknown class for image: %d' % (query_image_id))
        query_object_id = query_label.object_id
        results.ParseFromString(v)
        query_object_num_pos = tide.NumPosImages(query_object_id)
        prev_score = float("inf")
        sum_precision = 0.0
        num_correct = 0.0
        rank = 0.0
        non_junk_results = []
        num_junk = 0
        for result in results.entries:
            label = tide.GetLabel(result.image_id)
            if label and label.object_id == query_object_id and (
                    label.label == 'neg' or label.label == 'none'):
                num_junk += 1
            else:
                non_junk_results.append(result)

        for result in non_junk_results:
            rank += 1
            CHECK_LE(
                result.score,
                prev_score)  ## check invariant: ordered decreasing by score
            prev_score = result.score
            result_label = tide.GetLabel(result.image_id)
            if not result_label or result_label.object_id is None:
                LOG(FATAL, 'unknown class: %d' % (result.image_id))

            if result_label.object_id == query_object_id:
                num_correct += 1.0
                precision = num_correct / rank
                sum_precision += precision
        average_precision = sum_precision / query_object_num_pos
        imageid_to_ap[query_image_id] = average_precision

    # compute mean average precision (per object class and across all classes)
    values = list(imageid_to_ap.itervalues())
    mean_average_precision = np.mean(values)

    object_mean_average_precision = {}
    for tide_object in tide.objectid_to_object.itervalues():

        object_mean_average_precision[tide_object.name] = np.mean(
            [imageid_to_ap[i] for i in tide_object.GetImageIds()])

    return mean_average_precision, object_mean_average_precision
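
To make the average-precision loop concrete: with hypothetical relevance flags where the correct results sit at ranks 1 and 3 and the query class has 2 positives, AP = (1/1 + 2/3) / 2:

# Toy AP computation mirroring the loop above (hypothetical data).
ranked_relevance = [True, False, True]
num_pos = 2
num_correct, sum_precision = 0.0, 0.0
for rank, relevant in enumerate(ranked_relevance, 1):
    if relevant:
        num_correct += 1.0
        sum_precision += num_correct / rank
print sum_precision / num_pos  # 0.8333...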
Example #8
def GetImageSizes(uri):
    imageid_to_size = {}
    reader = py_pert.StringTableShardSetReader()
    CHECK(reader.Open(uri))
    jpeg_image = iw_pb2.JpegImage()
    progress = iwutil.MakeProgressBar(reader.Entries())
    for i, (k, v) in enumerate(reader):
        image_id = py_base.KeyToUint64(k)
        jpeg_image.ParseFromString(v)
        imageid_to_size[image_id] = (jpeg_image.width, jpeg_image.height)
        progress.update(i)
    return imageid_to_size
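
A hedged usage sketch (the URI is hypothetical):

# GetImageSizes returns {image_id: (width, height)}.
sizes = GetImageSizes('local://path/to/dataset/photoid_to_image.pert')
for image_id, (width, height) in sizes.iteritems():
    print image_id, width, height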
Example #9
def GetUriSplitInfo(uri):
    #TODO(heathkh): Can you make this faster by collecting max block size and entries without fully opening the reader?

    uris = py_pert.GetShardUris(uri)
    num_entries = 0
    max_block_size = 0
    progress = iwutil.MakeProgressBar(len(uris))
    for i, shard_uri in enumerate(uris):  # avoid shadowing the uri argument
        reader = py_pert.StringTableShardReader()
        CHECK(reader.Open(shard_uri))
        max_block_size = max(max_block_size, reader.MaxBlockSize())
        num_entries += reader.Entries()
        progress.update(i)
    return num_entries, max_block_size
Example #10
def CreateIndex(features_uri, index_uri):
  reader = py_pert.StringTableShardSetReader()
  CHECK(reader.Open(features_uri))
  features = iw_pb2.ImageFeatures()
  
  include_keypoints = True
  index = py_cbir.FeatureIndex(include_keypoints)
  progress = iwutil.MakeProgressBar(reader.Entries())  
  for i, (k,v) in enumerate(reader):    
    image_id = iwutil.KeyToUint64(k)
    features.ParseFromString(v)
    index.Add(image_id, features)      
    progress.update(i)
  progress.finish()    
  index.Build()  
  return index.Save(index_uri)
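
A usage sketch for CreateIndex (URIs hypothetical); the function returns whatever index.Save() reports, presumably a success flag:

# Build a CBIR feature index from a features PERT and save it.
ok = CreateIndex('local://path/to/dataset/features.pert',
                 'local://path/to/dataset/cbir_index')
CHECK(ok)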
Example #11
def main():
    crop_fraction = 0.05
    base_uri = 'local://home/ubuntu/Desktop/vol-7f209e0c/itergraph/tide_v08_distractors/'
    orig_image_uri = '%s/photoid_to_image.pert' % (base_uri)
    tide_uri = '%s/objectid_to_object.pert' % (base_uri)
    new_tide_uri = '%s/cropped_objectid_to_object.pert' % (base_uri)
    #cropped_image_uri = '%s/cropped_scaled_photoid_to_image.pert' % (base_uri)

    orig_sizes_dict = GetCachedImageSizes(orig_image_uri)
    reader = py_pert.StringTableReader()
    writer = py_pert.ProtoTableWriter()

    tide_object = tide_pb2.Object()

    CHECK(writer.Open(tide_object, new_tide_uri, 1))
    CHECK(reader.Open(tide_uri))
    progress = iwutil.MakeProgressBar(reader.Entries())
    for i, (k, v) in enumerate(reader):
        tide_object.ParseFromString(v)
        CHECK(tide_object.IsInitialized())

        # adjust the bb of all the photos
        for photo in tide_object.photos:
            CHECK(photo.id in orig_sizes_dict)
            width, height = orig_sizes_dict[photo.id]
            try:
                crop_rect = crop.CropRect(width, height, crop_fraction)
                for region in photo.regions:
                    bb1_x, bb1_y, bb1_w, bb1_h = region.x1, region.y1, region.x2 - region.x1, region.y2 - region.y1
                    bb2_x, bb2_y, bb2_w, bb2_h = crop_rect.ApplyCropToRect(
                        bb1_x, bb1_y, bb1_w, bb1_h)
                    region.x1 = bb2_x
                    region.y1 = bb2_y
                    region.x2 = bb2_x + bb2_w
                    region.y2 = bb2_y + bb2_h
            except ValueError:
                print 'crop failed, not adjusting bb'

        # write adjusted proto to output
        writer.Add(k, tide_object.SerializeToString())
        progress.update(i)

    writer.Close()
    return
Example #12
def main():
  
  dataset_root_uri = 'local://media/vol-0449ca74/itergraph/'
  input_dataset_name = 'tide_v08'
  output_dataset_name = 'tide_v08_distractors'
  distractor_images_uri = 'local://media/vol-0449ca74/oxc1_100k/photoid_to_image.pert'
  
  input_tide_uri = '%s/%s/objectid_to_object.pert' % (dataset_root_uri, input_dataset_name)
  output_tide_uri = '%s/%s/objectid_to_object.pert' % (dataset_root_uri, output_dataset_name)
  input_images_uri = '%s/%s/photoid_to_image.pert' % (dataset_root_uri, input_dataset_name)
  output_images_uri = '%s/%s/photoid_to_image.pert' % (dataset_root_uri, output_dataset_name)


  #merge images 
  CHECK(py_pert.MergeTables( [input_images_uri, distractor_images_uri], output_images_uri))
    
  objectid_to_object = LoadTide(input_tide_uri)
  distractors = tide.ImageSequenceReader(distractor_images_uri)  
  num_distractors = distractors.GetNumImages()
  num_objects = len(objectid_to_object) 
  num_distractors_per_object = int(float(num_distractors)/num_objects)
  
  # for each object
  progress = iwutil.MakeProgressBar(num_distractors)
  count = 0
  for object_id, obj in objectid_to_object.iteritems():  # 'obj', not builtin 'object'
    # augment object with distractor images
    for i in range(num_distractors_per_object):
      image_id, jpeg = distractors.GetNextImage()
      new_photo = obj.photos.add()
      new_photo.id = image_id
      new_photo.label = tide_pb2.NEGATIVE
      count += 1
      progress.update(count)
  
  # save tide pert in output directory
  SaveTide(objectid_to_object, output_tide_uri)

  return
Example #13
def main():
    dataset_name = 'tide_v08'
    sizes = {}
    sizes['thumbnail'] = 100 * 100
    sizes['small'] = 640 * 480
    reset_bucket = False

    #dataset_base_uri = 'local://home/ubuntu/Desktop/vol-0449ca74/itergraph/%s/' % (dataset_name)
    #images_uri = '%s/cropped_scaled_photoid_to_image.pert' % (dataset_base_uri)
    images_uri = 'local://home/ubuntu/Desktop/vol-0449ca74/itergraph/tide_v14/cropped_scaled_photoid_to_image_randomaccess.pert'
    bucket_name = 'tide_image_cache'
    s3 = boto.connect_s3()

    bucket = s3.create_bucket(bucket_name)
    if reset_bucket:
        LOG(INFO, 'listing contents of bucket...')
        all_keys = [key.name for key in bucket.list()]
        LOG(INFO, 'deleting contents of bucket...')
        bucket.delete_keys(all_keys)
        s3.delete_bucket(bucket_name)
        bucket = s3.create_bucket(bucket_name)
        bucket.set_acl('public-read')

    reader = py_pert.StringTableReader()
    CHECK(reader.Open(images_uri))
    progress = iwutil.MakeProgressBar(reader.Entries())

    num_workers = 200
    max_queue_size = 200
    job_queue = JobQueue(num_workers, max_queue_size)
    for i, (key, value) in enumerate(reader):
        image_id = py_base.KeyToUint64(key)
        jpeg_image = iw_pb2.JpegImage()
        jpeg_image.ParseFromString(value)
        job_queue.AddJob(
            ResizeAndUploadImageJob(bucket, sizes, image_id, jpeg_image.data))
        progress.update(i)

    job_queue.WaitForJobsDone()

    return
Example #14
File: exporter.py Project: heathkh/iwct
 def ExportImages(self):
   image_size_cache_filename = '%s/images/size_cache.pickle' % self.output_path
   if os.path.exists(image_size_cache_filename):
     self.imageid_to_size = iwutil.LoadObject(image_size_cache_filename)
     return
   base_path = '%s/images/' % (self.output_path)
   os.mkdir(base_path)
   LOG(INFO, 'exporting images...')
   reader = py_pert.StringTableReader()
   CHECK(reader.Open(self.images_uri))    
   jpeg_image = iw_pb2.JpegImage()
   progress = iwutil.MakeProgressBar(reader.Entries())
   for i, (k,v) in enumerate(reader):      
     image_id = py_base.KeyToUint64(k)
     jpeg_image.ParseFromString(v)      
     filename = '%s/%s.jpg' % (base_path, JSPad(image_id))
     f = open(filename, 'wb')
     f.write(jpeg_image.data)
     f.close()
     self.imageid_to_size[image_id] = (jpeg_image.width, jpeg_image.height)
     progress.update(i)          
   iwutil.SaveObject(self.imageid_to_size, image_size_cache_filename)    
   return
Example #15
File: util.py Project: heathkh/iwct
 def Run(self):             
   bow_uri = self.GetInput('bow').GetUri()
   reader = py_pert.StringTableReader()    
   CHECK(reader.Open(bow_uri))    
   visual_vocab_size = self.cbir_bow_params.visual_vocab_size
   num_docs = reader.Entries()
   index = None
   if self.cbir_bow_params.implementation == 'inria':
     index = py_inria.InriaIndex()
   elif self.cbir_bow_params.implementation == 'ctis':
     index = py_ctis.CtisIndex()
     index.StartCreate(visual_vocab_size, num_docs)
   else:
      LOG(FATAL, 'unexpected implementation: %s' % self.cbir_bow_params.implementation)
   
   #vv_uri = self.GetInput('visual_vocab').GetUri()
   temp_ivf_filepath = tempfile.mkdtemp()
       
   bag_of_words = bow_pb2.BagOfWords()
   progress = iwutil.MakeProgressBar(reader.Entries())
   for i, (key, value) in enumerate(reader):
     image_id = iwutil.KeyToUint64(key)
     bag_of_words.ParseFromString(value)
     index.Add(image_id, bag_of_words)
     progress.update(i)
   
    index.Save(temp_ivf_filepath)

    py_pert.Remove(self.index_base_uri)
    mr.CopyUri('local://' + temp_ivf_filepath, self.index_base_uri)
   CHECK(py_pert.Exists(self.index_base_uri + '/index.ivf'))
   CHECK(py_pert.Exists(self.index_base_uri + '/index.ivfids'))
   
   shutil.rmtree(temp_ivf_filepath, ignore_errors=True)
   return True     
Example #16
def main():
#  images_uri = 'local://media/vol-0449ca74/itergraph/tide_v14/cropped_scaled_photoid_to_image_randomaccess.pert'
#  tide_uri = 'local://media/vol-0449ca74/itergraph/tide_v14/objectid_to_object.pert'
#  distractor_images_uri = 'local://media/vol-0449ca74/oxc1_100k/photoid_to_image.pert'  
#  output_base_uri = 'local://media/vol-0449ca74/itergraph/tide_v14_mixed_v2/'
  
#  images_uri = 'local://media/vol-0449ca74/itergraph/tide_v16/photoid_to_image.pert'
#  tide_uri = 'local://media/vol-0449ca74/itergraph/tide_v16/objectid_to_object.pert'
#  distractor_images_uri = 'local://media/vol-0449ca74/oxc1_100k/photoid_to_image.pert'  
#  output_base_uri = 'local://media/vol-0449ca74/itergraph/tide_v16_mixed/'

  images_uri = 'local://media/vol-0449ca74/itergraph/tide_v18/photoid_to_image.pert'
  tide_uri = 'local://media/vol-0449ca74/itergraph/tide_v18/objectid_to_object.pert'
  distractor_images_uri = 'local://media/vol-0449ca74/oxc1_100k/photoid_to_image.pert'  
  output_base_uri = 'local://media/vol-0449ca74/itergraph/tide_v18_mixed/'
  
  output_tide_uri = '%s/objectid_to_object.pert' % (output_base_uri)
  output_images_uri = '%s/photoid_to_image.pert'  % (output_base_uri)
  
  image_loader = visutil.BatchImageLoader(images_uri)  
  tide_objects = OpenTideDataset(tide_uri)
  distractor_image_loader = DistractorImageLoader(distractor_images_uri)
  CHECK_EQ(len(tide_objects), 2)
  object_a = tide_objects[0]
  object_b = tide_objects[1]
   
  new_object_a = InitNewObject(object_a) 
  new_object_b = InitNewObject(object_b)
  
  a_none_image_ids = [photo.id for photo in object_a.photos if photo.label == tide_pb2.NONE ]  
  b_none_image_ids = [photo.id for photo in object_b.photos if photo.label == tide_pb2.NONE ]  
  
  mixed_aux_images = {}
  for imageid_a, imageid_b in zip(a_none_image_ids, b_none_image_ids):
    mixed_aux_images[imageid_a] = (imageid_a, imageid_b)

  mixed_image_ids = mixed_aux_images.keys()
  n = int(len(mixed_image_ids) / 2.0)
  InitNoneLabels(new_object_a, mixed_image_ids[:n])
  InitNoneLabels(new_object_b, mixed_image_ids[n:])  # original [n:-1] dropped the last id
  new_objects = [new_object_a, new_object_b]
  
  image_ids = []
  for obj in new_objects:
    for photo in obj.photos:
      image_ids.append(photo.id)
  
  image_ids.sort()
  
  # write new tide pert
  tide_writer = py_pert.ProtoTableWriter()
  CHECK(tide_writer.Open(tide_pb2.Object(), output_tide_uri, 1))
  for obj in new_objects:
    tide_writer.Add(iwutil.Uint64ToKey(obj.id), obj.SerializeToString())  
  tide_writer.Close()
  
  # write new image pert
  try:
    image_writer = py_pert.ProtoTableWriter()
    CHECK(image_writer.Open(iw_pb2.JpegImage(), output_images_uri, 1))
    used_image_ids = set()
    progress = iwutil.MakeProgressBar(len(image_ids))
    for i, image_id in enumerate(image_ids):
      jpeg = None
      if image_id in mixed_aux_images:
        imageid_a, imageid_b = mixed_aux_images[image_id]
        jpeg_a = image_loader.GetImage(imageid_a)
        jpeg_b = image_loader.GetImage(imageid_b)
        if jpeg_a is None or jpeg_b is None:
          LOG(INFO, 'skipping missing jpeg') 
          continue
        jpeg = CreateMixedJpeg(jpeg_a, jpeg_b)
      else:
        distractor = None
        while True:
          distractor = distractor_image_loader.GetNextImage()
          if distractor.width > distractor.height:
            break
        CHECK(distractor)
        jpeg = CreateMixedJpeg(image_loader.GetImage(image_id), 
                               distractor)
      
      CHECK(image_id not in used_image_ids)
      CHECK(jpeg)
      image_writer.Add(iwutil.Uint64ToKey(image_id), jpeg.SerializeToString())
      used_image_ids.add(image_id)
      progress.update(i)
    image_writer.Close()
  except Exception as e:
    LOG(INFO, 'failed writing image pert: %s' % e)

  return
Example #17
File: itergraph.py Project: heathkh/iwct
    def Run(self):
        LOG(
            INFO,
            'waiting to let running processes give up memory... I need a lot and may not get enough if we rush things...'
        )
        time.sleep(30)
        itergraph_state = LoadObjectFromUri(
            self.GetInput('prev_state').GetUri())
        reader = py_pert.StringTableReader()
        CHECK(reader.Open(self.GetInput('candidates').GetUri()))
        self.match_groups = {}
        num_selected_candidates = 0

        pbar = iwutil.MakeProgressBar(self.max_candidates_per_phase)
        num_edges_skipped_max_degree_constraint = 0
        num_edges_skipped_max_replication_constraint = 0
        prev_score = -float('inf')

        for ordering_key, candidate_pair_data in reader:
            image_a_id, image_b_id = iwutil.ParseUint64KeyPair(
                candidate_pair_data)
            if itergraph_state.PreviouslyAttempted(image_a_id, image_b_id):
                #print 'skipping previous attempted edge'
                continue
            # check precondition... candidates pert is sorted (increasing by rank or by negative cbir score)
            score = iwutil.KeyToDouble(ordering_key)
            CHECK_GE(score, prev_score)
            prev_score = score

            if image_a_id not in self.match_groups:
                self.match_groups[image_a_id] = []

            match_group_size = len(self.match_groups[image_a_id])

            if match_group_size < self.max_batch_size:
                # test vertex degree condition
                degree_a = itergraph_state.GetDegree(image_a_id)
                degree_b = itergraph_state.GetDegree(image_b_id)

                # version 1: skip candidate edge if either of the vertices has many edges
                #if degree_a < self.max_vertex_degree and degree_b < self.max_vertex_degree:

                # version 2: skip candidate edge only if both of the vertices have many edges
                if degree_a < self.max_vertex_degree or degree_b < self.max_vertex_degree:
                    # test max replication condition
                    num_replications = self._GetNumReplications(image_b_id)
                    if num_replications < self.max_replication_factor:
                        self._IncrementReplications(image_b_id)
                        self.match_groups[image_a_id].append(image_b_id)
                        num_selected_candidates += 1
                        pbar.update(num_selected_candidates)
                    else:
                        num_edges_skipped_max_replication_constraint += 1
                else:
                    num_edges_skipped_max_degree_constraint += 1

            if num_selected_candidates >= self.max_candidates_per_phase:
                break

        pbar.finish()

        print ''
        print ''
        print 'num_edges_skipped_max_replication_constraint: %d' % (
            num_edges_skipped_max_replication_constraint)
        print 'num_edges_skipped_max_degree_constraint: %d' % (
            num_edges_skipped_max_degree_constraint)
        print ''
        print ''

        # write out the match plan (must be sorted by key for future join stage)
        metadata_entries = []

        for batch_id, (batch_primary_image, batch_image_ids) in enumerate(
                self.match_groups.iteritems()):
            if len(batch_image_ids) == 0:
                continue
            batch_name = iwutil.Uint64ToKey(batch_id)
            CHECK(batch_name)
            CHECK(len(batch_name))
            match_batch_metadata = iw_pb2.MatchBatchMetadata()
            match_batch_metadata.image_id = batch_primary_image
            match_batch_metadata.batch_name = batch_name
            match_batch_metadata.is_primary = True
            metadata_entries.append(match_batch_metadata)

            for image_id in batch_image_ids:
                next_metadata = iw_pb2.MatchBatchMetadata()
                next_metadata.image_id = image_id
                next_metadata.batch_name = batch_name
                next_metadata.is_primary = False
                metadata_entries.append(next_metadata)

        # image_id will be the key of output, so need to sort by image_id
        metadata_entries.sort(key=lambda m: m.image_id)
        match_batches_uri = self.GetOutput('sorted_match_batches').GetUri()

        # TODO(heathkh): "closing" doesn't flush to disk... this is a bug!
        #    match_plan_writer = py_pert.ProtoTableWriter()
        #    num_shards_features = py_pert.GetNumShards(self.features.GetUri())
        #    CHECK(match_plan_writer.Open(iw_pb2.MatchBatchMetadata(), match_batches_uri, num_shards_features))
        #    for metadata in metadata_entries:
        #      CHECK(metadata.IsInitialized())
        #      key = iwutil.Uint64ToKey(metadata.image_id)
        #      CHECK(match_plan_writer.Add(key, metadata.SerializeToString()))
        #    match_plan_writer.Close()

        # TODO(kheath): workaround for the above bug: run an MR stage to reshard
        tmp_match_batches_uri = self.GetOutput(
            'sorted_match_batches').GetUri() + '_to_be_sharded'
        match_plan_writer = py_pert.ProtoTableWriter()
        num_shards_features = py_pert.GetNumShards(self.features.GetUri())
        CHECK(
            match_plan_writer.Open(iw_pb2.MatchBatchMetadata(),
                                   tmp_match_batches_uri, 1))

        for metadata in metadata_entries:
            CHECK(metadata.IsInitialized())
            CHECK(
                match_plan_writer.Add(iwutil.Uint64ToKey(metadata.image_id),
                                      metadata.SerializeToString()))
        match_plan_writer.Close()

        # manually reshard
        pertedit_bin = 'pertedit'
        cmd = '%s --input %s --output %s --new_block_size_mb=10 --num_output_shards=%d' % (
            pertedit_bin, tmp_match_batches_uri, match_batches_uri,
            num_shards_features)
        print cmd
        CHECK_EQ(ExecuteCmd(cmd), 0)

        CHECK(py_pert.Exists(match_batches_uri))

        ok, fp = py_pert.GetShardSetFingerprint(match_batches_uri)
        CHECK(ok)
        CHECK_EQ(len(fp), 32)
        CHECK_NE(fp, 'd41d8cd98f00b204e9800998ecf8427e',
                 'invalid hash of empty string')

        return
Example #18
File: itergraph.py Project: heathkh/iwct
    def Run(self):
        itergraph_state = LoadObjectFromUri(
            self.GetInput('prev_state').GetUri())
        if True:  # hack to put this block in its own scope to force release of memory resources
            reader = py_pert.StringTableReader()
            CHECK(reader.Open(self.GetInput('match_results').GetUri()))
            match_result = iw_pb2.GeometricMatchResult()
            num_entries = reader.Entries()
            if num_entries:
                pbar = iwutil.MakeProgressBar(num_entries)
                for i, (k, v) in enumerate(reader):
                    pbar.update(i)
                    match_result.ParseFromString(v)
                    success = False
                    for match in match_result.matches:
                        if match.nfa < -20:
                            success = True
                    if success:
                        itergraph_state.AddSuccesfulEdge(
                            match_result.image_a_id, match_result.image_b_id,
                            match_result.properties.score, self.phase)
                    else:
                        itergraph_state.AddFailedEdge(match_result.image_a_id,
                                                      match_result.image_b_id)
            print 'edges: %d' % (len(itergraph_state.edges))
        SaveObjectToUri(itergraph_state,
                        self.GetOutput('itergraph_state').GetUri())
        return