Example #1
File: util.py Project: heathkh/iwct
 def __init__(self, output_uri, raw_images, crop_fraction, min_dimension_pixels, min_area_pixels, max_area_pixels):
   super(ScrubImagesFlow, self).__init__()    
   self.parameters['crop_fraction'] = crop_fraction
   self.parameters['min_dimension_pixels'] = min_dimension_pixels
   self.parameters['min_area_pixels'] = min_area_pixels
   self.parameters['max_area_pixels'] = max_area_pixels
   
   self.AddInput('raw_images', raw_images)    
   self.AddOutput('scrubbed_images', core.PertResource(self, '%s/photoid_to_image.pert' % (output_uri)))
   self.SetPipesBinary(__file__, 'mr_scrub_images')
   self.num_reduce_jobs = py_pert.GetNumShards(raw_images.GetUri())
   return
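Example #1 builds a MapReduce flow in its constructor: it registers parameters, declares input and output pert resources, binds the pipes binary, and sizes the reduce phase from the input's shard count. A minimal construction sketch follows; the resource constructor arguments and all parameter values are illustrative assumptions, not taken from the project:

    # Hypothetical usage sketch -- the PertResource arguments and every
    # parameter value below are illustrative assumptions.
    raw_images = core.PertResource(None, 'local://data/photoid_to_image.pert')
    flow = ScrubImagesFlow(output_uri='local://data/scrubbed',
                           raw_images=raw_images,
                           crop_fraction=0.05,
                           min_dimension_pixels=100,
                           min_area_pixels=100 * 100,
                           max_area_pixels=2000 * 2000)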
Example #2
    def Run(self):

        print 'pid: %s' % os.getpid()
        print 'id(py_pert): %s' % id(py_pert)
        ok, scheme, path = py_pert.ParseUri(self.uri)
        print 'path: %s' % path
        print 'exists: %s' % py_pert.Exists(self.uri)
        if py_pert.Exists(self.uri):
            print 'num shards: %s' % py_pert.GetNumShards(self.uri)

        super(MyOperation, self).Run()

        return True
Example #3
File: parallel_1.py Project: heathkh/iwct
    def Run(self):
        print 'pid: %s' % os.getpid()
        print 'id(py_pert): %s' % id(py_pert)
        ok, scheme, path = py_pert.ParseUri(self.uri)
        print 'path: %s' % path
        print 'exists: %s' % py_pert.Exists(self.uri)
        if py_pert.Exists(self.uri):
            print 'num shards: %s' % py_pert.GetNumShards(self.uri)
            reader = py_pert.StringTableReader()
            print 'about to open reader'
            reader.Open(self.uri)
            print 'about to use reader'
            count = 0
            for k, v in reader:
                print k
                count += 1
                if count > 5:
                    break

        return True
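Examples #2 and #3 follow the same probe-then-read pattern: test that the URI exists, query its shard count, then iterate it with a StringTableReader. A condensed standalone sketch of that pattern, with the URI as an illustrative assumption (note that Example #5 additionally wraps Open in CHECK to catch failures):

    # Condensed sketch of the probe-then-read pattern; the URI is an
    # illustrative assumption.
    uri = 'local://data/photoid_to_image.pert'
    if py_pert.Exists(uri):
        print 'num shards: %s' % py_pert.GetNumShards(uri)
        reader = py_pert.StringTableReader()
        reader.Open(uri)
        for k, v in reader:
            print k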
Example #4
def ComputeMaxNumSplits(input_uri):
    """ Calculate the number of splits that are possible for a given pert file."""
    total_size_bytes = CalculatePertFileSize(input_uri)
    num_shards = py_pert.GetNumShards(input_uri)
    num_entries, max_block_size = GetUriSplitInfo(input_uri)
    min_split_size = max_block_size
    if min_split_size == 0:
        LOG(FATAL, 'The input is empty: %s' % input_uri)

    if not num_entries:
        LOG(FATAL, 'pert file has no entries: %s' % input_uri)

    # if we allowed more splits than this, one of them would be smaller than the minimum split size
    max_num_splits = max(1, int(total_size_bytes / float(min_split_size)))

    #print 'total_size_bytes: %d' % (total_size_bytes)
    #print 'min_split_size: %d' % (min_split_size)
    #print 'max_num_splits: %d' % (max_num_splits)

    # we can't have more splits than we have entries
    if num_entries < max_num_splits:
        max_num_splits = num_entries

    return max_num_splits
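A hypothetical call, assuming a pert file already exists at the given URI:

    input_uri = 'local://data/photoid_to_image.pert'  # illustrative assumption
    max_splits = ComputeMaxNumSplits(input_uri)
    print 'input %s supports at most %d splits' % (input_uri, max_splits)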
Example #5
File: itergraph.py Project: heathkh/iwct
    def Run(self):
        LOG(
            INFO,
            'waiting for running processes to release memory; this stage needs a lot and may not get enough if rushed'
        )
        time.sleep(30)
        itergraph_state = LoadObjectFromUri(
            self.GetInput('prev_state').GetUri())
        reader = py_pert.StringTableReader()
        CHECK(reader.Open(self.GetInput('candidates').GetUri()))
        self.match_groups = {}
        num_selected_candidates = 0

        pbar = iwutil.MakeProgressBar(self.max_candidates_per_phase)
        num_edges_skipped_max_degree_constraint = 0
        num_edges_skipped_max_replication_constraint = 0
        prev_score = -float('inf')

        for ordering_key, candidate_pair_data in reader:
            image_a_id, image_b_id = iwutil.ParseUint64KeyPair(
                candidate_pair_data)
            if itergraph_state.PreviouslyAttempted(image_a_id, image_b_id):
                #print 'skipping previous attempted edge'
                continue
            # precondition: the candidates pert is sorted by ordering key (increasing rank or negative cbir score)
            score = iwutil.KeyToDouble(ordering_key)
            CHECK_GE(score, prev_score)
            prev_score = score

            if image_a_id not in self.match_groups:
                self.match_groups[image_a_id] = []

            match_group_size = len(self.match_groups[image_a_id])

            if match_group_size < self.max_batch_size:
                # test vertex degree condition
                degree_a = itergraph_state.GetDegree(image_a_id)
                degree_b = itergraph_state.GetDegree(image_b_id)

                # version 1: skip candidate edge if either of the vertices has many edges
                #if degree_a < self.max_vertex_degree and degree_b < self.max_vertex_degree:

                # version 2: skip candidate edge only if both of the vertices have many edges
                if degree_a < self.max_vertex_degree or degree_b < self.max_vertex_degree:
                    # test max replication condition
                    num_replications = self._GetNumReplications(image_b_id)
                    if num_replications < self.max_replication_factor:
                        self._IncrementReplications(image_b_id)
                        self.match_groups[image_a_id].append(image_b_id)
                        num_selected_candidates += 1
                        pbar.update(num_selected_candidates)
                    else:
                        num_edges_skipped_max_replication_constraint += 1
                else:
                    num_edges_skipped_max_degree_constraint += 1

            if num_selected_candidates >= self.max_candidates_per_phase:
                break

        pbar.finish()

        print ''
        print ''
        print 'num_edges_skipped_max_replication_constraint: %d' % (
            num_edges_skipped_max_replication_constraint)
        print 'num_edges_skipped_max_degree_constraint: %d' % (
            num_edges_skipped_max_degree_constraint)
        print ''
        print ''

        # write out the match plan (must be sorted by key for future join stage)
        metadata_entries = []

        for batch_id, (batch_primary_image, batch_image_ids) in enumerate(
                self.match_groups.iteritems()):
            if len(batch_image_ids) == 0:
                continue
            batch_name = iwutil.Uint64ToKey(batch_id)
            CHECK(batch_name)
            CHECK(len(batch_name))
            match_batch_metadata = iw_pb2.MatchBatchMetadata()
            match_batch_metadata.image_id = batch_primary_image
            match_batch_metadata.batch_name = batch_name
            match_batch_metadata.is_primary = True
            metadata_entries.append(match_batch_metadata)

            for image_id in batch_image_ids:
                next_metadata = iw_pb2.MatchBatchMetadata()
                next_metadata.image_id = image_id
                next_metadata.batch_name = batch_name
                next_metadata.is_primary = False
                metadata_entries.append(next_metadata)

        # image_id will be the key of output, so need to sort by image_id
        metadata_entries.sort(key=lambda m: m.image_id)
        match_batches_uri = self.GetOutput('sorted_match_batches').GetUri()

        # TODO(heathkh): "closing" doesn't flush to disk... this is a bug!
        #    match_plan_writer = py_pert.ProtoTableWriter()
        #    num_shards_features = py_pert.GetNumShards(self.features.GetUri())
        #    CHECK(match_plan_writer.Open(iw_pb2.MatchBatchMetadata(), match_batches_uri, num_shards_features))
        #    for metadata in metadata_entries:
        #      CHECK(metadata.IsInitialized())
        #      key = iwutil.Uint64ToKey(metadata.image_id)
        #      CHECK(match_plan_writer.Add(key, metadata.SerializeToString()))
        #    match_plan_writer.Close()

        # TODO(kheath): workaround for the bug above is to run an MR stage to reshard
        tmp_match_batches_uri = self.GetOutput(
            'sorted_match_batches').GetUri() + '_to_be_sharded'
        match_plan_writer = py_pert.ProtoTableWriter()
        num_shards_features = py_pert.GetNumShards(self.features.GetUri())
        CHECK(
            match_plan_writer.Open(iw_pb2.MatchBatchMetadata(),
                                   tmp_match_batches_uri, 1))

        for metadata in metadata_entries:
            CHECK(metadata.IsInitialized())
            CHECK(
                match_plan_writer.Add(iwutil.Uint64ToKey(metadata.image_id),
                                      metadata.SerializeToString()))
        match_plan_writer.Close()

        # manually reshard
        pertedit_bin = 'pertedit'
        cmd = '%s --input %s --output %s --new_block_size_mb=10 --num_output_shards=%d' % (
            pertedit_bin, tmp_match_batches_uri, match_batches_uri,
            num_shards_features)
        print cmd
        CHECK_EQ(ExecuteCmd(cmd), 0)

        CHECK(py_pert.Exists(match_batches_uri))

        ok, fp = py_pert.GetShardSetFingerprint(match_batches_uri)
        CHECK(ok)
        CHECK_EQ(len(fp), 32)  # an md5 hex digest is 32 characters
        CHECK_NE(fp, 'd41d8cd98f00b204e9800998ecf8427e',
                 'invalid hash of empty string')

        return
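The reshard workaround in Example #5 shells out to pertedit. A generic sketch of the same pattern, using subprocess in place of the project's ExecuteCmd helper (the flags are copied from the example; the function name and URIs are illustrative assumptions):

    import subprocess

    def ReshardPertFile(input_uri, output_uri, num_shards):
        # Rewrite a pert file with pertedit to change its shard count
        # (same flags as Example #5).
        cmd = ('pertedit --input %s --output %s --new_block_size_mb=10 '
               '--num_output_shards=%d' % (input_uri, output_uri, num_shards))
        print cmd
        assert subprocess.call(cmd, shell=True) == 0, 'pertedit failed'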