Example #1
    def parallelize_bounding_box( self,
                                  instance_name,
                                  bounding_box_zyx,
                                  grid,
                                  target_partition_size_voxels ):
        """
        Create an RDD for the given data instance (of either grayscale, labelblk, labelarray, or labelmap),
        within the given bounding_box (start_zyx, stop_zyx) and split into blocks of the given shape.
        The RDD parallelism will be set to include approximately target_partition_size_voxels in total.
        """
        block_size_voxels = np.prod(grid.block_shape)
        rdd_partition_length = target_partition_size_voxels // block_size_voxels

        bricks = generate_bricks_from_volume_source( bounding_box_zyx,
                                                     grid,
                                                     self.get_volume_accessor(instance_name),
                                                     self.sc,
                                                     rdd_partition_length )
        
        # If we're working with a tiny volume (e.g. testing),
        # make sure we at least parallelize across all cores.
        if bricks.getNumPartitions() < cpus_per_worker() * num_worker_nodes():
            bricks = bricks.repartition( cpus_per_worker() * num_worker_nodes() )

        return bricks
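
As a side note, the partition-length arithmetic above is just integer division of a voxel budget by the voxels per brick. A minimal standalone sketch, with invented numbers:

import numpy as np

# Invented values, for illustration only.
block_shape = np.array([64, 64, 64])         # one brick = 64**3 voxels
target_partition_size_voxels = 10_000_000    # voxel budget per RDD partition

block_size_voxels = np.prod(block_shape)                                   # 262144
rdd_partition_length = target_partition_size_voxels // block_size_voxels  # 38 bricks per partition
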
Example #2
    def _sanitize_config(self):
        """
        Tidy up some config values, and fill in 'auto' values where needed.
        """
        input_config = self.config_data["input"]
        output_config = self.config_data["output"]
        options = self.config_data["options"]

        # Initialize dummy input/output services, just to overwrite 'auto' config values as needed.
        VolumeService.create_from_config(input_config, self.config_dir)

        # Output bounding-box must match exactly (or left as auto)
        input_bb_zyx = self.input_service.bounding_box_zyx
        output_bb_zyx = self.output_service.bounding_box_zyx
        assert ((output_bb_zyx == input_bb_zyx) | (output_bb_zyx == -1)).all(), \
            "Output bounding box must match the input bounding box exactly. (No translation permitted)."

        assert output_config["slice-files"]["slice-xy-offset"] == [
            0, 0
        ], "Nonzero xy offset is meaningless for outputs."

        if options["slices-per-slab"] == -1:
            # Auto-choose a depth that keeps all threads busy with at least one slice
            brick_shape_zyx = self.input_service.preferred_message_shape
            brick_depth = brick_shape_zyx[0]
            assert brick_depth != -1
            num_threads = num_worker_nodes() * cpus_per_worker()
            threads_per_brick_layer = (
                (num_threads + brick_depth - 1) // brick_depth)  # round up
            options["slices-per-slab"] = brick_depth * threads_per_brick_layer
Example #3
def launch_spark_cluster(job_name, num_spark_workers, max_hours, job_log_dir):
    num_nodes = num_spark_workers + 1 # Add one for master
    num_slots = num_nodes * cpus_per_worker()
    
    job = Bjob( 'dummy-string',
                name=f'{job_name}-cluster',
                app_env=f'sparkbatch32({SPARK_VERSION})',
                num_slots=num_slots,
                max_runtime_minutes=int(max_hours * 60),
                stdout_file=f'{job_log_dir}/{job_name}-cluster.log' )

    try:
        print("Launching spark cluster:")
        master_job_id, queue_name, master_hostname = job.submit()
        assert queue_name == 'spark32', f"Unexpected queue name for master job: {queue_name}"

        print(f'...master ({master_job_id}) is running on http://{master_hostname}:8080\n')
        
        rtm_url = get_hostgraph_url(master_job_id)
        print(f"Cluster host graphs:\n{rtm_url}")
        
        return master_job_id, master_hostname

    except KeyboardInterrupt:
        if job.job_id:
            print(f"Interrupted. Killing job {job.job_id}")
            kill_job(job.job_id)
        raise
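
A hypothetical call matching the signature above (the job name, worker count, and log directory are invented for illustration; max_hours covers the whole cluster lifetime):

master_job_id, master_hostname = launch_spark_cluster( 'my-workflow',
                                                       num_spark_workers=16,
                                                       max_hours=8,
                                                       job_log_dir='/path/to/logs' )
print(f"Spark master UI: http://{master_hostname}:8080")
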
Example #4
    def _sanitize_config(self):
        """
        Tidy up some config values, and fill in 'auto' values where needed.
        """
        input_config = self.config_data["input"]
        output_config = self.config_data["output"]
        options = self.config_data["options"]

        # Initialize dummy input/output services, just to overwrite 'auto' config values as needed.
        VolumeService.create_from_config( input_config, self.config_dir )

        # Output bounding-box must match exactly (or left as auto)
        input_bb_zyx = self.input_service.bounding_box_zyx
        output_bb_zyx = self.output_service.bounding_box_zyx
        assert ((output_bb_zyx == input_bb_zyx) | (output_bb_zyx == -1)).all(), \
            "Output bounding box must match the input bounding box exactly. (No translation permitted)."

        assert output_config["slice-files"]["slice-xy-offset"] == [0,0], "Nonzero xy offset is meaningless for outputs."

        if options["slices-per-slab"] == -1:
            # Auto-choose a depth that keeps all threads busy with at least one slice
            brick_shape_zyx = self.input_service.preferred_message_shape
            brick_depth = brick_shape_zyx[0]
            assert brick_depth != -1
            num_threads = num_worker_nodes() * cpus_per_worker()
            threads_per_brick_layer = ((num_threads + brick_depth-1) // brick_depth) # round up
            options["slices-per-slab"] = brick_depth * threads_per_brick_layer
Example #5
    def _execute_labelindices(self, mapping_df):
        config = self.config_data
        options = config["options"]
        resource_manager_client = ResourceManagerClient(
            options["resource-server"], options["resource-port"])

        last_mutid = options["mutation-id"]
        server = config["dvid"]["server"]
        uuid = config["dvid"]["uuid"]
        instance_name = config["dvid"]["segmentation-name"]
        endpoint = f'{server}/api/node/{uuid}/{instance_name}/indices'

        processor = StatsBatchProcessor(last_mutid, endpoint)

        # Load the h5 file
        block_sv_stats = load_stats_h5_to_records(config["block-stats-file"])

        # Note: Initializing this generator involves sorting the (very large) stats array
        batch_rows = options["batch-row-count"]
        batch_generator = generate_stats_batches(block_sv_stats, mapping_df,
                                                 batch_rows)

        batches = self.sc.parallelize(batch_generator,
                                      cpus_per_worker() * num_worker_nodes())
        rt.persist_and_execute(batches, "Distributing batches", logger)

        def process_batch(item):
            stats_batch, total_rows = item
            approximate_bytes = 30 * total_rows  # this is highly unscientific
            with resource_manager_client.access_context(
                    server, False, 1, approximate_bytes):
                processor.process_batch((stats_batch, total_rows))

        with Timer("Processing/sending batches", logger):
            batches.foreach(process_batch)
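
generate_stats_batches() itself isn't shown here; conceptually it yields (batch, row_count) pairs of at most batch-row-count rows each. A rough, standalone sketch of that batching idea (not the actual implementation):

import numpy as np

def batch_by_rows(stats, batch_row_count):
    """Yield (batch, num_rows) pairs of at most batch_row_count rows each."""
    for start in range(0, len(stats), batch_row_count):
        batch = stats[start:start + batch_row_count]
        yield batch, len(batch)

# Dummy structured array standing in for the block statistics.
stats = np.zeros(10, dtype=[('segment_id', np.uint64), ('count', np.uint32)])
for batch, n in batch_by_rows(stats, 4):
    print(n)   # 4, 4, 2
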
Example #6
    def group_by_body(self, segments_and_meshes):
        config = self.config_data

        # Group according to scheme
        grouping_scheme = config["mesh-config"]["storage"]["grouping-scheme"]
        n_partitions = num_worker_nodes() * cpus_per_worker()

        if grouping_scheme == "hundreds":

            def last_six_digits(id_mesh):
                body_id, _mesh = id_mesh
                group_id = body_id - (body_id % 100)
                return group_id

            grouped_body_ids_and_meshes = segments_and_meshes.groupBy(
                last_six_digits, numPartitions=n_partitions)

        elif grouping_scheme == "labelmap":
            import pandas as pd
            mapping_pairs = self.load_labelmap()

            def prepend_mapped_group_id(id_mesh_partition):
                df = pd.DataFrame(mapping_pairs,
                                  columns=["body_id", "group_id"])

                new_partition = []
                for id_mesh in id_mesh_partition:
                    body_id, mesh = id_mesh
                    rows = df.loc[df.body_id == body_id]
                    if len(rows) == 0:
                        # If missing from labelmap,
                        # we assume an implicit identity mapping
                        group_id = body_id
                    else:
                        group_id = rows['group_id'].iloc[0]
                    new_partition.append((group_id, (body_id, mesh)))
                return new_partition

            # We do this via mapPartitions().groupByKey() instead of a simple groupBy()
            # to save time constructing the DataFrame inside the closure above.
            # (TODO: Figure out why the dataframe isn't pickling properly...)
            skip_groups = set(config["mesh-config"]["storage"]["skip-groups"])
            grouped_body_ids_and_meshes = segments_and_meshes.mapPartitions( prepend_mapped_group_id ) \
                                                             .filter(lambda item: item[0] not in skip_groups) \
                                                             .groupByKey(numPartitions=n_partitions)
        elif grouping_scheme in ("singletons", "no-groups"):
            # Create 'groups' of one item each, re-using the body ID as the group id.
            # (The difference between 'singletons' and 'no-groups' is in how the mesh is stored, below.)
            grouped_body_ids_and_meshes = segments_and_meshes.map(
                lambda id_mesh: (id_mesh[0], [(id_mesh[0], id_mesh[1])]))

        persist_and_execute(
            grouped_body_ids_and_meshes,
            f"Grouping meshes with scheme: '{grouping_scheme}'", logger)
        return grouped_body_ids_and_meshes
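
The 'hundreds' scheme simply buckets bodies by dropping the last two digits of the body ID. The same key function, illustrated without Spark:

from collections import defaultdict

def hundreds_group(body_id):
    # Same arithmetic as last_six_digits() above: round down to the nearest 100.
    return body_id - (body_id % 100)

groups = defaultdict(list)
for body_id in [101, 153, 199, 200, 305]:
    groups[hundreds_group(body_id)].append(body_id)

print(dict(groups))   # {100: [101, 153, 199], 200: [200], 300: [305]}
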
Example #7
    def from_accessor_func(cls, bounding_box, grid, volume_accessor_func=None, sc=None, target_partition_size_voxels=None, sparse_boxes=None, lazy=False):
        """
        Convenience constructor, taking an arbitrary volume_accessor_func.
        
        Args:
            bounding_box:
                (start, stop)
     
            grid:
                Grid (see brick.py)
     
            volume_accessor_func:
                Callable with signature: f(box) -> ndarray
                Note: The callable will be unpickled only once per partition, so initialization
                      costs after unpickling are only incurred once per partition.
     
            sc:
                SparkContext. If provided, an RDD is returned.  Otherwise, returns an ordinary Python iterable.
     
            target_partition_size_voxels:
                Optional. If provided, the RDD partition lengths (i.e. the number of bricks per RDD partition)
                will be chosen to have (approximately) this many total voxels in each partition.
            
            sparse_boxes:
                A list of (physical) sparse boxes indicating which bricks should actually be present in the BrickWall.
                If not provided, all bricks within the bounding_box will be present. 

            lazy:
                If True, the bricks' data will not be created until their 'volume' member is first accessed.
        """
        if target_partition_size_voxels is None:
            if sc:
                num_threads = num_worker_nodes() * cpus_per_worker()
            else:
                # See RDDtools -- for now, non-spark pseudo-RDDs are just a single partition.
                num_threads = 1

            if sparse_boxes is None:
                total_voxels = np.prod(bounding_box[1] - bounding_box[0])
            else:
                if not hasattr(sparse_boxes, '__len__'):
                    sparse_boxes = list(sparse_boxes)
                total_voxels = sum( map(lambda physbox: np.prod(physbox[1] - physbox[0]), sparse_boxes ) )
            
            voxels_per_thread = total_voxels / num_threads
            target_partition_size_voxels = (voxels_per_thread // 2) # Arbitrarily aim for 2 partitions per thread

        block_size_voxels = np.prod(grid.block_shape)
        rdd_partition_length = target_partition_size_voxels // block_size_voxels

        bricks = generate_bricks_from_volume_source(bounding_box, grid, volume_accessor_func, sc, rdd_partition_length, sparse_boxes, lazy)
        return BrickWall( bounding_box, grid, bricks )
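
For illustration, here is what a trivial volume_accessor_func looks like, matching the f(box) -> ndarray signature documented above (the commented-out constructor call assumes BrickWall and Grid are importable from this package):

import numpy as np

def zero_accessor(box):
    """Example volume_accessor_func: returns a zero volume of shape (stop - start)."""
    box = np.asarray(box)
    return np.zeros(box[1] - box[0], dtype=np.uint8)

print(zero_accessor([(0, 0, 0), (64, 64, 64)]).shape)   # (64, 64, 64)

# With sc=None, the docstring above says the result is a plain Python iterable of Bricks:
# wall = BrickWall.from_accessor_func([(0, 0, 0), (256, 256, 256)], Grid((64, 64, 64)), zero_accessor)
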
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--driver-slots', type=int, default=cpus_per_worker())
    parser.add_argument('--driver-node-type', choices=['sandy', 'haswell', 'broadwell', 'avx2'])
    parser.add_argument('--driver-queue', help='For example, "spark-drivers"')
    parser.add_argument('--job-log-dir', type=str, default='.')
    parser.add_argument('--max-hours', type=float, default=8)
    parser.add_argument('--job-name')
    parser.add_argument('num_spark_workers', type=int)
    parser.add_argument('workflow_name')
    parser.add_argument('config_file')
    args = parser.parse_args()

    if not args.job_name:
        config_name = splitext(basename(args.config_file))[0]
        args.job_name = config_name + '-{:%Y%m%d.%H%M%S}'.format(datetime.now())

    setup_environment(args.num_spark_workers, args.config_file, args.job_log_dir)
    
    master_job_id = driver_job_id = None
    
    try:
        master_job_id, master_hostname = launch_spark_cluster( args.job_name,
                                                               args.num_spark_workers,
                                                               args.max_hours + 10/60, # 10 extra minutes for the spark cluster;  
                                                               args.job_log_dir)       # it's easier to make sense of the logs when the driver dies first.
    
        driver_job_id, _driver_hostname = launch_driver_job( master_job_id,
                                                             master_hostname,
                                                             args.driver_slots,
                                                             args.driver_node_type,
                                                             args.driver_queue, # e.g. 'spark-drivers'
                                                             args.job_log_dir,
                                                             args.max_hours,
                                                             args.job_name,
                                                             args.workflow_name,
                                                             args.config_file )
    except BaseException as ex:
        if isinstance(ex, KeyboardInterrupt):
            print("User Interrupted!")
        if master_job_id:
            print(f"Killing master (job {master_job_id})")
            kill_job(master_job_id)
        if driver_job_id:
            print(f"Killing driver (job {driver_job_id})")
            kill_job(driver_job_id)
        if isinstance(ex, KeyboardInterrupt):
            return 1
    
    return 0
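
The default job name above is just the config file's base name plus a timestamp; a standalone illustration with a made-up path:

from datetime import datetime
from os.path import basename, splitext

config_file = '/path/to/copy-grayscale.yaml'   # hypothetical
config_name = splitext(basename(config_file))[0]
job_name = config_name + '-{:%Y%m%d.%H%M%S}'.format(datetime.now())
print(job_name)   # e.g. copy-grayscale-20240101.120000
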
Example #9
    def _process_slab(self, scale, slab_fullres_box_zyx, slab_index, num_slabs, upscale_slab_wall):
        num_threads = num_worker_nodes() * cpus_per_worker()
        slab_voxels = np.prod(slab_fullres_box_zyx[1] - slab_fullres_box_zyx[0]) // (2**scale)**3
        voxels_per_thread = slab_voxels // num_threads

        options = self.config_data["options"]
        pyramid_source = options["pyramid-source"]
        
        if pyramid_source == "copy" or scale == 0:
            # Copy from input source
            bricked_slab_wall = BrickWall.from_volume_service(self.input_service, scale, slab_fullres_box_zyx, self.sc, voxels_per_thread // 2)
            bricked_slab_wall.persist_and_execute(f"Slab {slab_index}: Downloading scale {scale}", logger)
        else:
            # Downsample from previous scale
            bricked_slab_wall = upscale_slab_wall.downsample( (2,2,2), 'grayscale' )
            bricked_slab_wall.persist_and_execute(f"Slab {slab_index}: Downsampling to scale {scale}", logger)
            upscale_slab_wall.unpersist()
            del upscale_slab_wall

        if scale == 0:
            bricked_slab_wall = self.adjust_contrast(bricked_slab_wall, slab_index)
        
        # Remap to output bricks
        output_grid = Grid(self.output_service.preferred_message_shape)
        output_slab_wall = bricked_slab_wall.realign_to_new_grid( output_grid )
        
        # Pad from previously-existing pyramid data until
        # we have full storage blocks, e.g. (64,64,64),
        # but not necessarily full bricks, e.g. (64,64,6400)
        output_accessor_func = partial(self.output_service.get_subvolume, scale=scale)

        # But don't bother fetching real data for scale 0:
        # the input slabs are already block-aligned, and the edges of each slice will be zeros anyway.
        if scale == 0:
            output_accessor_func = lambda _box: 0

        padding_grid = Grid( 3*(self.output_service.block_width,), output_grid.offset )
        padded_slab_wall = output_slab_wall.fill_missing(output_accessor_func, padding_grid)
        padded_slab_wall.persist_and_execute(f"Slab {slab_index}: Assembling scale {scale} bricks", logger)

        # Discard original bricks
        bricked_slab_wall.unpersist()
        del bricked_slab_wall

        logger.info(f"Slab {slab_index}: Writing scale {scale}", extra={"status": f"Writing {slab_index}/{num_slabs}"})
        rt.foreach( partial(write_brick, self.output_service, scale), padded_slab_wall.bricks )

        return padded_slab_wall
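
For intuition, a minimal NumPy sketch of a (2,2,2) grayscale downsample via block averaging; the actual BrickWall.downsample() implementation is not shown here and may differ:

import numpy as np

def downsample_2x(volume):
    """Average each 2x2x2 block of a volume whose dimensions are multiples of 2."""
    z, y, x = volume.shape
    blocks = volume.reshape(z // 2, 2, y // 2, 2, x // 2, 2)
    return blocks.mean(axis=(1, 3, 5)).astype(volume.dtype)

vol = np.random.randint(0, 256, (64, 64, 64), dtype=np.uint8)
print(downsample_2x(vol).shape)   # (32, 32, 32)
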
Example #10
    def group_by_body(self, segments_and_meshes):
        config = self.config_data

        # Group according to scheme
        grouping_scheme = config["mesh-config"]["storage"]["grouping-scheme"]
        n_partitions = num_worker_nodes() * cpus_per_worker()

        if grouping_scheme == "hundreds":
            def last_six_digits( id_mesh ):
                body_id, _mesh = id_mesh
                group_id = body_id - (body_id % 100)
                return group_id
            grouped_body_ids_and_meshes = segments_and_meshes.groupBy(last_six_digits, numPartitions=n_partitions)

        elif grouping_scheme == "labelmap":
            import pandas as pd
            mapping_pairs = self.load_labelmap()

            def prepend_mapped_group_id( id_mesh_partition ):
                df = pd.DataFrame( mapping_pairs, columns=["body_id", "group_id"] )

                new_partition = []
                for id_mesh in id_mesh_partition:
                    body_id, mesh = id_mesh
                    rows = df.loc[df.body_id == body_id]
                    if len(rows) == 0:
                        # If missing from labelmap,
                        # we assume an implicit identity mapping
                        group_id = body_id
                    else:
                        group_id = rows['group_id'].iloc[0]
                    new_partition.append( (group_id, (body_id, mesh)) )
                return new_partition
            
            # We do this via mapPartitions().groupByKey() instead of a simple groupBy()
            # to save time constructing the DataFrame inside the closure above.
            # (TODO: Figure out why the dataframe isn't pickling properly...)
            skip_groups = set(config["mesh-config"]["storage"]["skip-groups"])
            grouped_body_ids_and_meshes = segments_and_meshes.mapPartitions( prepend_mapped_group_id ) \
                                                             .filter(lambda item: item[0] not in skip_groups) \
                                                             .groupByKey(numPartitions=n_partitions)
        elif grouping_scheme in ("singletons", "no-groups"):
            # Create 'groups' of one item each, re-using the body ID as the group id.
            # (The difference between 'singletons' and 'no-groups' is in how the mesh is stored, below.)
            grouped_body_ids_and_meshes = segments_and_meshes.map( lambda id_mesh: (id_mesh[0], [(id_mesh[0], id_mesh[1])]) )

        persist_and_execute(grouped_body_ids_and_meshes, f"Grouping meshes with scheme: '{grouping_scheme}'", logger)
        return grouped_body_ids_and_meshes
Example #11
def generate_bricks_from_volume_source( bounding_box, grid, volume_accessor_func, sc=None, rdd_partition_length=None, sparse_boxes=None, lazy=False ):
    """
    Generate an RDD or iterable of Bricks for the given bounding box and grid.
     
    Args:
        bounding_box:
            (start, stop)
 
        grid:
            Grid (see above)
 
        volume_accessor_func:
            Callable with signature: f(box) -> ndarray
            Note: The callable will be unpickled only once per partition, so initialization
                  costs after unpickling are only incurred once per partition.
 
        sc:
            SparkContext. If provided, an RDD is returned.  Otherwise, returns an ordinary Python iterable.
 
        rdd_partition_length:
            Optional. If provided, the RDD will have (approximately) this many bricks per partition.
        
        sparse_boxes:
            Optional.
            A pre-calculated list of boxes to use instead of calculating
            the complete (dense) list of grid boxes within the bounding box.
            If provided, should be a list of physical boxes, and no two should occupy
            the same logical box, as defined by their midpoints.
            Note: They will still be clipped to the overall bounding_box.
        
        halo: An integer or shape indicating how much halo to add to each Brick's physical_box.
              The halo is applied in both 'dense' and 'sparse' cases.
    """
    if sparse_boxes is None:
        # Generate boxes from densely populated grid
        logical_boxes = boxes_from_grid(bounding_box, grid, include_halos=False)
        physical_boxes = clipped_boxes_from_grid(bounding_box, grid)
        logical_and_physical_boxes = zip( logical_boxes, physical_boxes )
    else:
        # User provided list of physical boxes.
        # Clip them to the bounding box and calculate the logical boxes.
        if not hasattr(sparse_boxes, '__len__'):
            sparse_boxes = list( sparse_boxes )
        physical_boxes = np.asarray( sparse_boxes )
        assert physical_boxes.ndim == 3 and physical_boxes.shape[1:3] == (2,3)

        def logical_and_clipped( box ):
            midpoint = (box[0] + box[1]) // 2
            logical_box = grid.compute_logical_box( midpoint )
            box += (-grid.halo_shape, grid.halo_shape)
            # Note: Non-intersecting boxes will have non-positive shape after clipping
            clipped_box = box_intersection(box, bounding_box)
            return ( logical_box, clipped_box )

        logical_and_physical_boxes = map(logical_and_clipped, physical_boxes)

        # Drop any boxes that fall completely outside the bounding box
        # Check that physical box doesn't completely fall outside its logical_box
        def is_valid(logical_and_physical):
            logical_box, physical_box = logical_and_physical
            return (physical_box[1] > logical_box[0]).all() and (physical_box[0] < logical_box[1]).all()
        logical_and_physical_boxes = filter(is_valid, logical_and_physical_boxes )

    if sc:
        if not hasattr(logical_and_physical_boxes, '__len__'):
            logical_and_physical_boxes = list(logical_and_physical_boxes) # need len()

        num_rdd_partitions = None
        if rdd_partition_length is not None:
            rdd_partition_length = max(1, rdd_partition_length)
            num_rdd_partitions = int( np.ceil( len(logical_and_physical_boxes) / rdd_partition_length ) )

        # If we're working with a tiny volume (e.g. testing),
        # make sure we at least parallelize across all cores.
        if num_rdd_partitions is not None and (num_rdd_partitions < cpus_per_worker() * num_worker_nodes()):
            num_rdd_partitions = cpus_per_worker() * num_worker_nodes()

        def brick_size(log_phys):
            _logical, physical = log_phys
            return np.uint64(np.prod(physical[1] - physical[0]))
        total_volume = sum(map(brick_size, logical_and_physical_boxes))
        logger.info(f"Initializing RDD of {len(logical_and_physical_boxes)} Bricks "
                    f"(over {num_rdd_partitions} partitions) with total volume {total_volume/1e9:.1f} Gvox")

        # Enumerate and repartition to get perfect partition sizes,
        # rather than relying on spark's default hash
        class _enumerated_value(tuple):
            # Return a hash based on the key alone.
            def __hash__(self):
                return self[0]

        enumerated_logical_and_physical_boxes = sc.parallelize( enumerate(logical_and_physical_boxes), num_rdd_partitions )
        enumerated_logical_and_physical_boxes = enumerated_logical_and_physical_boxes.map(_enumerated_value)
        enumerated_logical_and_physical_boxes = enumerated_logical_and_physical_boxes.partitionBy(num_rdd_partitions, lambda x: x)
        logical_and_physical_boxes = enumerated_logical_and_physical_boxes.values()

    def make_bricks( logical_and_physical_box ):
        logical_box, physical_box = logical_and_physical_box
        if lazy:
            return Brick(logical_box, physical_box, lazy_creation_fn=volume_accessor_func)
        else:
            volume = volume_accessor_func(physical_box)
            return Brick(logical_box, physical_box, volume)
    
    bricks = rt.map( make_bricks, logical_and_physical_boxes )
    return bricks
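
In the sparse-box branch above, each physical box is paired with the grid box containing its midpoint and then clipped to the overall bounding box. A self-contained sketch of that pairing, assuming compute_logical_box() means "the grid-aligned box containing the point" and ignoring halos:

import numpy as np

def logical_box_for(point, block_shape, grid_offset=(0, 0, 0)):
    """Grid-aligned box containing the given point."""
    block_shape = np.asarray(block_shape)
    offset = np.asarray(grid_offset)
    start = ((np.asarray(point) - offset) // block_shape) * block_shape + offset
    return np.array([start, start + block_shape])

def box_intersection(box_a, box_b):
    return np.array([np.maximum(box_a[0], box_b[0]),
                     np.minimum(box_a[1], box_b[1])])

bounding_box = np.array([[0, 0, 0], [60, 60, 60]])
physical = np.array([[10, 10, 10], [70, 70, 70]])
midpoint = (physical[0] + physical[1]) // 2
print(logical_box_for(midpoint, (64, 64, 64)).tolist())    # [[0, 0, 0], [64, 64, 64]]
print(box_intersection(physical, bounding_box).tolist())   # [[10, 10, 10], [60, 60, 60]]
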
Example #12
    def _execute_mesh_generation(self, large_id_box_mask_factor_err):
        config = self.config_data
        @self.collect_log(lambda _: '_MESH_GENERATION_ERRORS')
        def logged_generate_mesh(arg):
            return generate_mesh_in_subprocess(config, arg)
        
        #     --> (body_id, mesh_bytes, error_msg)
        body_ids_and_meshes_with_err = large_id_box_mask_factor_err.map( logged_generate_mesh )
        persist_and_execute(body_ids_and_meshes_with_err, "Computing meshes", logger)

        # Errors were already written to a separate file, but let's duplicate them in the master log. 
        errors = body_ids_and_meshes_with_err.map(lambda id_mesh_err: id_mesh_err[-1]).filter(bool).collect()
        for error in errors:
            logger.error(error)

        # Filter out error cases
        body_ids_and_meshes = body_ids_and_meshes_with_err.filter(lambda id_mesh_err: id_mesh_err[-1] is None) \
                                                          .map( lambda id_mesh_err: id_mesh_err[:2] )
                                                          
        # Group according to scheme
        grouping_scheme = config["mesh-config"]["storage"]["grouping-scheme"]
        n_partitions = num_worker_nodes() * cpus_per_worker()

        if grouping_scheme == "hundreds":
            def last_six_digits( id_mesh ):
                body_id, _mesh = id_mesh
                group_id = body_id - (body_id % 100)
                return group_id
            grouped_body_ids_and_meshes = body_ids_and_meshes.groupBy(last_six_digits, numPartitions=n_partitions)

        elif grouping_scheme == "labelmap":
            import pandas as pd
            mapping_pairs = load_labelmap( config["mesh-config"]["storage"]["labelmap"], self.config_dir )

            def prepend_mapped_group_id( id_mesh_partition ):
                df = pd.DataFrame( mapping_pairs, columns=["body_id", "group_id"] )

                new_partition = []
                for id_mesh in id_mesh_partition:
                    body_id, mesh = id_mesh
                    rows = df.loc[df.body_id == body_id]
                    if len(rows) == 0:
                        # If missing from labelmap,
                        # we assume an implicit identity mapping
                        group_id = body_id
                    else:
                        group_id = rows['group_id'].iloc[0]
                    new_partition.append( (group_id, (body_id, mesh)) )
                return new_partition
            
            # We do this via mapPartitions().groupByKey() instead of a simple groupBy()
            # to save time constructing the DataFrame inside the closure above.
            # (TODO: Figure out why the dataframe isn't pickling properly...)
            skip_groups = set(config["mesh-config"]["storage"]["skip-groups"])
            grouped_body_ids_and_meshes = body_ids_and_meshes.mapPartitions( prepend_mapped_group_id ) \
                                                             .filter(lambda item: item[0] not in skip_groups) \
                                                             .groupByKey(numPartitions=n_partitions)
        elif grouping_scheme in ("singletons", "no-groups"):
            # Create 'groups' of one item each, re-using the body ID as the group id.
            # (The difference between 'singletons' and 'no-groups' is in how the mesh is stored, below.)
            grouped_body_ids_and_meshes = body_ids_and_meshes.map( lambda id_mesh: (id_mesh[0], [(id_mesh[0], id_mesh[1])]) )

        persist_and_execute(grouped_body_ids_and_meshes, f"Grouping meshes with scheme: '{grouping_scheme}'", logger)
        unpersist(body_ids_and_meshes)
        del body_ids_and_meshes
        
        with Timer() as timer:
            grouped_body_ids_and_meshes.foreachPartition( partial(post_meshes_to_dvid, config) )
        logger.info(f"Writing meshes to DVID took {timer.seconds}")
Example #13
    def execute(self):
        self._init_services()
        self._sanitize_config()

        options = self.config_data["options"]

        output_service = self.output_service
        logger.info(
            f"Output bounding box: {output_service.bounding_box_zyx[:,::-1]}")

        # Data is processed in Z-slabs
        slab_depth = options["slices-per-slab"]

        input_bb_zyx = self.input_service.bounding_box_zyx
        _, slice_start_y, slice_start_x = input_bb_zyx[0]

        slab_shape_zyx = input_bb_zyx[1] - input_bb_zyx[0]
        slab_shape_zyx[0] = slab_depth

        slice_shape_zyx = slab_shape_zyx.copy()
        slice_shape_zyx[0] = 1

        # This grid outlines the slabs -- each grid box is a full slab
        slab_grid = Grid(slab_shape_zyx, (0, slice_start_y, slice_start_x))
        slab_boxes = list(clipped_boxes_from_grid(input_bb_zyx, slab_grid))

        for slab_index, slab_box_zyx in enumerate(slab_boxes):
            # Construct BrickWall from input bricks
            num_threads = num_worker_nodes() * cpus_per_worker()
            slab_voxels = np.prod(slab_box_zyx[1] - slab_box_zyx[0])
            voxels_per_thread = slab_voxels / num_threads

            bricked_slab_wall = BrickWall.from_volume_service(
                self.input_service, 0, slab_box_zyx, self.sc,
                voxels_per_thread / 2)

            # Force download
            bricked_slab_wall.persist_and_execute(
                f"Downloading slab {slab_index}/{len(slab_boxes)}: {slab_box_zyx[:,::-1]}",
                logger)

            # Remap to slice-sized "bricks"
            sliced_grid = Grid(slice_shape_zyx, offset=slab_box_zyx[0])
            sliced_slab_wall = bricked_slab_wall.realign_to_new_grid(
                sliced_grid)
            sliced_slab_wall.persist_and_execute(
                f"Assembling slab {slab_index}/{len(slab_boxes)} slices",
                logger)

            # Discard original bricks
            bricked_slab_wall.unpersist()
            del bricked_slab_wall

            def write_slice(brick):
                assert (brick.physical_box == brick.logical_box).all()
                output_service.write_subvolume(brick.volume,
                                               brick.physical_box[0])

            # Export to PNG or TIFF, etc. (automatic via slice path extension)
            with Timer() as timer:
                logger.info(f"Exporting slab {slab_index}/{len(slab_boxes)}",
                            extra={
                                "status":
                                f"Exporting {slab_index}/{len(slab_boxes)}"
                            })
                rt.foreach(write_slice, sliced_slab_wall.bricks)
            logger.info(
                f"Exporting slab {slab_index}/{len(slab_boxes)} took {timer.timedelta}",
                extra={"status": f"Done: {slab_index}/{len(slab_boxes)}"})

            # Discard slice data
            sliced_slab_wall.unpersist()
            del sliced_slab_wall

        logger.info(f"DONE exporting {len(slab_boxes)} slabs.",
                    extra={'status': "DONE"})
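
The slab loop above walks the input bounding box in Z-chunks of slices-per-slab depth, each clipped to the volume. clipped_boxes_from_grid() isn't shown here, but a standalone sketch of the equivalent slab boxes:

import numpy as np

def z_slab_boxes(bounding_box_zyx, slab_depth):
    """Yield (start_zyx, stop_zyx) boxes covering the bounding box in Z-slabs."""
    (z0, y0, x0), (z1, y1, x1) = bounding_box_zyx
    for z in range(z0, z1, slab_depth):
        yield np.array([[z, y0, x0], [min(z + slab_depth, z1), y1, x1]])

for box in z_slab_boxes([(0, 0, 0), (100, 512, 512)], slab_depth=40):
    print(box[:, 0].tolist())   # Z-ranges: [0, 40], [40, 80], [80, 100]
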
Example #14
    def _execute_mesh_generation(self, large_id_box_mask_factor_err):
        config = self.config_data

        @self.collect_log(lambda _: '_MESH_GENERATION_ERRORS')
        def logged_generate_mesh(arg):
            return generate_mesh_in_subprocess(config, arg)

        #     --> (body_id, mesh_bytes, error_msg)
        body_ids_and_meshes_with_err = large_id_box_mask_factor_err.map(
            logged_generate_mesh)
        persist_and_execute(body_ids_and_meshes_with_err, "Computing meshes",
                            logger)

        # Errors were already written to a separate file, but let's duplicate them in the master log.
        errors = body_ids_and_meshes_with_err.map(
            lambda id_mesh_err: id_mesh_err[-1]).filter(bool).collect()
        for error in errors:
            logger.error(error)

        # Filter out error cases
        body_ids_and_meshes = body_ids_and_meshes_with_err.filter(lambda id_mesh_err: id_mesh_err[-1] is None) \
                                                          .map( lambda id_mesh_err: id_mesh_err[:2] )

        # Group according to scheme
        grouping_scheme = config["mesh-config"]["storage"]["grouping-scheme"]
        n_partitions = num_worker_nodes() * cpus_per_worker()

        if grouping_scheme == "hundreds":

            def last_six_digits(id_mesh):
                body_id, _mesh = id_mesh
                group_id = body_id - (body_id % 100)
                return group_id

            grouped_body_ids_and_meshes = body_ids_and_meshes.groupBy(
                last_six_digits, numPartitions=n_partitions)

        elif grouping_scheme == "labelmap":
            import pandas as pd
            mapping_pairs = load_labelmap(
                config["mesh-config"]["storage"]["labelmap"], self.config_dir)

            def prepend_mapped_group_id(id_mesh_partition):
                df = pd.DataFrame(mapping_pairs,
                                  columns=["body_id", "group_id"])

                new_partition = []
                for id_mesh in id_mesh_partition:
                    body_id, mesh = id_mesh
                    rows = df.loc[df.body_id == body_id]
                    if len(rows) == 0:
                        # If missing from labelmap,
                        # we assume an implicit identity mapping
                        group_id = body_id
                    else:
                        group_id = rows['group_id'].iloc[0]
                    new_partition.append((group_id, (body_id, mesh)))
                return new_partition

            # We do this via mapPartitions().groupByKey() instead of a simple groupBy()
            # to save time constructing the DataFrame inside the closure above.
            # (TODO: Figure out why the dataframe isn't pickling properly...)
            skip_groups = set(config["mesh-config"]["storage"]["skip-groups"])
            grouped_body_ids_and_meshes = body_ids_and_meshes.mapPartitions( prepend_mapped_group_id ) \
                                                             .filter(lambda item: item[0] not in skip_groups) \
                                                             .groupByKey(numPartitions=n_partitions)
        elif grouping_scheme in ("singletons", "no-groups"):
            # Create 'groups' of one item each, re-using the body ID as the group id.
            # (The difference between 'singletons' and 'no-groups' is in how the mesh is stored, below.)
            grouped_body_ids_and_meshes = body_ids_and_meshes.map(
                lambda id_mesh: (id_mesh[0], [(id_mesh[0], id_mesh[1])]))

        persist_and_execute(
            grouped_body_ids_and_meshes,
            f"Grouping meshes with scheme: '{grouping_scheme}'", logger)
        unpersist(body_ids_and_meshes)
        del body_ids_and_meshes

        with Timer() as timer:
            grouped_body_ids_and_meshes.foreachPartition(
                partial(post_meshes_to_dvid, config))
        logger.info(f"Writing meshes to DVID took {timer.seconds}")
Example #15
    def execute(self):
        self._init_services()
        self._sanitize_config()

        options = self.config_data["options"]

        output_service = self.output_service
        logger.info(f"Output bounding box: {output_service.bounding_box_zyx[:,::-1]}")

        # Data is processed in Z-slabs
        slab_depth = options["slices-per-slab"]

        input_bb_zyx = self.input_service.bounding_box_zyx
        _, slice_start_y, slice_start_x = input_bb_zyx[0]

        slab_shape_zyx = input_bb_zyx[1] - input_bb_zyx[0]
        slab_shape_zyx[0] = slab_depth

        slice_shape_zyx = slab_shape_zyx.copy()
        slice_shape_zyx[0] = 1

        # This grid outlines the slabs -- each grid box is a full slab
        slab_grid = Grid(slab_shape_zyx, (0, slice_start_y, slice_start_x))
        slab_boxes = list(clipped_boxes_from_grid(input_bb_zyx, slab_grid))

        for slab_index, slab_box_zyx in enumerate(slab_boxes):
            # Construct BrickWall from input bricks
            num_threads = num_worker_nodes() * cpus_per_worker()
            slab_voxels = np.prod(slab_box_zyx[1] - slab_box_zyx[0])
            voxels_per_thread = slab_voxels / num_threads

            bricked_slab_wall = BrickWall.from_volume_service(self.input_service, 0, slab_box_zyx, self.sc, voxels_per_thread / 2)

            # Force download
            bricked_slab_wall.persist_and_execute(f"Downloading slab {slab_index}/{len(slab_boxes)}: {slab_box_zyx[:,::-1]}", logger)
            
            # Remap to slice-sized "bricks"
            sliced_grid = Grid(slice_shape_zyx, offset=slab_box_zyx[0])
            sliced_slab_wall = bricked_slab_wall.realign_to_new_grid( sliced_grid )
            sliced_slab_wall.persist_and_execute(f"Assembling slab {slab_index}/{len(slab_boxes)} slices", logger)

            # Discard original bricks
            bricked_slab_wall.unpersist()
            del bricked_slab_wall

            def write_slice(brick):
                assert (brick.physical_box == brick.logical_box).all()
                output_service.write_subvolume(brick.volume, brick.physical_box[0])

            # Export to PNG or TIFF, etc. (automatic via slice path extension)
            with Timer() as timer:
                logger.info(f"Exporting slab {slab_index}/{len(slab_boxes)}", extra={"status": f"Exporting {slab_index}/{len(slab_boxes)}"})
                rt.foreach( write_slice, sliced_slab_wall.bricks )
            logger.info(f"Exporting slab {slab_index}/{len(slab_boxes)} took {timer.timedelta}",
                        extra={"status": f"Done: {slab_index}/{len(slab_boxes)}"})
            
            # Discard slice data
            sliced_slab_wall.unpersist()
            del sliced_slab_wall

        logger.info(f"DONE exporting {len(slab_boxes)} slabs.", extra={'status': "DONE"})