def main():
    """
    Command-line entry point.

    Fetches the connectome matching ``status='Traced', cropped=False`` from
    a neuprint server, runs infer_hierarchy() on it, and pickles the results
    into an export directory named ``{dataset}-w{min_weight}-from-{init}``
    (created if it does not exist).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--neuprint-server', '-n', default='neuprint.janelia.org')
    parser.add_argument('--dataset', '-d')
    # infer_hierarchy() asserts that init is 'groundtruth' or 'random'.
    # Without a default, omitting the flag passed None along, which only
    # failed after the connectome had already been fetched.
    parser.add_argument('--init', '-i', choices=['groundtruth', 'random'], default='groundtruth')
    parser.add_argument('--verbose', '-v', action='store_true')
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--min-weight', '-w', default=10, type=int)
    args = parser.parse_args()

    c = Client(args.neuprint_server, args.dataset)
    export_dir = f"{c.dataset}-w{args.min_weight}-from-{args.init}"
    os.makedirs(export_dir, exist_ok=True)

    # Fetch connectome (and export)
    with Timer("Fetching/exporting connectome", logger):
        criteria = NC(status='Traced', cropped=False, client=c)
        neuron_df, roi_conn_df = fetch_adjacencies(criteria, criteria,
                                                   min_total_weight=args.min_weight,
                                                   export_dir=export_dir,
                                                   properties=['type', 'instance'],
                                                   client=c)
        # Sum the weights so there is one row per (pre, post) body pair
        conn_df = roi_conn_df.groupby(['bodyId_pre', 'bodyId_post'], as_index=False)['weight'].sum()

    strong_connections_df, g, nbs, partition_df = infer_hierarchy(neuron_df, conn_df,
                                                                  args.min_weight, args.init,
                                                                  args.verbose, args.debug)

    # Output file name -> object to pickle (written in this order)
    results = {
        'graph.pkl': g,
        'nested-block-state.pkl': nbs,
        'partition_df.pkl': partition_df,
        'strong_connections_df.pkl': strong_connections_df,
    }
    with Timer("Exporting inference results", logger):
        for filename, obj in results.items():
            # Use a context manager so each file is flushed and closed promptly
            with open(f'{export_dir}/{filename}', 'wb') as f:
                pickle.dump(obj, f)

    logger.info("DONE")
def execute(self):
    """
    Process the input volume slab by slab.

    The input bounding box is cut into slabs along the configured
    "slab-axis".  Slabs that start before "starting-slice" on that axis are
    skipped.  For every remaining slab, each scale from 0 through
    "max-pyramid-scale" is handed to _process_slab(), which receives the
    previous scale's result as its starting point.
    """
    self._init_services()
    self._validate_config()

    cfg = self.config["copygrayscale"]
    volume_box_zyx = self.input_service.bounding_box_zyx

    min_scale = cfg["min-pyramid-scale"]
    max_scale = cfg["max-pyramid-scale"]
    first_slice = cfg["starting-slice"]
    axis = 'zyx'.index(cfg["slab-axis"])

    slab_boxes = list(slabs_from_box(volume_box_zyx, cfg["slab-depth"], slab_cutting_axis=axis))
    num_slabs = len(slab_boxes)
    logger.info(f"Processing volume in {num_slabs} slabs")

    for slab_index, slab_box_zyx in enumerate(slab_boxes):
        # Boxes are logged in XYZ order
        slab_box_xyz = slab_box_zyx[:, ::-1].tolist()

        if slab_box_zyx[0, axis] < first_slice:
            logger.info(f"Slab {slab_index}: SKIPPING. {slab_box_xyz}")
            continue

        with Timer() as slab_timer:
            logger.info(f"Slab {slab_index}: STARTING. {slab_box_xyz}")

            # Each scale is computed from the wall returned for the previous scale
            wall = None
            for scale in range(max_scale + 1):
                with Timer() as scale_timer:
                    wall = self._process_slab(scale, slab_box_zyx, slab_index, num_slabs, wall, min_scale)
                logger.info(f"Slab {slab_index}: Scale {scale} took {scale_timer.timedelta}")

        logger.info(f"Slab {slab_index}: DONE. ({slab_timer.timedelta})",
                    extra={'status': f"DONE with slab {slab_index}"})

    logger.info(f"DONE exporting {num_slabs} slabs")
def main():
    """
    Repair the label indexes of every body that has a supervoxel inside a
    hard-coded 1024^3 box of the production segmentation.

    The supervoxels in the box are collected in parallel, mapped to their
    bodies, and then each body's index is repaired in parallel.
    """
    # Hard-coded parameters
    server = 'emdata4:8900'
    seg_instance = (server, find_master(server), 'segmentation')

    # I accidentally corrupted the labelindex of bodies in this region
    box_corners = np.array([[0, 0, 0], [1024, 1024, 1024]])
    patch_box = 20480 + box_corners

    with Timer("Fetching supervoxels", logger):
        grid_boxes = boxes_from_grid(patch_box, Grid((64, 64, 6400)), clipped=True)
        fetch = partial(_fetch_svs, seg_instance)
        sv_sets = compute_parallel(fetch, grid_boxes, processes=32, ordered=False, leave_progress=True)

        # Merge the per-box results; label 0 is not a supervoxel
        svs = set(chain.from_iterable(sv_sets))
        svs.discard(0)
        bodies = set(fetch_mapping(*seg_instance, svs))

    with Timer(f"Repairing {len(bodies)} labelindexes", logger):
        repair = partial(_repair_index, seg_instance)
        compute_parallel(repair, bodies, processes=32, ordered=False, leave_progress=True)

    print("DONE.")
def sv_to_mesh(server, uuid, instance, sv, smoothing_iterations=0, simplification_fraction=1.0, max_box_volume=DEFAULT_MAX_BOUNDING_BOX_VOL):
    """
    Fetch the mask of a single supervoxel and turn it into a mesh.

    The mask is requested with a bounding-box volume limit of
    ``max_box_volume``, so it may come back at a scale other than 0.
    The mesh is built against the box rescaled to scale 0, then smoothed
    and decimated.

    Returns:
        The resulting Mesh.
    """
    with Timer("Fetching supervoxel mask", logger):
        mask, scale, scaled_box = fetch_supervoxel_mask(server, uuid, instance, sv, max_box_volume)
        # Express the mask's box in scale-0 coordinates
        fullres_box = scaled_box * (2**scale)

    with Timer(f"Generating mesh from scale {scale}", logger):
        mesh = Mesh.from_binary_vol(mask, fullres_box)

    with Timer(f"Smoothing ({smoothing_iterations})", logger):
        mesh.laplacian_smooth(smoothing_iterations)

    # A mask fetched at a lower resolution already yields fewer vertices,
    # so relax the decimation accordingly (never keeping more than 100%).
    scale_compensation = (2**scale)**2
    simplification_fraction = min(1.0, simplification_fraction * scale_compensation)

    with Timer(f"Decimating ({simplification_fraction})", logger):
        mesh.simplify(simplification_fraction, in_memory=True)

    num_vertices = len(mesh.vertices_zyx)
    num_faces = len(mesh.faces)
    logger.info(f"Mesh has {num_vertices} vertices and {num_faces} faces")
    return mesh
def load_stats_h5_to_records(h5_path):
    """
    Load the 'stats' dataset of a block segment statistics HDF5 file.

    The dataset's dtype is expected to match STATS_DTYPE, except that the
    leading 'body_id' column may be absent.  In that case the result still
    has a 'body_id' column, filled with a copy of 'segment_id'.

    Returns:
        (block_sv_stats, presorted_by, agglomeration_path)

        block_sv_stats:
            ndarray with dtype=STATS_DTYPE
        presorted_by:
            The dataset's 'presorted-by' attribute ('segment_id' or
            'body_id'), or None if the attribute is not present.
        agglomeration_path:
            The dataset's 'agglomeration-mapping-path' attribute if the
            stats are presorted by 'body_id', otherwise None.
    """
    with h5py.File(h5_path, 'r') as f:
        dset = f['stats']
        num_rows = len(dset)
        file_has_body_id = 'body_id' in dset.dtype.names

        with Timer(f"Allocating RAM for {num_rows} block stats rows", logger):
            block_sv_stats = np.empty(dset.shape, dtype=STATS_DTYPE)

        if file_has_body_id:
            dest_view = block_sv_stats
        else:
            # View the same memory as (first column, remaining columns), so the
            # file's rows can be written straight into the remaining columns.
            split_dtype = [('body_col', [STATS_DTYPE[0]]),
                           ('other_cols', STATS_DTYPE[1:])]
            dest_view = block_sv_stats.view(split_dtype)['other_cols']

        with Timer("Loading block stats into RAM", logger):
            h5_batch_size = 1_000_000
            for start in range(0, num_rows, h5_batch_size):
                stop = min(start + h5_batch_size, num_rows)
                dest_view[start:stop] = dset[start:stop]

        if not file_has_body_id:
            block_sv_stats['body_id'] = block_sv_stats['segment_id']

        if 'presorted-by' in dset.attrs:
            presorted_by = dset.attrs['presorted-by']
            assert presorted_by in ('segment_id', 'body_id')
        else:
            presorted_by = None

        if presorted_by == 'body_id':
            agglomeration_path = dset.attrs['agglomeration-mapping-path']
        else:
            agglomeration_path = None

    return block_sv_stats, presorted_by, agglomeration_path
def init_brickwall(self, volume_service, subset_labels, roi):
    """
    Build the BrickWall for the given volume service.

    If ``roi["name"]`` is set, the wall is restricted by a SparseBlockMask
    built from that ROI's mask.  Otherwise, if ``subset_labels`` is
    non-empty, the service is asked for a sparse mask of those labels (no
    mask is used if the service raises NotImplementedError).  With neither,
    no mask is used.

    Note: missing ``roi["server"]`` / ``roi["uuid"]`` entries are filled in
    from the volume service, modifying ``roi`` in place.
    """
    sbm = None
    if roi["name"]:
        base_service = volume_service.base_service

        if not (roi["server"] and roi["uuid"]):
            assert isinstance(base_service, DvidVolumeService), \
                "Since you aren't using a DVID input source, you must specify the ROI server and uuid."
            roi["server"] = (roi["server"] or volume_service.server)
            roi["uuid"] = (roi["uuid"] or volume_service.uuid)

        if roi["scale"] is not None:
            scale = roi["scale"]
        elif isinstance(volume_service, ScaledVolumeService):
            scale = volume_service.scale_delta
            assert scale <= 5, \
                "The 'roi' option doesn't support volumes downscaled beyond level 5"
        else:
            scale = 0

        # Width of one ROI (scale-5) voxel, measured in voxels of the input service
        roi_voxel_width = 2**(5-scale)

        brick_shape = volume_service.preferred_message_shape
        assert not (brick_shape % roi_voxel_width).any(), \
            "If using an ROI, select a brick shape that is divisible by 32"

        seg_box = round_box(volume_service.bounding_box_zyx, roi_voxel_width)
        seg_box_s0 = seg_box * 2**scale
        seg_box_s5 = seg_box // roi_voxel_width

        with Timer(f"Fetching mask for ROI '{roi['name']}' ({seg_box_s0[:, ::-1].tolist()})", logger):
            roi_mask_s5, _ = fetch_roi(roi["server"], roi["uuid"], roi["name"], format='mask', mask_box=seg_box_s5)

        # SBM 'full-res' corresponds to the input service voxels, not necessarily scale-0.
        sbm = SparseBlockMask(roi_mask_s5, seg_box, roi_voxel_width)

    elif subset_labels:
        try:
            sbm = volume_service.sparse_block_mask_for_labels([*subset_labels])
            empty_axes = (sbm.box[1] - sbm.box[0]) == 0
            if empty_axes.any():
                raise RuntimeError("Could not find sparse masks for any of the subset-labels")
        except NotImplementedError:
            sbm = None

    with Timer("Initializing BrickWall", logger):
        # Aim for 2 GB RDD partitions when loading segmentation
        GB = 2**30
        bytes_per_voxel = np.uint64().nbytes
        target_partition_size_voxels = 2 * GB // bytes_per_voxel

        # Apply halo WHILE downloading the data.
        # TODO: Allow the user to configure whether or not the halo should
        #       be fetched from the outset, or added after the blocks are loaded.
        halo = self.config["connectedcomponents"]["halo"]
        brickwall = BrickWall.from_volume_service(volume_service, 0, None, self.client,
                                                  target_partition_size_voxels, halo, sbm,
                                                  compression='lz4_2x')

    return brickwall
def infer_hierarchy(neuron_df, connection_df, min_weight=10, init='groundtruth', verbose=True, special_debug=False):
    """
    Infer a nested block model from the strong connections of a connectome.

    Args:
        neuron_df:
            Neuron table (anything accepted by load_table()).
            Must have columns 'bodyId', 'instance', 'type'.
        connection_df:
            Connection table (anything accepted by load_table()).
            Must have columns 'bodyId_pre', 'bodyId_post', 'weight'.
            If it also has a 'roi' column, weights are summed per body pair.
        min_weight:
            Connections with weight below this value are discarded.
        init:
            'groundtruth' to seed the inference with a partition derived from
            assign_morpho_indexes(), or 'random' to pass no initial partition.
        verbose:
            Passed through to the graph-tool inference call.
        special_debug:
            If True, keep only every 100th neuron (and the connections
            among those neurons) to make a very small test problem.

    Returns:
        (strong_connections_df, g, nbs, partition_df)
    """
    ##
    ## TODO: If filtering connections for min_weight drops some neurons entirely, they should be removed from neuron_df
    ##
    lsf_slots = os.environ.get('LSB_DJOB_NUMPROC', default=0)
    if lsf_slots:
        os.environ['OMP_NUM_THREADS'] = lsf_slots
        logger.info(f"Using {lsf_slots} CPUs for OpenMP")

    assert init in ('groundtruth', 'random')
    neuron_df = load_table(neuron_df)
    connection_df = load_table(connection_df)

    assert {*neuron_df.columns} >= {'bodyId', 'instance', 'type'}
    assert {*connection_df.columns} >= {'bodyId_pre', 'bodyId_post', 'weight'}

    if special_debug:
        # Choose a very small subset of the data
        neuron_df = neuron_df.iloc[::100]
        bodies = neuron_df['bodyId']
        connection_df = connection_df.query('bodyId_pre in @bodies and bodyId_post in @bodies')

    if init == "groundtruth":
        with Timer("Computing initial hierarchy from groundtruth", logger):
            assign_morpho_indexes(neuron_df)
            num_morpho_groups = neuron_df.morpho_index.max()+1
            # Two initial levels: one block per morpho group, then a single block on top
            init_bs = [neuron_df['morpho_index'].values, np.zeros(num_morpho_groups, dtype=int)]
    else:
        init_bs = None

    # If this is a per-ROI table, sum up the ROIs.
    if 'roi' in connection_df:
        connection_df = connection_df.groupby(['bodyId_pre', 'bodyId_post'], as_index=False)['weight'].sum()

    strong_connections_df = connection_df.query('weight >= @min_weight')
    strong_bodies = pd.unique(strong_connections_df[['bodyId_pre', 'bodyId_post']].values.reshape(-1))
    weights = strong_connections_df.set_index(['bodyId_pre', 'bodyId_post'])['weight']
    logger.info(f"Strong connectome (cutoff={min_weight}) has {len(strong_bodies)} bodies and {len(weights)} edges")

    # Graph vertexes are consecutive integers; keep mappers in both directions
    vertexes = np.arange(len(strong_bodies), dtype=np.uint32)
    vertex_mapper = LabelMapper(strong_bodies.astype(np.uint64), vertexes)
    vertex_reverse_mapper = LabelMapper(vertexes, strong_bodies.astype(np.uint64))

    g = construct_graph(weights, vertexes, vertex_mapper)

    # Pass the module logger, consistent with the other Timer in this function
    with Timer("Running inference", logger):
        # Computes a NestedBlockState
        nbs = graph_tool.inference.minimize_nested_blockmodel_dl(g,
                                                                 bs=init_bs,
                                                                 mcmc_args=dict(parallel=True),  # see graph-tool docs and mailing list for caveats
                                                                 deg_corr=True,
                                                                 verbose=verbose)

    partition_df = construct_partition_table(nbs, neuron_df, vertexes, vertex_reverse_mapper)
    return strong_connections_df, g, nbs, partition_df
def execute(self):
    """
    Compute block statistics for every brick of the input BrickWall,
    concatenate them into a single DataFrame, and write it out via
    write_block_stats().
    """
    input_wall = self.init_brickwall()

    # Blocks are cubes of the configured width
    block_width = self.config["input"]["geometry"]["block-width"]
    block_shape = [block_width] * 3

    def stats_for_brick(brick):
        return block_stats_for_volume(block_shape, brick.volume, brick.physical_box)

    with Timer("Computing block stats", logger):
        per_brick_stats = input_wall.bricks.map(stats_for_brick).compute()

    with Timer("Concatenating block stats", logger):
        stats_df = pd.concat(per_brick_stats, ignore_index=True)

    with Timer("Writing block stats", logger):
        self.write_block_stats(stats_df)
def main():
    """
    Command-line entry point: load an assignment JSON file, pass it through
    adjust_focused_points() for the given labelmap instance, and write the
    result as JSON.

    If --output is not given, the result is written next to the input file,
    with '-adjusted' inserted before the file extension.
    """
    configure_default_logging()

    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--output", "-o", type=str, required=False)
    parser.add_argument('dvid_server')
    parser.add_argument('uuid')
    parser.add_argument('labelmap_instance')
    parser.add_argument('assignment_json')
    args = parser.parse_args()

    input_path = args.assignment_json
    output_path = args.output
    if output_path is None:
        stem, extension = os.path.splitext(input_path)
        output_path = f"{stem}-adjusted{extension}"

    with Timer(f"Processing {input_path}", logger):
        with open(input_path, 'r') as f:
            assignment_data = ujson.load(f)

        new_assignment_data = adjust_focused_points(args.dvid_server, args.uuid, args.labelmap_instance,
                                                    assignment_data)

        with open(output_path, 'w') as f:
            ujson.dump(new_assignment_data, f, indent=2)

    logger.info(f"Wrote to {output_path}")
def _fetch_mito_mask(mito_src, body_mask, mask_box, body_block_corners, scale, mito_min_size, mito_scale_offset):
    """
    Fetch the mito classification volume and return a boolean mask of the
    mito voxels that lie within ``body_mask``.

    ``mito_src`` is either a VolumeService (only recognized when flyemflows
    is available) or a 3-tuple of strings passed to
    fetch_labelmap_specificblocks().

    Connected components of the body's mito mask whose voxel count does not
    exceed ``mito_min_size`` are erased from the result.
    """
    assert scale - mito_scale_offset >= 0, \
        "FIXME: need to upsample the mito seg if using scale 0.  Not implemented yet."

    with Timer("Fetching mito mask", logger):
        use_volume_service = _have_flyemflows and isinstance(mito_src, VolumeService)
        if use_volume_service:
            mito_seg = mito_src.get_subvolume(mask_box, scale)
        else:
            assert len(mito_src) == 3 and all(isinstance(s, str) for s in mito_src)
            mito_seg = fetch_labelmap_specificblocks(*mito_src, body_block_corners,
                                                     scale - mito_scale_offset,
                                                     supervoxels=True, threads=4)

    # Lookup table from mito class to mask value:
    # mito classes 1,2,3 are valid;
    # mito mask class 4 means "empty", as does 0.
    class_to_mask = np.array([0, 1, 1, 1, 0], np.uint8)
    mito_mask = class_to_mask[mito_seg]

    # Keep only the mito voxels inside the body, then label the components
    body_mito_mask = np.where(body_mask, mito_mask, 0)
    body_mito_mask = vigra.taggedView(body_mito_mask, 'zyx')
    body_mito_cc = labelMultiArrayWithBackground(body_mito_mask)

    # Erase small mitos from body_mito_mask
    cc_sizes = np.bincount(body_mito_cc.reshape(-1))
    cc_sizes[0] = 0  # component 0 is background
    is_big_enough = cc_sizes > mito_min_size
    body_mito_mask = is_big_enough[body_mito_cc]
    return body_mito_mask
def persist_and_execute(bag, description=None, logger=None, optimize_graph=True):
    """
    Persist the given dask.Bag and force its evaluation by counting its items.

    If both ``logger`` and ``description`` are given, a message is logged
    before and after, the latter reporting the item count, the partition
    count, a histogram of partition sizes, and the elapsed time.

    Returns:
        The persisted Bag.
    """
    assert isinstance(bag, Bag)

    should_log = bool(logger and description)
    if should_log:
        logger.info(f"{description}...")

    with Timer() as timer:
        bag = bag.persist(optimize_graph=optimize_graph)
        count = bag.count().compute()  # force eval
        parts = bag.npartitions

        # One item count per partition
        partition_counts = bag.map_partitions(lambda items: [sum(1 for _ in items)]).compute()

        # Partition size -> number of partitions with that size
        histogram = {}
        for size in partition_counts:
            histogram[size] = histogram.get(size, 0) + 1

    if should_log:
        logger.info(f"{description} (N={count}, P={parts}, P_hist={histogram}) took {timer.timedelta}")

    return bag
def execute(self):
    """
    Process every pyramid scale from the starting scale down to
    "min-pyramid-scale" (inclusive), one scale at a time.

    The "resume-at" options select where to start: a scale of -1 means
    "start at max-pyramid-scale", and the batch index applies only to the
    first scale processed.
    """
    self._init_services()
    self._sanitize_config()
    self._init_stats_file()

    options = self.config["masksegmentation"]
    min_scale = options["min-pyramid-scale"]
    max_scale = options["max-pyramid-scale"]

    resume_at = options["resume-at"]
    resumed_scale = resume_at["scale"]
    resumed_batch = resume_at["batch-index"]

    is_fresh_start = (resumed_scale == -1 and resumed_batch == 0)
    if not is_fresh_start:
        logger.info(f"Resuming at scale {resumed_scale} batch {resumed_batch}")

    starting_scale = max_scale if resumed_scale == -1 else resumed_scale

    mask_s5, mask_box_s5 = self._init_mask()

    # Process in reverse-order, since it's convenient to check the
    # low-res scales while the higher ones are still processing.
    for scale in range(starting_scale, min_scale - 1, -1):
        # Only the first scale can skip batches; later scales start from batch 0
        if scale == starting_scale:
            starting_batch = max(0, resumed_batch)
        else:
            starting_batch = 0

        with Timer(f"Scale {scale}: Processing", logger):
            self._execute_scale(scale, starting_batch, mask_s5, mask_box_s5)
def run(self, kill_cluster=True):
    """
    Call the subclass's execute() function inside the workflow's
    startup/shutdown contexts: a timer, the local resource manager, the
    cluster context (stored as ``self.cc``), the configured environment
    variables, and the worker daemons.
    """
    logger.info(f"Working dir: {os.getcwd()}")

    # The execute() function is run within these nested contexts.
    # See contexts.py
    config = self.config
    workflow_name = config['workflow-name']
    cluster_type = config["cluster-type"]
    max_wait = config["cluster-max-wait"]

    # If you're trying to debug a C++ Python extension with AddressSanitizer,
    # uncomment this function call.
    # See developer-examples/ASAN_NOTES.txt for details.
    # self._preload_asan_mac()

    # Contexts are entered top to bottom and exited in reverse
    with Timer(f"Running {workflow_name} with {self.num_workers} workers", logger):
        with LocalResourceManager(self.config["resource-manager"]):
            with ClusterContext(cluster_type, self.num_workers, True, max_wait, not kill_cluster) as self.cc:
                with environment_context(self.config["environment-variables"], self):
                    with WorkerDaemons(self):
                        self.execute()
def filter_groups_for_min_boi_count(edges_df, bois, group_columns=('group_cc',), min_boi_count=2):
    """
    Group the given dataframe according to the columns listed in
    `group_columns`, and count how many BOIs exist in each group.
    Then drop rows from the original dataframe if the group they
    belong to didn't have enough BOIs.

    Args:
        edges_df:
            DataFrame with columns 'label_a', 'label_b',
            and every column named in `group_columns`.
        bois:
            Iterable of the body IDs of interest.
        group_columns:
            List or tuple of column names that define a group.
        min_boi_count:
            Groups whose BOI count is below this value are dropped.
            The count is the sum, over the group's rows, of how many of
            (label_a, label_b) are BOIs, so a BOI that appears in several
            rows is counted once per appearance.

    Returns:
        The rows of edges_df which belong to a kept group.
    """
    with Timer("Filtering out groups with too few BOIs", logger):
        bois = np.fromiter(bois, np.uint64)
        bois.sort()

        assert isinstance(group_columns, (list, tuple))
        # pandas interprets a tuple passed to groupby() as a single key,
        # so always work with a list of column names.
        group_columns = list(group_columns)

        boi_counts_df = edges_df[['label_a', 'label_b', *group_columns]].copy()
        boi_counts_df['is_boi_a'] = boi_counts_df.eval('label_a in @bois')
        boi_counts_df['is_boi_b'] = boi_counts_df.eval('label_b in @bois')
        boi_counts_df['boi_count'] = (boi_counts_df['is_boi_a'].astype(int)
                                      + boi_counts_df['is_boi_b'].astype(int))

        group_boi_counts = boi_counts_df.groupby(group_columns)['boi_count'].agg('sum')
        # Remember how many groups there were before filtering, for the log message
        num_groups = len(group_boi_counts)

        group_boi_counts = group_boi_counts[group_boi_counts >= min_boi_count]
        kept_groups_df = group_boi_counts.reset_index()[[*group_columns]]
        logger.info(f"Keeping {len(kept_groups_df)} groups ({group_columns}) out of {num_groups}")

        # The inner merge keeps only the rows whose group survived
        edges_df = edges_df.merge(kept_groups_df, 'inner', on=group_columns)

    return edges_df
def append_group_ccs(edges_df, subset_groups, max_distance=None):
    """
    For the given edges_df, assign a group to each edge
    (duplicating edges if they belong to multiple groups),
    and return the cc id as a new column 'group_cc'.

    The CC operation is performed on all groups at once,
    using disjoint sets of node IDs for every group.
    Thus, the CC ids for each group do NOT start at 1.
    Rather, the values in group_cc are arbitrary and not even consecutive.

    Args:
        edges_df:
            DataFrame with (at least) columns 'label_a' and 'label_b'.
            A 'distance' column is required only if max_distance is given.
        subset_groups:
            DataFrame with columns 'label' and 'group'.
            Its index is used as the source of node IDs.
        max_distance:
            If provided, exclude edges that exceed this distance from the
            CC computation (but include them in the resulting dataframe).
            For such excluded edges, group_cc == -1.

    Returns:
        (edges_df, subset_groups), where edges_df has new columns for the
        group and 'group_cc', and subset_groups is a copy of the input's
        ['label', 'group'] columns with 'node_id' and 'group_cc' appended.
    """
    with Timer("Computing group_cc", logger):
        edges_df = append_group_col(edges_df, subset_groups)

        # Assign a unique id for every label/group combination,
        # so we can run CC on the whole set at once.
        # Labels that appear more than once (in different groups)
        # will be treated as independent nodes,
        # and there will be no edges between groups.
        #
        # Note: Assigning node IDs this way assumes subset-requirement == 2
        subset_groups = subset_groups[['label', 'group']].copy()
        subset_groups['node_id'] = subset_groups.index.astype(np.uint32)

        # Append columns for [node_id_a, node_id_b]
        edges_df = (edges_df.merge(subset_groups, 'left',
                                   left_on=['label_a', 'group'],
                                   right_on=['label', 'group'])
                            .drop('label', axis=1))

        # The second merge collides on 'node_id', which the suffixes
        # turn into 'node_id_a' and 'node_id_b'.
        edges_df = (edges_df.merge(subset_groups, 'left',
                                   left_on=['label_b', 'group'],
                                   right_on=['label', 'group'],
                                   suffixes=['_a', '_b'])
                            .drop('label', axis=1))

        # Drop edges that are too distant to consider for CC
        if max_distance is None:
            thresholded_edges = edges_df[['node_id_a', 'node_id_b']].values
        else:
            thresholded_edges = edges_df.query('distance <= @max_distance')[['node_id_a', 'node_id_b']].values

        # Compute CC on the entire edge set, yielding a unique id for every CC in each group
        group_cc = 1 + connected_components_nonconsecutive(thresholded_edges, subset_groups['node_id'].values)
        subset_groups['group_cc'] = group_cc.astype(np.int32)

        # Append group_cc to every row.
        # All edges we actually used will have the same group_cc for node_id_a/node_id_b,
        # so just use node_id_a as the lookup.
        edges_df = edges_df.merge(subset_groups[['node_id', 'group_cc']], 'left',
                                  left_on='node_id_a', right_on='node_id')
        edges_df = edges_df.drop(['node_id_a', 'node_id_b', 'node_id'], axis=1)

        # But edges that were NOT used might be part of two different components.
        # group_cc has no valid value for those rows. Set to -1.
        edges_df['group_cc'] = edges_df['group_cc'].astype(np.int32)

        # Without a max_distance, no edges were excluded, so there is nothing
        # to invalidate (and no 'distance' column is needed at all).
        if max_distance is not None:
            edges_df.loc[edges_df['distance'] > max_distance, 'group_cc'] = np.int32(-1)

    return edges_df, subset_groups
def _execute_scale(self, scale, starting_batch, mask_s5, mask_box_s5):
    """
    Process one pyramid scale, in batches of bricks.

    The bounding box of the input (scaled down to ``scale`` and rounded out
    to the output block width) is tiled with bricks.  Bricks whose portion
    of the scale-5 mask is entirely empty are discarded.  The remaining
    bricks are grouped into batches of the configured "batch-size", and
    each batch from ``starting_batch`` onward is passed to _execute_batch().

    Args:
        scale:
            The pyramid scale to process.
        starting_batch:
            Index of the first batch to process; earlier batches are skipped.
        mask_s5:
            The mask volume, at scale 5.
        mask_box_s5:
            The box of mask_s5; its first row is used as the mask's offset.
    """
    options = self.config["masksegmentation"]
    block_width = self.output_service.block_width

    def scale_box(box, scale):
        # Scale down, then round up to the nearest multiple of the block width
        box = np.ceil(box / 2**scale).astype(np.int32)
        return round_box(box, block_width)

    # bounding box of the segmentation at the current scale.
    bounding_box = scale_box(self.input_service.bounding_box_zyx, scale)

    # Don't make bricks that are wider than the bounding box at this scale
    brick_shape = np.minimum(self.input_service.preferred_message_shape, bounding_box[1])
    assert not (brick_shape % block_width).any()

    brick_boxes = boxes_from_grid(bounding_box, brick_shape, clipped=True)

    with Timer(f"Scale {scale}: Preparing bricks", logger):
        boxes_and_masks = []
        for box in brick_boxes:
            # The brick's box in scale-5 coordinates, relative to the mask's own offset
            mask_block_box = ((box // 2**(5 - scale)) - mask_box_s5[0])
            mask_block_box = mask_block_box.astype(np.int32)  # necessary when scale is > 5
            mask_block_s5 = extract_subvol(mask_s5, mask_block_box)
            if mask_block_s5.any():
                boxes_and_masks.append((box, mask_block_s5))

    batches = [*iter_batches(boxes_and_masks, options["batch-size"])]

    if starting_batch == 0:
        logger.info(f"Scale {scale}: Processing {len(batches)} batches")
    else:
        logger.info(f"Scale {scale}: Processing {len(batches) - starting_batch} "
                    f"remaining batches from {len(batches)} original batches")

        assert starting_batch < len(batches), \
            f"Can't start at batch {starting_batch}; there are only {len(batches)} in total."
        batches = batches[starting_batch:]

    # Number the batches as if none had been skipped
    for batch_index, batch_boxes_and_masks in enumerate(batches, start=starting_batch):
        with Timer(f"Scale {scale}: Batch {batch_index:02d}", logger):
            self._execute_batch(scale, batch_index, batch_boxes_and_masks)
def sort_block_stats(block_sv_stats, segment_to_body_df=None, output_path=None, agglo_mapping_path=None):
    """
    Sorts the block stats by body ID, IN-PLACE.
    If segment_to_body_df is given, the body_id column is overwritten with mapped IDs.
    If output_path is given, save the sorted result to an hdf5 file.

    block_sv_stats:
        numpy structured array of blockwise supervoxel counts, with dtype:
        ['body_id', 'segment_id', 'z', 'y', 'x', 'count'].

    segment_to_body_df:
        If loading an agglomeration, must be a 2-column DataFrame, mapping supervoxel-to-body.
        If loading unagglomerated supervoxels, set to None (identity mapping is used).

    output_path:
        If given, sorted result will be saved as hdf5 to this file,
        with the internal dataset name 'stats'.
        The dataset's 'presorted-by' attribute is set to 'segment_id' if
        segment_to_body_df is None, and to 'body_id' otherwise.

    agglo_mapping_path:
        A path indicating where the segment_to_body_df was loaded from.
        It's saved to the hdf5 attributes for provenance tracking.
        Required if both output_path and segment_to_body_df are given.
    """
    with Timer("Assigning body IDs", logger):
        _overwrite_body_id_column(block_sv_stats, segment_to_body_df)

    with Timer(f"Sorting {len(block_sv_stats)} block stats", logger):
        block_sv_stats.sort(order=['body_id', 'z', 'y', 'x', 'segment_id', 'count'])

    if output_path:
        # Pass the module logger, consistent with the other timers in this function
        with Timer(f"Saving sorted stats to {output_path}", logger), h5py.File(output_path, 'w') as f:
            f.create_dataset('stats', data=block_sv_stats, chunks=True)
            if segment_to_body_df is None:
                f['stats'].attrs['presorted-by'] = 'segment_id'
            else:
                assert agglo_mapping_path
                f['stats'].attrs['presorted-by'] = 'body_id'
                f['stats'].attrs['agglomeration-mapping-path'] = agglo_mapping_path
def init_boxes(self, volume_service, subset_labels, roi):
    """
    Return the brick boxes to process for the given volume service.

    If ``roi`` is given, only bricks that intersect that ROI's mask are
    returned.  Otherwise, if ``subset_labels`` is non-empty, the service is
    asked for a sparse mask of those labels and only bricks intersecting it
    are returned.  If there is no mask (including when the service raises
    NotImplementedError for subset labels), the whole bounding box is tiled
    with bricks, clipped to the bounding box.

    Raises:
        RuntimeError:
            If the sparse mask for the subset labels has an empty box.
    """
    sbm = None

    # Needed by both the ROI path and the final sparse_boxes() call,
    # so it must be defined regardless of which branch runs below.
    brick_shape = volume_service.preferred_message_shape

    if roi:
        base_service = volume_service.base_service
        assert isinstance(base_service, DvidVolumeService), \
            "Can't specify an ROI unless you're using a dvid input"

        assert isinstance(volume_service, (ScaledVolumeService, DvidVolumeService)), \
            "The 'roi' option doesn't support adapters other than 'rescale-level'"
        scale = 0
        if isinstance(volume_service, ScaledVolumeService):
            scale = volume_service.scale_delta
            assert scale <= 5, \
                "The 'roi' option doesn't support volumes downscaled beyond level 5"

        server, uuid, _seg_instance = base_service.instance_triple

        assert not (brick_shape % 2**(5-scale)).any(), \
            "If using an ROI, select a brick shape that is divisible by 32"

        seg_box = volume_service.bounding_box_zyx
        seg_box = round_box(seg_box, brick_shape)
        seg_box_s0 = seg_box * 2**scale
        seg_box_s5 = seg_box // 2**(5 - scale)

        with Timer(f"Fetching mask for ROI '{roi}' ({seg_box_s0[:, ::-1].tolist()})", logger):
            roi_mask_s5, _ = fetch_roi(server, uuid, roi, format='mask', mask_box=seg_box_s5)

        # SBM 'full-res' corresponds to the input service voxels, not necessarily scale-0.
        sbm = SparseBlockMask.create_from_highres_mask(roi_mask_s5, 2**(5 - scale), seg_box, brick_shape)
    elif subset_labels:
        try:
            sbm = volume_service.sparse_block_mask_for_labels([*subset_labels])
            if ((sbm.box[1] - sbm.box[0]) == 0).any():
                raise RuntimeError("Could not find sparse masks for any of the subset-labels")
        except NotImplementedError:
            sbm = None

    if sbm is None:
        boxes = boxes_from_grid(volume_service.bounding_box_zyx,
                                volume_service.preferred_message_shape,
                                clipped=True)
        return np.array([*boxes])
    else:
        return sbm.sparse_boxes(brick_shape)
def main():
    """
    Command-line entry point for erase_from_labelindexes().

    The source and destination are the same labelmap instance on the same
    server, at two different UUIDs.  The total run time is logged at the end.
    """
    configure_default_logging()
    initialize_excepthook()
    logger.setLevel(logging.INFO)

    threads_help = 'How many threads to use when ingesting label indexes (does not currently apply to mappings)'
    processes_help = 'How many processes to use when ingesting label indexes (does not currently apply to mappings)'
    batch_help = 'Data is grouped in batches to the server. This is the batch size, as measured in ROWS of data to be processed for each batch.'
    stats_help = f'An HDF5 file with a single dataset "stats", with dtype: {STATS_DTYPE[1:]} (Note: No column for body_id)'

    parser = argparse.ArgumentParser()
    parser.add_argument('--last-mutid', '-i', required=False, type=int)
    parser.add_argument('--num-threads', '-t', default=0, type=int, help=threads_help)
    parser.add_argument('--num-processes', '-p', default=0, type=int, help=processes_help)
    parser.add_argument('--batch-size', '-b', default=100_000, type=int, help=batch_help)
    parser.add_argument('server')
    parser.add_argument('src_uuid')
    parser.add_argument('dest_uuid')
    parser.add_argument('labelmap_instance')
    parser.add_argument('supervoxel_block_stats_h5',
                        nargs='?',  # not required if only ingesting mapping
                        help=stats_help)
    args = parser.parse_args()

    with Timer() as timer:
        # Same server and instance; only the UUID differs
        src_info, dest_info = (
            (args.server, uuid, args.labelmap_instance)
            for uuid in (args.src_uuid, args.dest_uuid)
        )
        erase_from_labelindexes(src_info, dest_info,
                                args.supervoxel_block_stats_h5,
                                args.batch_size,
                                threads=args.num_threads,
                                processes=args.num_processes,
                                last_mutid=args.last_mutid)

    logger.info(f"DONE. Total time: {timer.timedelta}")
def write_stats(stats_df, output_path, logger=None):
    """
    Pickle the given statistics DataFrame to ``output_path``.

    The extension '.pkl.xz' is appended to the path if it is not already
    there.  If no logger is given, this module's logger is used.
    """
    if logger is None:
        logger = logging.getLogger(__name__)

    extension = '.pkl.xz'
    if not output_path.endswith(extension):
        output_path = output_path + extension

    # In-memory size of the table, for the log message
    stats_gb = stats_df.memory_usage().sum() / 1e9

    with Timer("Saving segment statistics", logger):
        logger.info(f"Writing stats ({stats_gb:.3f} GB) to {output_path}")
        stats_df.to_pickle(output_path)
def init_boxes(self, volume_service, roi):
    """
    Return the brick boxes to process for the given volume service.

    Without an ROI name, the whole bounding box is tiled with bricks of the
    service's preferred message shape.  With an ROI name, the ROI mask is
    fetched and only the bricks that intersect it are returned, clipped to
    the service's (unrounded) bounding box.

    Note: missing ``roi["server"]`` / ``roi["uuid"]`` entries are filled in
    from the volume service, modifying ``roi`` in place.
    """
    if not roi["name"]:
        grid_boxes = boxes_from_grid(volume_service.bounding_box_zyx,
                                     volume_service.preferred_message_shape,
                                     clipped=True)
        return np.array(list(grid_boxes))

    base_service = volume_service.base_service

    if not (roi["server"] and roi["uuid"]):
        assert isinstance(base_service, DvidVolumeService), \
            "Since you aren't using a DVID input source, you must specify the ROI server and uuid."
        roi["server"] = (roi["server"] or volume_service.server)
        roi["uuid"] = (roi["uuid"] or volume_service.uuid)

    if roi["scale"] is not None:
        scale = roi["scale"]
    elif isinstance(volume_service, ScaledVolumeService):
        scale = volume_service.scale_delta
        assert scale <= 5, \
            "The 'roi' option doesn't support volumes downscaled beyond level 5"
    else:
        scale = 0

    # Width of one ROI (scale-5) voxel, measured in voxels of the input service
    roi_voxel_width = 2**(5 - scale)

    brick_shape = volume_service.preferred_message_shape
    assert not (brick_shape % roi_voxel_width).any(), \
        "If using an ROI, select a brick shape that is divisible by 32"

    seg_box = round_box(volume_service.bounding_box_zyx, roi_voxel_width)
    seg_box_s0 = seg_box * 2**scale
    seg_box_s5 = seg_box // roi_voxel_width

    with Timer(f"Fetching mask for ROI '{roi['name']}' ({seg_box_s0[:, ::-1].tolist()})", logger):
        roi_mask_s5, _ = fetch_roi(roi["server"], roi["uuid"], roi["name"], format='mask', mask_box=seg_box_s5)

    # SBM 'full-res' corresponds to the input service voxels, not necessarily scale-0.
    sbm = SparseBlockMask(roi_mask_s5, seg_box, roi_voxel_width)
    boxes = sbm.sparse_boxes(brick_shape)

    # Clip boxes to the true (not rounded) bounding box
    box_starts = np.maximum(boxes[:, 0], volume_service.bounding_box_zyx[0])
    boxes[:, 0] = box_starts
    box_stops = np.minimum(boxes[:, 1], volume_service.bounding_box_zyx[1])
    boxes[:, 1] = box_stops
    return boxes
def __init__(self, graph, nbs, partition_df, neuron_df, strong_connections_df):
    """
    Load the inference results and initialize the browser.

    ``graph`` and ``nbs`` are passed through load_pickle(); the three
    tables are passed through load_table().  partition_df must have a
    column for every level 0..num_levels-1 (plus at least one other
    column), and must not have a column named num_levels.
    """
    with Timer("Loading data", logger):
        graph = load_pickle(graph)
        nbs = load_pickle(nbs)
        partition_df = load_table(partition_df)
        neuron_df = load_table(neuron_df)
        strong_connections_df = load_table(strong_connections_df)

    # One more level than the number of entries in the state's block list
    num_levels = 1 + len(nbs.get_bs())

    level_columns = set(range(num_levels))
    assert set(partition_df.columns) > level_columns
    assert num_levels not in partition_df, \
        "partition_df does not match NestedBlockState levels"

    self.graph = graph
    self.nbs = nbs
    self.num_levels = num_levels
    self.partition_df = partition_df
    self.neuron_df = neuron_df
    self.strong_connections_df = strong_connections_df

    with Timer("Initialzing browser", logger):
        self._initialize()
def init_boxes(self, volume_service, roi):
    """
    Return the brick boxes to process for the given volume service.

    Without an ROI, the whole bounding box is tiled with bricks of the
    service's preferred message shape.  With an ROI (which requires a DVID
    base service), the ROI mask is fetched from the base service's server
    and uuid, and only the bricks that intersect it are returned, clipped
    to the service's (unrounded) bounding box.
    """
    if not roi:
        grid_boxes = boxes_from_grid(volume_service.bounding_box_zyx,
                                     volume_service.preferred_message_shape,
                                     clipped=True)
        return np.array(list(grid_boxes))

    base_service = volume_service.base_service
    assert isinstance(base_service, DvidVolumeService), \
        "Can't specify an ROI unless you're using a dvid input"

    assert isinstance(volume_service, (ScaledVolumeService, DvidVolumeService)), \
        "The 'roi' option doesn't support adapters other than 'rescale-level'"

    if isinstance(volume_service, ScaledVolumeService):
        scale = volume_service.scale_delta
        assert scale <= 5, \
            "The 'roi' option doesn't support volumes downscaled beyond level 5"
    else:
        scale = 0

    server, uuid, _seg_instance = base_service.instance_triple

    # Width of one ROI (scale-5) voxel, measured in voxels of the input service
    roi_voxel_width = 2**(5 - scale)

    brick_shape = volume_service.preferred_message_shape
    assert not (brick_shape % roi_voxel_width).any(), \
        "If using an ROI, select a brick shape that is divisible by 32"

    seg_box = round_box(volume_service.bounding_box_zyx, roi_voxel_width)
    seg_box_s0 = seg_box * 2**scale
    seg_box_s5 = seg_box // roi_voxel_width

    with Timer(f"Fetching mask for ROI '{roi}' ({seg_box_s0[:, ::-1].tolist()})", logger):
        roi_mask_s5, _ = fetch_roi(server, uuid, roi, format='mask', mask_box=seg_box_s5)

    # SBM 'full-res' corresponds to the input service voxels, not necessarily scale-0.
    sbm = SparseBlockMask(roi_mask_s5, seg_box, roi_voxel_width)
    boxes = sbm.sparse_boxes(brick_shape)

    # Clip boxes to the true (not rounded) bounding box
    box_starts = np.maximum(boxes[:, 0], volume_service.bounding_box_zyx[0])
    boxes[:, 0] = box_starts
    box_stops = np.minimum(boxes[:, 1], volume_service.bounding_box_zyx[1])
    boxes[:, 1] = box_stops
    return boxes
def execute(self):
    """
    Copy/downsample the segmentation, one Z-slab at a time.

    The output bounding box is cut into slabs of the configured
    "slab-depth", and each slab is passed to _process_slab().  If
    "delay-minutes-between-slabs" is positive, the loop sleeps that long
    after every slab except the last.

    Side effects:
        Sets self.target_partition_size_voxels and
        self.translation_offset_zyx.
    """
    self._init_services()
    self._init_masks()
    self._log_neuroglancer_links()
    self._sanitize_config()

    # Aim for 2 GB RDD partitions when loading segmentation
    GB = 2**30
    self.target_partition_size_voxels = 2 * GB // np.uint64().nbytes

    # (See note in _init_services() regarding output bounding boxes)
    input_bb_zyx = self.input_service.bounding_box_zyx
    output_bb_zyx = self.output_service.bounding_box_zyx
    self.translation_offset_zyx = output_bb_zyx[0] - input_bb_zyx[0]
    if self.translation_offset_zyx.any():
        # The offset is a single 1-D coordinate (not a 2-row box),
        # so it is reversed to XYZ with [::-1], not [:, ::-1].
        logger.info(f"Translation offset is {self.translation_offset_zyx[::-1].tolist()}")

    options = self.config["copysegmentation"]
    pyramid_depth = options["pyramid-depth"]
    slab_depth = options["slab-depth"]
    delay_minutes = options["delay-minutes-between-slabs"]

    # Process data in Z-slabs
    output_slab_boxes = list(slabs_from_box(output_bb_zyx, slab_depth))
    max_depth = max(box[1][0] - box[0][0] for box in output_slab_boxes)
    logger.info(f"Processing data in {len(output_slab_boxes)} slabs (max depth={max_depth}) for {pyramid_depth} pyramid levels")

    if options["compute-block-statistics"]:
        self._init_stats_file()

    # Read data and accumulate statistics, one slab at a time.
    last_slab_index = len(output_slab_boxes) - 1
    for slab_index, output_slab_box in enumerate(output_slab_boxes):
        with Timer() as timer:
            self._process_slab(slab_index, output_slab_box)
        logger.info(f"Slab {slab_index}: Total processing time: {timer.timedelta}")

        # No need to wait after the final slab
        if delay_minutes > 0 and slab_index != last_slab_index:
            logger.info(f"Delaying {delay_minutes} before continuing to next slab...")
            time.sleep(delay_minutes * 60)

    logger.info(f"DONE copying/downsampling all slabs")
def run(self, kill_cluster=True):
    """
    Entry point for running the workflow.

    Logs the working directory, then invokes the subclass's ``execute()``
    inside a stack of startup/shutdown contexts (see contexts.py).
    ``kill_cluster`` is passed, negated, as the third argument of
    ``WorkflowClusterContext``.
    """
    logger.info(f"Working dir: {os.getcwd()}")

    workflow_name = self.config['workflow-name']
    timer_msg = f"Running {workflow_name} with {self.num_workers} workers"

    # Each context is constructed and entered in this order, and exited
    # in reverse order once execute() returns or raises.
    with Timer(timer_msg, logger):
        with LocalResourceManager(self.config["resource-manager"]):
            with WorkflowClusterContext(self, True, not kill_cluster):
                with environment_context(self.config["environment-variables"], self):
                    with WorkerDaemons(self):
                        self.execute()
def init_boxes(self, volume_service, roi, chunk_shape_s0):
    """
    Compute the bounding boxes that tile the given ROI.

    The volume service's scale 0 is expected to be full-res data,
    i.e. 32x finer than the ROI resolution.

    Args:
        volume_service:
            Service whose ``bounding_box_zyx`` is tiled.
        roi:
            Dict with keys ``"name"``, ``"server"`` and ``"uuid"``.
            If ``"name"`` is falsy, the whole bounding box is tiled.
            Missing ``"server"``/``"uuid"`` entries are filled in from
            the volume service (the dict is modified in place).
        chunk_shape_s0:
            Shape of each tile; must be divisible by 32 when an ROI is used.

    Returns:
        Array of boxes, clipped to the service's true bounding box.
    """
    if not roi["name"]:
        grid_boxes = boxes_from_grid(volume_service.bounding_box_zyx, chunk_shape_s0, clipped=True)
        return np.array(list(grid_boxes))

    base_service = volume_service.base_service

    if not (roi["server"] and roi["uuid"]):
        assert isinstance(base_service, DvidVolumeService), \
            "Since you aren't using a DVID input source, you must specify the ROI server and uuid."

    # NOTE(review): the assertion above checks base_service, but the defaults
    # are read from volume_service itself -- confirm that wrapping services
    # expose .server/.uuid.
    roi["server"] = roi["server"] or volume_service.server
    roi["uuid"] = roi["uuid"] or volume_service.uuid

    # One ROI voxel spans 2**5 full-res voxels.
    roi_voxel_width = 2**5

    assert not (chunk_shape_s0 % roi_voxel_width).any(), \
        "If using an ROI, select a chunk shape that is divisible by 32"

    seg_box_s0 = round_box(volume_service.bounding_box_zyx, roi_voxel_width)
    seg_box_s5 = seg_box_s0 // roi_voxel_width

    with Timer(f"Fetching mask for ROI '{roi['name']}' ({seg_box_s0[:, ::-1].tolist()})", logger):
        roi_mask_s5, _ = fetch_roi(roi["server"], roi["uuid"], roi["name"], format='mask', mask_box=seg_box_s5)

    # The mask is registered against the rounded box, in the
    # coordinates of the input service's voxels.
    block_mask = SparseBlockMask(roi_mask_s5, seg_box_s0, roi_voxel_width)
    roi_boxes = block_mask.sparse_boxes(chunk_shape_s0)

    # Undo the effect of rounding: clip to the true bounding box.
    true_lower, true_upper = volume_service.bounding_box_zyx
    roi_boxes[:, 0] = np.maximum(roi_boxes[:, 0], true_lower)
    roi_boxes[:, 1] = np.minimum(roi_boxes[:, 1], true_upper)
    return roi_boxes
def init_brickwall(self):
    """
    Build the BrickWall of input bricks that overlap the configured mask labels.

    Creates the input and mask volume services from the config, loads the
    ``mask-labels`` list, asks the mask service for a sparse block mask
    covering those labels, and constructs a BrickWall from the input
    service restricted to that mask. Missing portions of bricks are then
    filled in on a grid of the input's block width.

    Returns:
        The padded BrickWall.

    Raises:
        AssertionError:
            If the two services use different brick shapes, or a brick
            shape is not a multiple of its service's block width.
        RuntimeError:
            If no mask labels were given, or the resulting sparse mask
            has an empty bounding box.
    """
    input_config = self.config["input"]
    mask_input_config = self.config["mask-input"]
    mgr_config = self.config["resource-manager"]
    options = self.config["sparseblockstats"]

    resource_mgr_client = ResourceManagerClient(mgr_config["server"], mgr_config["port"])
    input_service = VolumeService.create_from_config(input_config, resource_mgr_client)
    mask_service = VolumeService.create_from_config(mask_input_config, resource_mgr_client)

    assert (input_service.preferred_message_shape == mask_service.preferred_message_shape).all(), \
        "This workflow assumes that the input and the mask-input use the same brick grid."

    assert not (input_service.preferred_message_shape % input_service.block_width).any(), \
        "input brick grid spacing must be a multiple of the input's block-width"

    # This check is against the mask service's own block width,
    # so the message names the mask-input (not the input).
    assert not (mask_service.preferred_message_shape % mask_service.block_width).any(), \
        "mask brick grid spacing must be a multiple of the mask-input's block-width"

    is_supervoxels = False
    if isinstance(mask_service.base_service, DvidVolumeService):
        is_supervoxels = mask_service.base_service.supervoxels

    # Load body list and eliminate duplicates
    subset_labels = set(load_body_list(options["mask-labels"], is_supervoxels))

    if not subset_labels:
        raise RuntimeError("You didn't specify any mask subset labels. \n"
                           "If you want to compute block stats for an entire segmentation volume, use the CopySegmentation workflow.")

    sbm = mask_service.sparse_block_mask_for_labels(subset_labels)
    if ((sbm.box[1] - sbm.box[0]) == 0).any():
        raise RuntimeError("Could not find sparse masks for any of the mask-labels")

    with Timer("Initializing BrickWall", logger):
        # Aim for 2 GB RDD partitions when loading segmentation
        GB = 2**30
        target_partition_size_voxels = 2 * GB // np.uint64().nbytes
        brickwall = BrickWall.from_volume_service(input_service, 0, None, self.client, target_partition_size_voxels, 0, sbm)

        # Pad if necessary to ensure that all fetched bricks are block-aligned
        block_shape = 3*(input_service.block_width,)
        brickwall = brickwall.fill_missing(input_service.get_subvolume, Grid(block_shape))

    return brickwall
def compute_fragment_edges(edges_df, bois, processes=0):
    """
    For each edge group, search for paths that can connect the BOIs in the group.
    Each path is a "fragment", a.k.a. "task".

    Return a new edge DataFrame, where each edge is associated with a group
    and a fragment within that group, indicated by group_cc and cc_task,
    respectively.

    Args:
        edges_df:
            An edge table as described in extract_assignment_fragments(),
            with the additional requirement that the table is in
            "normalized" form, i.e. label_a < label_b.
        bois:
            List of BOIs
        processes:
            Passed through to extract_fragments().

    Returns:
        DataFrame with columns label_a, label_b, group_cc, cc_task, followed
        by the remaining columns of edges_df (left-merged onto the fragment
        edges). If no fragment edges were found, the frame has zero rows.
    """
    fragments = extract_fragments(edges_df, bois, processes)

    with Timer("Extracting edges for each fragment from full table", logger):
        # NOTE: the query string refers to the local variable 'fragments' by name.
        edges_df = edges_df.query('group_cc in @fragments.keys()')

        cc_col = []
        task_col = []
        frag_edges = []
        for group_cc, group_fragments in fragments.items():
            for task_index, frag in enumerate(group_fragments):
                # A path of N nodes contributes N-1 consecutive edges.
                path_edges = list(zip(frag[:-1], frag[1:]))
                frag_edges.extend(path_edges)
                cc_col.extend([group_cc] * len(path_edges))
                task_col.extend([task_index] * len(path_edges))

        # reshape() keeps the array two-dimensional even when there are no
        # edges at all; otherwise the empty array is 1-D and both
        # sort(axis=1) and the two-column DataFrame construction fail.
        frag_cols = np.array(frag_edges, dtype=np.uint64).reshape(-1, 2)

        # Normalize each edge so that label_a <= label_b, matching edges_df.
        frag_cols.sort(axis=1)

        fragment_edges_df = pd.DataFrame(frag_cols, columns=['label_a', 'label_b'])
        fragment_edges_df['group_cc'] = cc_col
        fragment_edges_df['cc_task'] = task_col

        fragment_edges_df = fragment_edges_df.merge(
            edges_df, 'left', ['group_cc', 'label_a', 'label_b'])

    return fragment_edges_df
def process_and_save(body):
    """
    Process a single body and write its results to disk.

    Fetches the body's synapses and mito IDs, computes per-tbar results
    while stdout is redirected to ``body-logs/{body}.log``, and, if any
    results were produced, writes them to ``{output_dir}/{body}.csv`` and
    ``{output_dir}/{body}.pkl``.

    ``body_svc``, ``mito_svc`` and ``output_dir`` are not parameters; they
    are resolved from a scope outside this function.

    Returns:
        Tuple ``(body, count, status, seconds)`` where ``status`` is one of
        ``'no-synapses'``, ``'no-mitos'`` or ``'success'``, and ``seconds``
        is read from the Timer that wraps the processing step.
    """
    tbars = _fetch_synapses(body)
    valid_mitos = _fetch_mito_ids(body)

    # TODO:
    #   Does the stdout_redirected() mechanism work correctly in the context of multiprocessing?
    #   If not, I should probably just use a custom logging handler instead.
    with open(f"body-logs/{body}.log", "w") as f, stdout_redirected(f), Timer() as timer:
        processed_tbars = []
        if len(tbars) == 0:
            logging.getLogger(__name__).warning(
                f"Body {body}: No synapses found")

        if len(valid_mitos) == 0:
            logging.getLogger(__name__).warning(
                f"Body {body}: Failed to fetch mito supervoxels")
            # NOTE(review): assumed to belong to the 'no valid mitos' branch
            # (otherwise the empty-list default above would be dead code) --
            # confirm against the original indentation.
            processed_tbars = initialize_results(body, tbars)

        # Distances are only measured when both inputs are non-empty.
        if len(valid_mitos) and len(tbars):
            processed_tbars = measure_tbar_mito_distances(
                body_svc, mito_svc, body, tbars=tbars, valid_mitos=valid_mitos)

    # NOTE(review): saving is assumed to happen after the log/timer context
    # has exited -- confirm against the original indentation.
    if len(processed_tbars) > 0:
        processed_tbars.to_csv(f'{output_dir}/{body}.csv', header=True, index=False)
        # Note: 'f' here rebinds the name used for the log file above.
        with open(f'{output_dir}/{body}.pkl', 'wb') as f:
            pickle.dump(processed_tbars, f)

    if len(tbars) == 0:
        return (body, 0, 'no-synapses', timer.seconds)

    if len(valid_mitos) == 0:
        return (body, len(processed_tbars), 'no-mitos', timer.seconds)

    return (body, len(tbars), 'success', timer.seconds)
def _fetch_body_mito_seg(mito_src, body_mask, mask_box, scale, valid_mito_mapper, logger):
    """
    Fetch the mito segmentation for ``mask_box`` and keep only the mitos
    that belong to the body.

    If ``valid_mito_mapper`` is provided, the result is simply the fetched
    segmentation passed through ``valid_mito_mapper.apply_with_default()``.
    Otherwise, a mito is kept only if more than half of its voxels (within
    the fetched volume) fall where ``body_mask == 2``.

    Args:
        mito_src:
            VolumeService to obtain mito segmentation
        body_mask:
            Volume with labels 1+2 as described in _fetch_body_mask().
            Only voxels labeled 2 are used for the overlap test.
        mask_box:
            Box to fetch, passed to ``mito_src.get_subvolume()``.
        scale:
            Scale to fetch, passed to ``mito_src.get_subvolume()``.
        valid_mito_mapper:
            LabelMapper that keeps only valid mitos when its
            apply_with_default() method is called. May be falsy.
        logger:
            Logger given to the Timer that reports the fetch.

    Returns:
        The filtered mito segmentation volume.
    """
    with Timer("Fetching mito segmentation", logger):
        assert _have_flyemflows and isinstance(mito_src, VolumeService)
        mito_seg = mito_src.get_subvolume(mask_box, scale)

    if valid_mito_mapper:
        return valid_mito_mapper.apply_with_default(mito_seg)

    # Mito voxels restricted to the body's core; everything else becomes 0.
    in_core = (body_mask == 2)
    overlap_seg = np.where(in_core, mito_seg, 0)

    # Due to downsampling discrepancies between the mito seg and neuron seg,
    # mito from neighboring neurons may slightly overlap this neuron.
    # Keep only mitos which have more of their voxels in the body mask than not.
    #
    # FIXME:
    #   This heuristic fails at the volume edge, where we might see just
    #   part of the mito.
    #   Need to overwrite small mitos on the volume edge with FACE_MARKER
    #   to indicate that they can't be trusted, and if such a mito is
    #   the "winning" mito, then we need to try a different search config.
    overlap_counts = pd.Series(overlap_seg.ravel()).value_counts()
    del overlap_seg  # release the temporary volume before the next count

    total_counts = pd.Series(mito_seg.ravel()).value_counts()
    total_counts, overlap_counts = total_counts.align(overlap_counts, fill_value=0)

    mostly_inside = overlap_counts > total_counts / 2
    core_mitos = set(total_counts[mostly_inside].index)
    core_mitos.discard(0)  # label 0 is background, never a mito

    return apply_mask_for_labels(mito_seg, core_mitos, inplace=True)