def run_meshlab_script_on_dir(script_name, in_dir, out_dir, suffix,
                              arg_dict={}, n_threads=1):
    paths = glob.glob(in_dir + "/*.obj")
    print(len(paths))

    if len(suffix) > 0:
        suffix = "_{}".format(suffix)

    n_jobs = n_threads * 3
    if len(paths) < n_jobs:
        n_jobs = len(paths)

    path_blocks = np.array_split(paths, n_jobs)

    multi_args = []
    for path_block in path_blocks:
        multi_args.append([script_name, path_block, out_dir, suffix, arg_dict])

    if n_threads == 1:
        mu.multiprocess_func(_run_meshlab_script_on_dir_thread,
                             multi_args, debug=True,
                             verbose=True, n_threads=1)
    else:
        mu.multisubprocess_func(_run_meshlab_script_on_dir_thread,
                                multi_args, n_threads=n_threads)
def create_manifests_for_higher_layers(self, n_threads=1):
    root_id_max = self.cg.get_max_node_id(
        self.cg.get_chunk_id(layer=np.int(self.cg.n_layers),
                             x=np.int(0), y=np.int(0), z=np.int(0)))

    root_id_blocks = np.linspace(1, root_id_max, n_threads * 3).astype(np.int)
    cg_info = self.cg.get_serialized_info()
    del cg_info['credentials']

    multi_args = []
    for i_block in range(len(root_id_blocks) - 1):
        multi_args.append([cg_info, self.cv_path, self.cv_mesh_dir,
                           root_id_blocks[i_block],
                           root_id_blocks[i_block + 1],
                           self.highest_mesh_layer])

    # Run in parallel
    if n_threads == 1:
        mu.multiprocess_func(meshgen._create_manifest_files_thread,
                             multi_args, n_threads=n_threads,
                             verbose=True, debug=n_threads == 1)
    else:
        mu.multisubprocess_func(meshgen._create_manifest_files_thread,
                                multi_args, n_threads=n_threads)
def create_atomic_chunks(im, aff_dtype=np.float32, n_threads=1):
    """ Creates all atomic chunks

    :param im: IngestionManager
    :param aff_dtype: np.dtype
        affinity datatype (np.float32 or np.float64)
    :param n_threads: int
        number of threads to use
    :return:
    """
    im_info = im.get_serialized_info()

    multi_args = []

    # Randomize chunk order
    chunk_coords = list(im.chunk_coord_gen)
    # np.random.shuffle(chunk_coords)

    for i_chunk_coord, chunk_coord in enumerate(chunk_coords):
        multi_args.append([im_info, chunk_coord, aff_dtype, i_chunk_coord,
                           len(chunk_coords)])

    if n_threads == 1:
        mu.multiprocess_func(_create_atomic_chunk, multi_args,
                             n_threads=n_threads, verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_create_atomic_chunk, multi_args,
                                n_threads=n_threads)
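# Hedged usage sketch (hypothetical; assumes an IngestionManager `im` has
# already been configured for the target dataset elsewhere in this package):
#
#   im = IngestionManager(...)   # project-specific setup, not shown here
#   create_atomic_chunks(im, aff_dtype=np.float32, n_threads=16)
#
# With n_threads == 1 the chunks are processed in-process (debug mode);
# otherwise the work is fanned out via mu.multisubprocess_func.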
def rewrite_segmentation(dataset_name, n_threads=64, n_units_per_thread=None):
    if dataset_name == "pinky":
        cv_url = "gs://nkem/pinky40_v11/mst_trimmed_sem_remap/region_graph/"
        from_url = "gs://neuroglancer/pinky40_v11/watershed/"
        to_url = "gs://neuroglancer/svenmd/pinky40_v11/watershed/"
    elif dataset_name == "basil":
        cv_url = "gs://nkem/basil_4k_oldnet/region_graph/"
        from_url = "gs://neuroglancer/ranl/basil_4k_oldnet/ws/"
        to_url = "gs://neuroglancer/svenmd/basil_4k_oldnet_cg/watershed/"
    else:
        raise Exception("Dataset unknown")

    file_paths = np.sort(glob.glob(creator_utils.dir_from_layer_name(
        creator_utils.layer_name_from_cv_url(cv_url)) + "/*rg2cg*"))

    if n_units_per_thread is None:
        file_path_blocks = np.array_split(file_paths, n_threads * 3)
    else:
        n_blocks = int(np.ceil(len(file_paths) / n_units_per_thread))
        file_path_blocks = np.array_split(file_paths, n_blocks)

    multi_args = []
    for fp_block in file_path_blocks:
        multi_args.append([fp_block, from_url, to_url])

    # Run in parallel
    if n_threads == 1:
        mu.multiprocess_func(_rewrite_segmentation_thread, multi_args,
                             n_threads=n_threads, verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_rewrite_segmentation_thread, multi_args,
                                n_threads=n_threads)
def download_meshes(seg_ids, target_dir, cv_path, n_threads=1):
    """ Downloads meshes in target directory (parallel)

    :param seg_ids: list of ints
    :param target_dir: str
    :param cv_path: str
    :param n_threads: int
    """
    n_jobs = n_threads * 3
    if len(seg_ids) < n_jobs:
        n_jobs = len(seg_ids)

    seg_id_blocks = np.array_split(seg_ids, n_jobs)

    multi_args = []
    for seg_id_block in seg_id_blocks:
        multi_args.append([seg_id_block, cv_path, target_dir])

    if n_jobs == 1:
        mu.multiprocess_func(_download_meshes_thread,
                             multi_args, debug=True,
                             verbose=True, n_threads=1)
    else:
        mu.multisubprocess_func(_download_meshes_thread,
                                multi_args, n_threads=n_threads)
def download_meshes(seg_ids, target_dir, cv_path, overwrite=True, n_threads=1,
                    verbose=False, merge_large_components=True,
                    remove_duplicate_vertices=True, map_gs_to_https=True,
                    fmt="hdf5"):
    """ Downloads meshes in target directory (in parallel)

    :param seg_ids: list of uint64s
    :param target_dir: str
    :param cv_path: str
    :param overwrite: bool
    :param n_threads: int
    :param verbose: bool
    :param merge_large_components: bool
    :param remove_duplicate_vertices: bool
    :param fmt: str
        "h5" is highly recommended
    """
    if n_threads > 1:
        n_jobs = n_threads * 3
    else:
        n_jobs = 1

    if len(seg_ids) < n_jobs:
        n_jobs = len(seg_ids)

    seg_id_blocks = np.array_split(seg_ids, n_jobs)

    multi_args = []
    for seg_id_block in seg_id_blocks:
        multi_args.append([seg_id_block, cv_path, target_dir, fmt,
                           overwrite, merge_large_components,
                           remove_duplicate_vertices, map_gs_to_https])

    if n_jobs == 1:
        mu.multiprocess_func(_download_meshes_thread,
                             multi_args, debug=True,
                             verbose=verbose, n_threads=n_threads)
    else:
        mu.multisubprocess_func(_download_meshes_thread,
                                multi_args, n_threads=n_threads,
                                package_name="meshparty", n_retries=40)
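# Hedged usage sketch (all values are placeholders, not real segment IDs or
# buckets; assumes valid CloudVolume/GCS credentials are configured):
#
#   seg_ids = [123456789, 987654321]   # placeholder IDs
#   download_meshes(seg_ids,
#                   target_dir="./meshes",
#                   cv_path="gs://example-bucket/segmentation",
#                   n_threads=4,
#                   fmt="hdf5")
#
# Each block of seg_ids is handed to _download_meshes_thread; with more than
# one thread the blocks run in separate subprocesses with up to 40 retries.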
def rechunk_dataset(dataset_name, block_size=(1024, 1024, 64), n_threads=64,
                    mip=0):
    if dataset_name == "pinky40em":
        from_url = "gs://neuroglancer/pinky40_v11/image_rechunked/"
        to_url = "gs://neuroglancer/svenmd/pinky40_v11/image_512_512_32/"
    elif dataset_name == "pinky100seg":
        from_url = "gs://neuroglancer/nkem/pinky100_v0/ws/lost_no-random/bbox1_0/"
        to_url = "gs://neuroglancer/svenmd/pinky100_v0/ws/lost_no-random/bbox1_0_64_64_16/"
    elif dataset_name == "basil":
        raise NotImplementedError("basil is not supported yet")
    else:
        raise Exception("Dataset unknown")

    from_cv = cloudvolume.CloudVolume(from_url, mip=mip)

    dataset_bounds = np.array(from_cv.bounds.to_list())
    block_size = np.array(list(block_size))
    super_block_size = block_size * 2

    coordinate_iter = itertools.product(
        np.arange(dataset_bounds[0], dataset_bounds[3], super_block_size[0]),
        np.arange(dataset_bounds[1], dataset_bounds[4], super_block_size[1]),
        np.arange(dataset_bounds[2], dataset_bounds[5], super_block_size[2]))
    coordinates = np.array(list(coordinate_iter))

    multi_args = []
    for coordinate in coordinates:
        end_coordinate = coordinate + super_block_size
        m = end_coordinate > dataset_bounds[3:]
        end_coordinate[m] = dataset_bounds[3:][m]

        multi_args.append([coordinate, end_coordinate, block_size, from_url,
                           to_url, mip])

    # Run in parallel
    if n_threads == 1:
        mu.multiprocess_func(_rewrite_image_thread, multi_args,
                             n_threads=n_threads, verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_rewrite_image_thread, multi_args,
                                n_threads=n_threads)
def create_layer(im, layer_id, n_threads=1):
    """ Creates an abstract layer of the chunkedgraph

    Abstract layers have to be built in sequence. Abstract layers are all
    layers above the first layer (1). `create_atomic_chunks` creates layer 2
    as well. Hence, this function is responsible for creating all layers > 2.

    :param im: IngestionManager
    :param layer_id: int > 2
    :param n_threads: int
        number of threads to use
    :return:
    """
    assert layer_id > 2

    child_chunk_coords = im.chunk_coords // im.cg.fan_out ** (layer_id - 3)
    child_chunk_coords = child_chunk_coords.astype(np.int)
    child_chunk_coords = np.unique(child_chunk_coords, axis=0)

    parent_chunk_coords = child_chunk_coords // im.cg.fan_out
    parent_chunk_coords = parent_chunk_coords.astype(np.int)
    parent_chunk_coords, inds = np.unique(parent_chunk_coords, axis=0,
                                          return_inverse=True)

    im_info = im.get_serialized_info()
    multi_args = []

    # Randomize chunk order
    order = np.arange(len(parent_chunk_coords), dtype=np.int)
    np.random.shuffle(order)

    for i_chunk, idx in enumerate(order):
        multi_args.append([im_info, layer_id,
                           child_chunk_coords[inds == idx],
                           i_chunk, len(order)])

    if n_threads == 1:
        mu.multiprocess_func(_create_layer, multi_args,
                             n_threads=n_threads, verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_create_layer, multi_args,
                                n_threads=n_threads)
def get_delta_roots(cg,
                    time_stamp_start: datetime.datetime,
                    time_stamp_end: Optional[datetime.datetime] = None,
                    min_seg_id: int = 1,
                    n_threads: int = 1) -> Sequence[np.uint64]:
    # Create filters: time and id range
    max_seg_id = cg.get_max_seg_id(cg.root_chunk_id) + 1

    n_blocks = int(np.min([n_threads + 1, max_seg_id - min_seg_id + 1]))
    seg_id_blocks = np.linspace(min_seg_id, max_seg_id, n_blocks,
                                dtype=np.uint64)

    cg_serialized_info = cg.get_serialized_info()
    if n_threads > 1:
        del cg_serialized_info["credentials"]

    multi_args = []
    for i_id_block in range(0, len(seg_id_blocks) - 1):
        multi_args.append([seg_id_blocks[i_id_block],
                           seg_id_blocks[i_id_block + 1],
                           cg_serialized_info, time_stamp_start,
                           time_stamp_end])

    # Run in parallel
    if n_threads == 1:
        results = mu.multiprocess_func(_read_delta_root_rows_thread,
                                       multi_args, n_threads=n_threads,
                                       verbose=False, debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_read_delta_root_rows_thread,
                                          multi_args, n_threads=n_threads)

    # Aggregate all the results together
    new_root_ids = []
    expired_root_id_candidates = []
    for r1, r2 in results:
        new_root_ids.extend(r1)
        expired_root_id_candidates.extend(r2)

    expired_root_id_candidates = np.array(expired_root_id_candidates,
                                          dtype=np.uint64)
    # Filter for uniqueness
    expired_root_id_candidates = np.unique(expired_root_id_candidates)

    # Filter out the expired root ids whose creation (measured by the
    # timestamp of their Child links) is after time_stamp_start
    rows = cg.read_node_id_rows(node_ids=expired_root_id_candidates,
                                columns=[column_keys.Hierarchy.Child],
                                end_time=time_stamp_start)
    expired_root_ids = np.array([k for (k, v) in rows.items()],
                                dtype=np.uint64)

    return np.array(new_root_ids, dtype=np.uint64), expired_root_ids
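# Hedged usage sketch (assumes `cg` is an existing ChunkedGraph instance and
# that timezone-aware timestamps are used, matching the rest of the code):
#
#   start = datetime.datetime(2019, 1, 1, tzinfo=datetime.timezone.utc)
#   new_roots, expired_roots = get_delta_roots(cg,
#                                              time_stamp_start=start,
#                                              n_threads=8)
#
# Note that the function returns a (new_root_ids, expired_root_ids) tuple,
# even though the annotation only names a single Sequence.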
def download_and_store_cv_files(dataset_name="basil", n_threads=10,
                                olduint32=False):
    """ Downloads files from google cloud using cloud-volume

    :param dataset_name: str
    :param n_threads: int
    :param olduint32: bool
    """
    if "basil" == dataset_name:
        cv_url = "gs://nkem/basil_4k_oldnet/region_graph/"
    elif "pinky40" == dataset_name:
        cv_url = "gs://nkem/pinky40_v11/mst_trimmed_sem_remap/region_graph/"
    elif "pinky100" == dataset_name:
        cv_url = "gs://nkem/pinky100_v0/region_graph/"
    else:
        raise Exception("Could not identify region graph resource")

    with storage.SimpleStorage(cv_url) as cv_st:
        dir_path = creator_utils.dir_from_layer_name(
            creator_utils.layer_name_from_cv_url(cv_st.layer_path))

        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        file_paths = list(cv_st.list_files())

        file_chunks = np.array_split(file_paths, n_threads * 3)
        multi_args = []
        for i_file_chunk, file_chunk in enumerate(file_chunks):
            multi_args.append([i_file_chunk, cv_url, file_chunk, olduint32])

        # Run in parallel
        if n_threads == 1:
            mu.multiprocess_func(_download_and_store_cv_files_thread,
                                 multi_args, n_threads=n_threads,
                                 verbose=True, debug=n_threads == 1)
        else:
            mu.multisubprocess_func(_download_and_store_cv_files_thread,
                                    multi_args, n_threads=n_threads)
def get_merge_candidates(table_id, save_dir=f"{HOME}/benchmarks/",
                         n_threads=1):
    cg = chunkedgraph.ChunkedGraph(table_id)

    bounds = np.array(cg.cv.bounds.to_list()).reshape(2, -1).T
    bounds -= bounds[:, 0:1]

    chunk_id_bounds = np.ceil((bounds / cg.chunk_size[:, None])).astype(np.int)

    chunk_coord_gen = itertools.product(*[range(*r) for r in chunk_id_bounds])
    chunk_coords = np.array(list(chunk_coord_gen), dtype=np.int)

    order = np.arange(len(chunk_coords))
    np.random.shuffle(order)

    n_blocks = np.min([len(order), n_threads * 3])
    blocks = np.array_split(order, n_blocks)

    cg_serialized_info = cg.get_serialized_info()
    if n_threads > 1:
        del cg_serialized_info["credentials"]

    multi_args = []
    for block in blocks:
        multi_args.append([cg_serialized_info, chunk_coords[block]])

    if n_threads == 1:
        results = mu.multiprocess_func(_get_merge_candidates, multi_args,
                                       n_threads=n_threads, verbose=False,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_get_merge_candidates, multi_args,
                                          n_threads=n_threads)

    merge_edges = []
    merge_edge_weights = []
    for result in results:
        merge_edges.extend(result[0])
        merge_edge_weights.extend(result[1])

    save_folder = f"{save_dir}/{table_id}/"
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    with h5py.File(f"{save_folder}/merge_edge_stats.h5", "w") as f:
        f.create_dataset("merge_edges", data=merge_edges, compression="gzip")
        f.create_dataset("merge_edge_weights", data=merge_edge_weights,
                         compression="gzip")
def mesh_lvl2_previews(cg, lvl2_node_ids, cv_path=None,
                       cv_mesh_dir=None, mip=2, simplification_factor=999999,
                       max_err=40, parallel_download=8, verbose=True,
                       cache_control="no-cache", n_threads=1):
    serialized_cg_info = cg.get_serialized_info()
    del serialized_cg_info["credentials"]

    if not isinstance(lvl2_node_ids, dict):
        lvl2_node_ids = dict(zip(lvl2_node_ids, [None] * len(lvl2_node_ids)))

    mesh_dir = cv_mesh_dir or cg._mesh_dir

    multi_args = []
    for lvl2_node_id in lvl2_node_ids.keys():
        multi_args.append([serialized_cg_info, lvl2_node_id,
                           lvl2_node_ids[lvl2_node_id], cv_path, mesh_dir,
                           mip, simplification_factor, max_err,
                           parallel_download, verbose, cache_control])

    # Run in parallel
    if n_threads == 1:
        mu.multiprocess_func(_mesh_lvl2_previews_threads,
                             multi_args, n_threads=n_threads,
                             verbose=False, debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_mesh_lvl2_previews_threads,
                                multi_args, n_threads=n_threads)
def count_nodes_and_edges(table_id, n_threads=1):
    cg = chunkedgraph.ChunkedGraph(table_id)

    bounds = np.array(cg.cv.bounds.to_list()).reshape(2, -1).T
    bounds -= bounds[:, 0:1]

    chunk_id_bounds = np.ceil((bounds / cg.chunk_size[:, None])).astype(np.int)

    chunk_coord_gen = itertools.product(*[range(*r) for r in chunk_id_bounds])
    chunk_coords = np.array(list(chunk_coord_gen), dtype=np.int)

    order = np.arange(len(chunk_coords))
    np.random.shuffle(order)

    n_blocks = np.min([len(order), n_threads * 3])
    blocks = np.array_split(order, n_blocks)

    cg_serialized_info = cg.get_serialized_info()
    if n_threads > 1:
        del cg_serialized_info["credentials"]

    multi_args = []
    for block in blocks:
        multi_args.append([cg_serialized_info, chunk_coords[block]])

    if n_threads == 1:
        results = mu.multiprocess_func(_count_nodes_and_edges, multi_args,
                                       n_threads=n_threads, verbose=False,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_count_nodes_and_edges, multi_args,
                                          n_threads=n_threads)

    n_edges_per_chunk = []
    n_nodes_per_chunk = []
    for result in results:
        n_nodes_per_chunk.extend(result[0])
        n_edges_per_chunk.extend(result[1])

    return n_nodes_per_chunk, n_edges_per_chunk
def get_latest_roots(cg,
                     time_stamp: Optional[datetime.datetime] = None,
                     n_threads: int = 1) -> Sequence[np.uint64]:
    # Create filters: time and id range
    max_seg_id = cg.get_max_seg_id(cg.root_chunk_id) + 1

    if n_threads == 1:
        n_blocks = 1
    else:
        n_blocks = int(np.min([n_threads * 3 + 1, max_seg_id]))

    seg_id_blocks = np.linspace(1, max_seg_id, n_blocks + 1, dtype=np.uint64)

    cg_serialized_info = cg.get_serialized_info()
    if n_threads > 1:
        del cg_serialized_info["credentials"]

    multi_args = []
    for i_id_block in range(0, len(seg_id_blocks) - 1):
        multi_args.append([seg_id_blocks[i_id_block],
                           seg_id_blocks[i_id_block + 1],
                           cg_serialized_info, time_stamp])

    if n_threads == 1:
        results = mu.multiprocess_func(_read_root_rows_thread,
                                       multi_args, n_threads=n_threads,
                                       verbose=False, debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_read_root_rows_thread,
                                          multi_args, n_threads=n_threads)

    root_ids = []
    for result in results:
        root_ids.extend(result)

    return np.array(root_ids, dtype=np.uint64)
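# Hedged usage sketch (assumes `cg` is an existing ChunkedGraph instance):
#
#   roots_now = get_latest_roots(cg, n_threads=8)
#   roots_then = get_latest_roots(
#       cg,
#       time_stamp=datetime.datetime(2019, 6, 1,
#                                    tzinfo=datetime.timezone.utc),
#       n_threads=8)
#
# Passing a time_stamp is intended to return the roots that were current at
# that point in time; omitting it returns the current roots.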
def family_consistency_test(table_id, n_threads=64):
    """ Runs a simple test on the WHOLE graph

    tests: id in children(parent(id))

    :param table_id: str
    :param n_threads: int
    :return: dict
        n x 2 per layer; each failed pair: (node_id, parent_id)
    """
    cg = chunkedgraph.ChunkedGraph(table_id)

    failed_node_id_dict = {}
    for layer_id in range(1, cg.n_layers):
        print("\n\n Layer %d \n\n" % layer_id)

        step = int(cg.fan_out ** np.max([0, layer_id - 2]))
        coords = list(itertools.product(range(0, 8, step),
                                        range(0, 8, step),
                                        range(0, 4, step)))

        multi_args = []
        for coord in coords:
            multi_args.append([table_id, coord, layer_id])

        collected_failed_node_ids = mu.multisubprocess_func(
            _family_consistency_test_thread, multi_args, n_threads=n_threads)

        failed_node_ids = []
        for _failed_node_ids in collected_failed_node_ids:
            failed_node_ids.extend(_failed_node_ids)

        failed_node_id_dict[layer_id] = np.array(failed_node_ids)

        print("\n%d node rows failed\n" % len(failed_node_ids))

    return failed_node_id_dict
def write_flat_segmentation(cg, dataset_name, bounding_box=None,
                            block_factor=2, n_threads=1, mip=0):
    """ Applies the mapping in the chunkedgraph to the supervoxels to
        create a flattened segmentation

    :param cg: chunkedgraph instance
    :param dataset_name: str
    :param bounding_box: np.array
    :param block_factor: int
    :param n_threads: int
    :param mip: int
    :return: bool
    """
    if dataset_name == "pinky":
        from_url = "gs://neuroglancer/svenmd/pinky40_v11/watershed/"
        to_url = "gs://neuroglancer/svenmd/pinky40_v11/segmentation/"
    elif dataset_name == "basil":
        from_url = "gs://neuroglancer/svenmd/basil_4k_oldnet_cg/watershed/"
        to_url = "gs://neuroglancer/svenmd/basil_4k_oldnet_cg/segmentation/"
    else:
        raise Exception("Dataset unknown")

    from_cv = cloudvolume.CloudVolume(from_url, mip=mip)
    dataset_bounding_box = np.array(from_cv.bounds.to_list())

    block_bounding_box_cg = \
        [np.floor(dataset_bounding_box[:3] / cg.chunk_size).astype(np.int),
         np.ceil(dataset_bounding_box[3:] / cg.chunk_size).astype(np.int)]

    if bounding_box is not None:
        bounding_box_cg = \
            [np.floor(bounding_box[0] / cg.chunk_size).astype(np.int),
             np.ceil(bounding_box[1] / cg.chunk_size).astype(np.int)]

        m = block_bounding_box_cg[0] < bounding_box_cg[0]
        block_bounding_box_cg[0][m] = bounding_box_cg[0][m]

        m = block_bounding_box_cg[1] > bounding_box_cg[1]
        block_bounding_box_cg[1][m] = bounding_box_cg[1][m]

    block_iter = itertools.product(
        np.arange(block_bounding_box_cg[0][0],
                  block_bounding_box_cg[1][0], block_factor),
        np.arange(block_bounding_box_cg[0][1],
                  block_bounding_box_cg[1][1], block_factor),
        np.arange(block_bounding_box_cg[0][2],
                  block_bounding_box_cg[1][2], block_factor))
    blocks = np.array(list(block_iter))

    cg_info = cg.get_serialized_info()

    multi_args = []
    for start_block in blocks:
        end_block = start_block + block_factor
        m = end_block > block_bounding_box_cg[1]
        end_block[m] = block_bounding_box_cg[1][m]

        multi_args.append([cg_info, start_block, end_block, from_url, to_url,
                           mip])

    # Run in parallel
    if n_threads == 1:
        mu.multiprocess_func(_write_flat_segmentation_thread, multi_args,
                             n_threads=n_threads, verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_write_flat_segmentation_thread, multi_args,
                                n_threads=n_threads)
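# Hedged usage sketch (bounding box values are placeholders; assumes `cg` was
# built for the "basil" watershed listed above):
#
#   bbox = np.array([[0, 0, 0], [1024, 1024, 256]])
#   write_flat_segmentation(cg, "basil",
#                           bounding_box=bbox,
#                           block_factor=2,
#                           n_threads=16)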
def create_chunked_graph(table_id=None, cv_url=None, ws_url=None, fan_out=2,
                         bbox=None, chunk_size=(512, 512, 128), verbose=False,
                         n_threads=1):
    """ Creates chunked graph from downloaded files

    :param table_id: str
    :param cv_url: str
    :param ws_url: str
    :param fan_out: int
    :param bbox: [[x_, y_, z_], [_x, _y, _z]]
    :param chunk_size: tuple
    :param verbose: bool
    :param n_threads: int
    """
    if cv_url is None or ws_url is None:
        if "basil" in table_id:
            cv_url = "gs://nkem/basil_4k_oldnet/region_graph/"
            ws_url = "gs://neuroglancer/svenmd/basil_4k_oldnet_cg/watershed/"
        elif "pinky40" in table_id:
            cv_url = "gs://nkem/pinky40_v11/mst_trimmed_sem_remap/region_graph/"
            ws_url = "gs://neuroglancer/svenmd/pinky40_v11/watershed/"
        elif "pinky100" in table_id:
            cv_url = "gs://nkem/pinky100_v0/region_graph/"
            ws_url = "gs://neuroglancer/nkem/pinky100_v0/ws/lost_no-random/bbox1_0/"
        else:
            raise Exception("Could not identify region graph resource")

    times = []
    time_start = time.time()

    chunk_size = np.array(list(chunk_size))

    file_paths = np.sort(glob.glob(creator_utils.dir_from_layer_name(
        creator_utils.layer_name_from_cv_url(cv_url)) + "/*"))

    file_path_blocks = np.array_split(file_paths, n_threads * 3)

    multi_args = []
    for fp_block in file_path_blocks:
        multi_args.append([fp_block, table_id, chunk_size, bbox])

    if n_threads == 1:
        results = mu.multiprocess_func(_preprocess_chunkedgraph_data_thread,
                                       multi_args, n_threads=n_threads,
                                       verbose=True, debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_preprocess_chunkedgraph_data_thread,
                                          multi_args, n_threads=n_threads)

    in_chunk_connected_paths = np.array([])
    in_chunk_connected_ids = np.array([], dtype=np.uint64).reshape(-1, 3)
    in_chunk_disconnected_paths = np.array([])
    in_chunk_disconnected_ids = np.array([], dtype=np.uint64).reshape(-1, 3)
    between_chunk_paths = np.array([])
    between_chunk_ids = np.array([], dtype=np.uint64).reshape(-1, 2, 3)
    isolated_paths = np.array([])
    isolated_ids = np.array([], dtype=np.uint64).reshape(-1, 3)

    for result in results:
        in_chunk_connected_paths = np.concatenate(
            [in_chunk_connected_paths, result[0]])
        in_chunk_connected_ids = np.concatenate(
            [in_chunk_connected_ids, result[1]])
        in_chunk_disconnected_paths = np.concatenate(
            [in_chunk_disconnected_paths, result[2]])
        in_chunk_disconnected_ids = np.concatenate(
            [in_chunk_disconnected_ids, result[3]])
        between_chunk_paths = np.concatenate([between_chunk_paths, result[4]])
        between_chunk_ids = np.concatenate([between_chunk_ids, result[5]])
        isolated_paths = np.concatenate([isolated_paths, result[6]])
        isolated_ids = np.concatenate([isolated_ids, result[7]])

    assert len(in_chunk_connected_ids) == len(in_chunk_connected_paths) == \
        len(in_chunk_disconnected_ids) == len(in_chunk_disconnected_paths) == \
        len(isolated_ids) == len(isolated_paths)

    in_chunk_connected_ids, in_chunk_connected_paths = \
        _sort_arrays(in_chunk_connected_ids, in_chunk_connected_paths)
    in_chunk_disconnected_ids, in_chunk_disconnected_paths = \
        _sort_arrays(in_chunk_disconnected_ids, in_chunk_disconnected_paths)
    isolated_ids, isolated_paths = \
        _sort_arrays(isolated_ids, isolated_paths)

    times.append(["Preprocessing", time.time() - time_start])
    print("Preprocessing took %.3fs = %.2fh" % (times[-1][1],
                                                times[-1][1] / 3600))

    time_start = time.time()

    multi_args = []
    in_chunk_id_blocks = np.array_split(in_chunk_connected_ids,
                                        max(1, n_threads))
    cumsum = 0
    for in_chunk_id_block in in_chunk_id_blocks:
        multi_args.append([between_chunk_ids, between_chunk_paths,
                           in_chunk_id_block, cumsum])
        cumsum += len(in_chunk_id_block)

    # Run in parallel
    if n_threads == 1:
        results = mu.multiprocess_func(_between_chunk_masks_thread,
                                       multi_args, n_threads=n_threads,
                                       verbose=True, debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_between_chunk_masks_thread,
                                          multi_args, n_threads=n_threads)

    times.append(["Data sorting", time.time() - time_start])
    print("Data sorting took %.3fs = %.2fh" % (times[-1][1],
                                               times[-1][1] / 3600))

    time_start = time.time()

    n_layers = int(
        np.ceil(pychunkedgraph.backend.chunkedgraph_utils.log_n(
            np.max(in_chunk_connected_ids) + 1, fan_out))) + 2

    print("N layers: %d" % n_layers)

    cg = chunkedgraph.ChunkedGraph(table_id=table_id,
                                   n_layers=np.uint64(n_layers),
                                   fan_out=np.uint64(fan_out),
                                   chunk_size=np.array(chunk_size,
                                                       dtype=np.uint64),
                                   cv_path=ws_url, is_new=True)

    # Fill lowest layer and create first abstraction layer
    # Create arguments for parallel processing
    multi_args = []
    for result in results:
        offset, between_chunk_paths_out_masked, \
            between_chunk_paths_in_masked = result

        for i_chunk in range(len(between_chunk_paths_out_masked)):
            multi_args.append([table_id,
                               in_chunk_connected_paths[offset + i_chunk],
                               in_chunk_disconnected_paths[offset + i_chunk],
                               isolated_paths[offset + i_chunk],
                               between_chunk_paths_in_masked[i_chunk],
                               between_chunk_paths_out_masked[i_chunk],
                               verbose])

    random.shuffle(multi_args)

    print("%d jobs for creating layer 1 + 2" % len(multi_args))

    # Run in parallel
    if n_threads == 1:
        mu.multiprocess_func(_create_atomic_layer_thread, multi_args,
                             n_threads=n_threads, verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_create_atomic_layer_thread, multi_args,
                                n_threads=n_threads)

    times.append(["Layers 1 + 2", time.time() - time_start])

    # Fill higher abstraction layers
    child_chunk_ids = in_chunk_connected_ids.copy()
    for layer_id in range(3, n_layers + 1):
        time_start = time.time()

        print("\n\n\n --- LAYER %d --- \n\n\n" % layer_id)

        parent_chunk_ids = child_chunk_ids // cg.fan_out
        parent_chunk_ids = parent_chunk_ids.astype(np.int)

        u_pcids, inds = np.unique(parent_chunk_ids, axis=0,
                                  return_inverse=True)

        if len(u_pcids) > n_threads:
            n_threads_per_process = 1
        else:
            n_threads_per_process = int(np.ceil(n_threads / len(u_pcids)))

        multi_args = []
        for ind in range(len(u_pcids)):
            multi_args.append([table_id, layer_id,
                               child_chunk_ids[inds == ind].astype(np.int),
                               n_threads_per_process])

        child_chunk_ids = u_pcids

        # Run in parallel
        if n_threads == 1:
            mu.multiprocess_func(_add_layer_thread, multi_args,
                                 n_threads=n_threads, verbose=True,
                                 debug=n_threads == 1)
        else:
            mu.multisubprocess_func(_add_layer_thread, multi_args,
                                    n_threads=n_threads,
                                    suffix=str(layer_id))

        times.append(["Layer %d" % layer_id, time.time() - time_start])

    for time_entry in times:
        print("%s: %.2fs = %.2fmin = %.2fh" % (time_entry[0], time_entry[1],
                                               time_entry[1] / 60,
                                               time_entry[1] / 3600))
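# Hedged usage sketch (the table name is a placeholder; assumes the
# region-graph files were first fetched with download_and_store_cv_files so
# that the expected directory layout exists on disk):
#
#   download_and_store_cv_files("pinky100", n_threads=10)
#   create_chunked_graph(table_id="pinky100_benchmark_v0",
#                        chunk_size=(512, 512, 128),
#                        n_threads=16)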
def mesh_single_layer(self, layer, bounding_box=None, block_factor=2,
                      n_threads=128):
    assert layer <= self.highest_mesh_layer

    dataset_bounding_box = np.array(self.cv.bounds.to_list())

    block_bounding_box_cg = \
        [np.floor(dataset_bounding_box[:3] / self.cg.chunk_size).astype(np.int),
         np.ceil(dataset_bounding_box[3:] / self.cg.chunk_size).astype(np.int)]

    if bounding_box is not None:
        bounding_box_cg = \
            [np.floor(bounding_box[0] / self.cg.chunk_size).astype(np.int),
             np.ceil(bounding_box[1] / self.cg.chunk_size).astype(np.int)]

        m = block_bounding_box_cg[0] < bounding_box_cg[0]
        block_bounding_box_cg[0][m] = bounding_box_cg[0][m]

        m = block_bounding_box_cg[1] > bounding_box_cg[1]
        block_bounding_box_cg[1][m] = bounding_box_cg[1][m]

    block_bounding_box_cg = np.array(block_bounding_box_cg, dtype=np.float)
    block_bounding_box_cg /= 2 ** np.max([0, layer - 2])
    block_bounding_box_cg = np.ceil(block_bounding_box_cg)

    # Shrink the block factor until there are at least n_threads jobs
    n_jobs = np.product(block_bounding_box_cg[1] -
                        block_bounding_box_cg[0]) / block_factor ** 2

    while n_jobs < n_threads and block_factor > 1:
        block_factor -= 1

        n_jobs = np.product(block_bounding_box_cg[1] -
                            block_bounding_box_cg[0]) / block_factor ** 2

    block_iter = itertools.product(
        np.arange(block_bounding_box_cg[0][0],
                  block_bounding_box_cg[1][0], block_factor),
        np.arange(block_bounding_box_cg[0][1],
                  block_bounding_box_cg[1][1], block_factor),
        np.arange(block_bounding_box_cg[0][2],
                  block_bounding_box_cg[1][2], block_factor))

    blocks = np.array(list(block_iter), dtype=np.int)

    cg_info = self.cg.get_serialized_info()
    del cg_info['credentials']

    multi_args = []
    for start_block in blocks:
        end_block = start_block + block_factor
        m = end_block > block_bounding_box_cg[1]
        end_block[m] = block_bounding_box_cg[1][m]

        multi_args.append([cg_info, start_block, end_block, self.cg.cv_path,
                           self.cv_mesh_dir, self.mesh_mip, layer])

    random.shuffle(multi_args)

    # Run in parallel
    if n_threads == 1:
        mu.multiprocess_func(meshgen._mesh_layer_thread, multi_args,
                             n_threads=n_threads, verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(meshgen._mesh_layer_thread, multi_args,
                                n_threads=n_threads,
                                suffix="%s_%d" % (self.table_id, layer))
def get_root_ids_and_sv_chunks(table_id, save_dir=f"{HOME}/benchmarks/",
                               n_threads=1):
    cg = chunkedgraph.ChunkedGraph(table_id)

    save_folder = f"{save_dir}/{table_id}/"
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    if not os.path.exists(f"{save_folder}/root_ids.h5"):
        root_ids = chunkedgraph_comp.get_latest_roots(cg, n_threads=n_threads)

        with h5py.File(f"{save_folder}/root_ids.h5", "w") as f:
            f.create_dataset("root_ids", data=root_ids)
    else:
        with h5py.File(f"{save_folder}/root_ids.h5", "r") as f:
            root_ids = f["root_ids"].value

    cg_serialized_info = cg.get_serialized_info()
    if n_threads > 1:
        del cg_serialized_info["credentials"]

    order = np.arange(len(root_ids))
    np.random.shuffle(order)

    n_blocks = np.min([len(order), n_threads * 3])
    blocks = np.array_split(order, n_blocks)

    multi_args = []
    for block in blocks:
        multi_args.append([cg_serialized_info, root_ids[block]])

    if n_threads == 1:
        results = mu.multiprocess_func(_get_root_ids_and_sv_chunks,
                                       multi_args, n_threads=n_threads,
                                       verbose=False, debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_get_root_ids_and_sv_chunks,
                                          multi_args, n_threads=n_threads)

    root_ids = []
    n_l1_nodes_per_root = []
    rep_l1_nodes = []
    rep_l1_chunk_ids = []
    for result in results:
        root_ids.extend(result[0])
        n_l1_nodes_per_root.extend(result[1])
        rep_l1_nodes.extend(result[2])
        rep_l1_chunk_ids.extend(result[3])

    with h5py.File(f"{save_folder}/root_stats.h5", "w") as f:
        f.create_dataset("root_ids", data=root_ids, compression="gzip")
        f.create_dataset("n_l1_nodes_per_root", data=n_l1_nodes_per_root,
                         compression="gzip")
        f.create_dataset("rep_l1_nodes", data=rep_l1_nodes,
                         compression="gzip")
        f.create_dataset("rep_l1_chunk_ids", data=rep_l1_chunk_ids,
                         compression="gzip")
def get_merge_split_timings(table_id, save_dir=f"{HOME}/benchmarks/",
                            job_size=500, n_threads=1):
    save_folder = f"{save_dir}/{table_id}/"

    merge_edges, merge_edge_weights = load_merge_stats(save_folder)
    probs = merge_edge_weights / np.sum(merge_edge_weights)

    n_jobs = n_threads * 3

    cg = chunkedgraph.ChunkedGraph(table_id)
    cg_serialized_info = cg.get_serialized_info()
    if n_threads > 0:
        del cg_serialized_info["credentials"]

    time_start = time.time()

    order = np.arange(len(merge_edges))
    np.random.seed(np.int(time.time()))

    replace = False
    blocks = np.random.choice(order, job_size * n_jobs, p=probs,
                              replace=replace).reshape(n_jobs, job_size)

    multi_args = []
    for block in blocks:
        multi_args.append([cg_serialized_info, merge_edges[block]])

    print(f"Building jobs took {time.time() - time_start}s")

    time_start = time.time()
    if n_threads == 1:
        results = mu.multiprocess_func(_get_merge_timings, multi_args,
                                       n_threads=n_threads, verbose=False,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_get_merge_timings, multi_args,
                                          n_threads=n_threads)
    dt = time.time() - time_start

    # Merge timings
    timings = []
    for result in results:
        timings.extend(result[0])

    percentiles = [np.percentile(timings, k) for k in range(1, 100, 1)]
    mean = np.mean(timings)
    std = np.std(timings)
    median = np.median(timings)

    merge_results = {"percentiles": percentiles,
                     "p01": percentiles[0],
                     "p05": percentiles[4],
                     "p95": percentiles[94],
                     "p99": percentiles[98],
                     "mean": mean,
                     "std": std,
                     "median": median,
                     "total_time_s": dt,
                     "job_size": job_size,
                     "n_jobs": n_jobs,
                     "n_threads": n_threads,
                     "replace": replace,
                     "requests_per_s": job_size * n_jobs / dt}

    # Split timings
    timings = []
    for result in results:
        timings.extend(result[1])

    percentiles = [np.percentile(timings, k) for k in range(1, 100, 1)]
    mean = np.mean(timings)
    std = np.std(timings)
    median = np.median(timings)

    split_results = {"percentiles": percentiles,
                     "p01": percentiles[0],
                     "p05": percentiles[4],
                     "p95": percentiles[94],
                     "p99": percentiles[98],
                     "mean": mean,
                     "std": std,
                     "median": median,
                     "total_time_s": dt,
                     "job_size": job_size,
                     "n_jobs": n_jobs,
                     "n_threads": n_threads,
                     "replace": replace,
                     "requests_per_s": job_size * n_jobs / dt}

    return merge_results, split_results
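# Hedged usage sketch (table name is a placeholder; assumes merge-candidate
# stats were produced by get_merge_candidates for the same table, since
# load_merge_stats reads from that save folder):
#
#   get_merge_candidates("pinky100_benchmark_v0", n_threads=16)
#   merge_res, split_res = get_merge_split_timings("pinky100_benchmark_v0",
#                                                  job_size=500, n_threads=16)
#   print(merge_res["median"], split_res["median"])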