import glob
import os
import random
import time

import numpy as np

from cloudvolume import storage

# Project-local imports (assumed; adjust to the actual package layout):
# creator_utils provides the layer-name/path helpers used below, and `mu`
# wraps the multiprocess/multisubprocess helpers.
from pychunkedgraph.backend import chunkedgraph
import pychunkedgraph.backend.chunkedgraph_utils
from pychunkedgraph.creator import creator_utils
from multiwrapper import multiprocessing_utils as mu


def check_stored_cv_files(dataset_name="basil"):
    """ Tests if all files were downloaded

    :param dataset_name: str
    """
    if dataset_name == "basil":
        cv_url = "gs://nkem/basil_4k_oldnet/region_graph/"
    elif dataset_name == "pinky40":
        cv_url = "gs://nkem/pinky40_v11/mst_trimmed_sem_remap/region_graph/"
    elif dataset_name == "pinky100":
        cv_url = "gs://nkem/pinky100_v0/region_graph/"
    else:
        raise Exception("Could not identify region graph resource")

    with storage.SimpleStorage(cv_url) as cv_st:
        dir_path = creator_utils.dir_from_layer_name(
            creator_utils.layer_name_from_cv_url(cv_st.layer_path))

        file_paths = list(cv_st.list_files())

        c = 0
        n_file_paths = len(file_paths)
        time_start = time.time()
        for i_fp, fp in enumerate(file_paths):
            if i_fp % 1000 == 1:
                dt = time.time() - time_start
                eta = dt / i_fp * n_file_paths - dt
                print("%d / %d - dt: %.3fs - eta: %.3fs" %
                      (i_fp, n_file_paths, dt, eta))

            if not os.path.exists(dir_path + fp[:-4] + ".h5"):
                print(dir_path + fp[:-4] + ".h5")
                c += 1

        print("%d files were missing" % c)
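
# Usage note (hedged): check_stored_cv_files compares the remote listing
# against the local mirror created by download_and_store_cv_files below.
# fp[:-4] strips a four-character remote extension before appending ".h5",
# so a remote file "abc.bin" (hypothetical name) is expected locally as
# "abc.h5".
#
#   check_stored_cv_files(dataset_name="pinky100")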
def rewrite_segmentation(dataset_name, n_threads=64, n_units_per_thread=None):
    """ Rewrites a watershed segmentation from one cloud path to another

    :param dataset_name: str
    :param n_threads: int
    :param n_units_per_thread: int or None
    """
    if dataset_name == "pinky":
        cv_url = "gs://nkem/pinky40_v11/mst_trimmed_sem_remap/region_graph/"
        from_url = "gs://neuroglancer/pinky40_v11/watershed/"
        to_url = "gs://neuroglancer/svenmd/pinky40_v11/watershed/"
    elif dataset_name == "basil":
        cv_url = "gs://nkem/basil_4k_oldnet/region_graph/"
        from_url = "gs://neuroglancer/ranl/basil_4k_oldnet/ws/"
        to_url = "gs://neuroglancer/svenmd/basil_4k_oldnet_cg/watershed/"
    else:
        raise Exception("Dataset unknown")

    file_paths = np.sort(glob.glob(creator_utils.dir_from_layer_name(
        creator_utils.layer_name_from_cv_url(cv_url)) + "/*rg2cg*"))

    if n_units_per_thread is None:
        file_path_blocks = np.array_split(file_paths, n_threads * 3)
    else:
        n_blocks = int(np.ceil(len(file_paths) / n_units_per_thread))
        file_path_blocks = np.array_split(file_paths, n_blocks)

    multi_args = []
    for fp_block in file_path_blocks:
        multi_args.append([fp_block, from_url, to_url])

    # Run parallelized
    if n_threads == 1:
        mu.multiprocess_func(_rewrite_segmentation_thread, multi_args,
                             n_threads=n_threads, verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_rewrite_segmentation_thread, multi_args,
                                n_threads=n_threads)
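
# How the work is split (illustrative): with n_units_per_thread unset, the
# file list is cut into n_threads * 3 blocks so faster workers can pick up
# extra blocks. np.array_split tolerates uneven divisions:
#
#   >>> import numpy as np
#   >>> [len(b) for b in np.array_split(np.arange(10), 4)]
#   [3, 3, 2, 2]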
def download_and_store_cv_files(dataset_name="basil", n_threads=10,
                                olduint32=False):
    """ Downloads files from google cloud using cloud-volume

    :param dataset_name: str
    :param n_threads: int
    :param olduint32: bool
    """
    if dataset_name == "basil":
        cv_url = "gs://nkem/basil_4k_oldnet/region_graph/"
    elif dataset_name == "pinky40":
        cv_url = "gs://nkem/pinky40_v11/mst_trimmed_sem_remap/region_graph/"
    elif dataset_name == "pinky100":
        cv_url = "gs://nkem/pinky100_v0/region_graph/"
    else:
        raise Exception("Could not identify region graph resource")

    with storage.SimpleStorage(cv_url) as cv_st:
        dir_path = creator_utils.dir_from_layer_name(
            creator_utils.layer_name_from_cv_url(cv_st.layer_path))

        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        file_paths = list(cv_st.list_files())

    file_chunks = np.array_split(file_paths, n_threads * 3)
    multi_args = []
    for i_file_chunk, file_chunk in enumerate(file_chunks):
        multi_args.append([i_file_chunk, cv_url, file_chunk, olduint32])

    # Run parallelized
    if n_threads == 1:
        mu.multiprocess_func(_download_and_store_cv_files_thread, multi_args,
                             n_threads=n_threads, verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_download_and_store_cv_files_thread,
                                multi_args, n_threads=n_threads)
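
# Minimal sketch of the per-thread worker referenced above, assuming it
# simply mirrors each remote file to the local directory. This illustrates
# the expected argument unpacking only; it is NOT the project's actual
# implementation (e.g. the `olduint32` flag is left unused here).
def _download_and_store_cv_files_thread_sketch(args):
    i_file_chunk, cv_url, file_paths, olduint32 = args

    with storage.SimpleStorage(cv_url) as cv_st:
        dir_path = creator_utils.dir_from_layer_name(
            creator_utils.layer_name_from_cv_url(cv_st.layer_path))

        for fp in file_paths:
            # get_file returns the raw byte content of a single object
            with open(dir_path + fp, "wb") as f:
                f.write(cv_st.get_file(fp))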
def create_chunked_graph(table_id=None, cv_url=None, ws_url=None, fan_out=2,
                         bbox=None, chunk_size=(512, 512, 128), verbose=False,
                         n_threads=1):
    """ Creates chunked graph from downloaded files

    :param table_id: str
    :param cv_url: str
    :param ws_url: str
    :param fan_out: int
    :param bbox: [[x_, y_, z_], [_x, _y, _z]]
    :param chunk_size: tuple
    :param verbose: bool
    :param n_threads: int
    """
    if cv_url is None or ws_url is None:
        if "basil" in table_id:
            cv_url = "gs://nkem/basil_4k_oldnet/region_graph/"
            ws_url = "gs://neuroglancer/svenmd/basil_4k_oldnet_cg/watershed/"
        elif "pinky40" in table_id:
            cv_url = "gs://nkem/pinky40_v11/mst_trimmed_sem_remap/region_graph/"
            ws_url = "gs://neuroglancer/svenmd/pinky40_v11/watershed/"
        elif "pinky100" in table_id:
            cv_url = "gs://nkem/pinky100_v0/region_graph/"
            ws_url = "gs://neuroglancer/nkem/pinky100_v0/ws/lost_no-random/bbox1_0/"
        else:
            raise Exception("Could not identify region graph resource")

    times = []
    time_start = time.time()

    chunk_size = np.array(list(chunk_size))

    file_paths = np.sort(glob.glob(creator_utils.dir_from_layer_name(
        creator_utils.layer_name_from_cv_url(cv_url)) + "/*"))

    file_path_blocks = np.array_split(file_paths, n_threads * 3)

    multi_args = []
    for fp_block in file_path_blocks:
        multi_args.append([fp_block, table_id, chunk_size, bbox])

    # Run parallelized
    if n_threads == 1:
        results = mu.multiprocess_func(_preprocess_chunkedgraph_data_thread,
                                       multi_args, n_threads=n_threads,
                                       verbose=True, debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_preprocess_chunkedgraph_data_thread,
                                          multi_args, n_threads=n_threads)

    in_chunk_connected_paths = np.array([])
    in_chunk_connected_ids = np.array([], dtype=np.uint64).reshape(-1, 3)
    in_chunk_disconnected_paths = np.array([])
    in_chunk_disconnected_ids = np.array([], dtype=np.uint64).reshape(-1, 3)
    between_chunk_paths = np.array([])
    between_chunk_ids = np.array([], dtype=np.uint64).reshape(-1, 2, 3)
    isolated_paths = np.array([])
    isolated_ids = np.array([], dtype=np.uint64).reshape(-1, 3)

    for result in results:
        in_chunk_connected_paths = np.concatenate(
            [in_chunk_connected_paths, result[0]])
        in_chunk_connected_ids = np.concatenate(
            [in_chunk_connected_ids, result[1]])
        in_chunk_disconnected_paths = np.concatenate(
            [in_chunk_disconnected_paths, result[2]])
        in_chunk_disconnected_ids = np.concatenate(
            [in_chunk_disconnected_ids, result[3]])
        between_chunk_paths = np.concatenate([between_chunk_paths, result[4]])
        between_chunk_ids = np.concatenate([between_chunk_ids, result[5]])
        isolated_paths = np.concatenate([isolated_paths, result[6]])
        isolated_ids = np.concatenate([isolated_ids, result[7]])

    assert len(in_chunk_connected_ids) == len(in_chunk_connected_paths) == \
           len(in_chunk_disconnected_ids) == len(in_chunk_disconnected_paths) == \
           len(isolated_ids) == len(isolated_paths)

    in_chunk_connected_ids, in_chunk_connected_paths = \
        _sort_arrays(in_chunk_connected_ids, in_chunk_connected_paths)

    in_chunk_disconnected_ids, in_chunk_disconnected_paths = \
        _sort_arrays(in_chunk_disconnected_ids, in_chunk_disconnected_paths)

    isolated_ids, isolated_paths = _sort_arrays(isolated_ids, isolated_paths)

    times.append(["Preprocessing", time.time() - time_start])
    print("Preprocessing took %.3fs = %.2fh" % (times[-1][1],
                                                times[-1][1] / 3600))

    time_start = time.time()

    multi_args = []

    in_chunk_id_blocks = np.array_split(in_chunk_connected_ids,
                                        max(1, n_threads))
    cumsum = 0

    for in_chunk_id_block in in_chunk_id_blocks:
        multi_args.append([between_chunk_ids, between_chunk_paths,
                           in_chunk_id_block, cumsum])
        cumsum += len(in_chunk_id_block)

    # Run parallelized
    if n_threads == 1:
        results = mu.multiprocess_func(_between_chunk_masks_thread, multi_args,
                                       n_threads=n_threads, verbose=True,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_between_chunk_masks_thread,
                                          multi_args, n_threads=n_threads)

    times.append(["Data sorting", time.time() - time_start])
    print("Data sorting took %.3fs = %.2fh" % (times[-1][1],
                                               times[-1][1] / 3600))

    time_start = time.time()

    n_layers = int(np.ceil(
        pychunkedgraph.backend.chunkedgraph_utils.log_n(
            np.max(in_chunk_connected_ids) + 1, fan_out))) + 2

    print("N layers: %d" % n_layers)

    cg = chunkedgraph.ChunkedGraph(table_id=table_id,
                                   n_layers=np.uint64(n_layers),
                                   fan_out=np.uint64(fan_out),
                                   chunk_size=np.array(chunk_size,
                                                       dtype=np.uint64),
                                   cv_path=ws_url, is_new=True)

    # Fill lowest layer and create first abstraction layer
    # Create arguments for parallelizing
    multi_args = []
    for result in results:
        offset, between_chunk_paths_out_masked, \
            between_chunk_paths_in_masked = result

        for i_chunk in range(len(between_chunk_paths_out_masked)):
            multi_args.append([table_id,
                               in_chunk_connected_paths[offset + i_chunk],
                               in_chunk_disconnected_paths[offset + i_chunk],
                               isolated_paths[offset + i_chunk],
                               between_chunk_paths_in_masked[i_chunk],
                               between_chunk_paths_out_masked[i_chunk],
                               verbose])

    random.shuffle(multi_args)

    print("%d jobs for creating layer 1 + 2" % len(multi_args))

    # Run parallelized
    if n_threads == 1:
        mu.multiprocess_func(_create_atomic_layer_thread, multi_args,
                             n_threads=n_threads, verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_create_atomic_layer_thread, multi_args,
                                n_threads=n_threads)

    times.append(["Layers 1 + 2", time.time() - time_start])

    # Fill higher abstraction layers
    child_chunk_ids = in_chunk_connected_ids.copy()
    for layer_id in range(3, n_layers + 1):
        time_start = time.time()

        print("\n\n\n --- LAYER %d --- \n\n\n" % layer_id)

        parent_chunk_ids = child_chunk_ids // cg.fan_out
        # np.int was removed from recent NumPy releases; plain int is
        # equivalent here
        parent_chunk_ids = parent_chunk_ids.astype(int)

        u_pcids, inds = np.unique(parent_chunk_ids, axis=0,
                                  return_inverse=True)

        if len(u_pcids) > n_threads:
            n_threads_per_process = 1
        else:
            n_threads_per_process = int(np.ceil(n_threads / len(u_pcids)))

        multi_args = []
        for ind in range(len(u_pcids)):
            multi_args.append([table_id, layer_id,
                               child_chunk_ids[inds == ind].astype(int),
                               n_threads_per_process])

        child_chunk_ids = u_pcids

        # Run parallelized
        if n_threads == 1:
            mu.multiprocess_func(_add_layer_thread, multi_args,
                                 n_threads=n_threads, verbose=True,
                                 debug=n_threads == 1)
        else:
            mu.multisubprocess_func(_add_layer_thread, multi_args,
                                    n_threads=n_threads, suffix=str(layer_id))

        times.append(["Layer %d" % layer_id, time.time() - time_start])

    for time_entry in times:
        print("%s: %.2fs = %.2fmin = %.2fh" %
              (time_entry[0], time_entry[1], time_entry[1] / 60,
               time_entry[1] / 3600))
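
# Example end-to-end run (hedged; "pinky100_test" is a hypothetical
# BigTable id that must contain one of the known dataset substrings so
# that cv_url/ws_url can be inferred):
#
#   download_and_store_cv_files(dataset_name="pinky100")
#   create_chunked_graph(table_id="pinky100_test", fan_out=2,
#                        chunk_size=(512, 512, 128), n_threads=16)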