Example #1
def check_stored_cv_files(dataset_name="basil"):
    """ Tests if all files were downloaded

    :param dataset_name: str
    """
    if "basil" == dataset_name:
        cv_url = "gs://nkem/basil_4k_oldnet/region_graph/"
    elif "pinky40" == dataset_name:
        cv_url = "gs://nkem/pinky40_v11/mst_trimmed_sem_remap/region_graph/"
    elif "pinky100" == dataset_name:
        cv_url = "gs://nkem/pinky100_v0/region_graph/"
    else:
        raise Exception("Could not identify region graph resource")

    with storage.SimpleStorage(cv_url) as cv_st:
        dir_path = creator_utils.dir_from_layer_name(
            creator_utils.layer_name_from_cv_url(cv_st.layer_path))

        file_paths = list(cv_st.list_files())

    c = 0
    n_file_paths = len(file_paths)
    time_start = time.time()
    for i_fp, fp in enumerate(file_paths):
        if i_fp % 1000 == 1:
            dt = time.time() - time_start
            eta = dt / i_fp * n_file_paths - dt
            print("%d / %d - dt: %.3fs - eta: %.3fs" %
                  (i_fp, n_file_paths, dt, eta))

        if not os.path.exists(dir_path + fp[:-4] + ".h5"):
            print(dir_path + fp[:-4] + ".h5")
            c += 1

    print("%d files were missing" % c)
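A minimal usage sketch, assuming the function above is importable together with the module-level helpers it relies on (os, time, storage, creator_utils):

# Check that every region-graph file for the basil dataset exists locally;
# missing paths are printed, followed by a count of missing files.
check_stored_cv_files(dataset_name="basil")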
Example #2
def rewrite_segmentation(dataset_name, n_threads=64, n_units_per_thread=None):
    """ Rewrites the watershed segmentation to a new cloud location

    :param dataset_name: str
    :param n_threads: int
    :param n_units_per_thread: int or None
    """
    if dataset_name == "pinky":
        cv_url = "gs://nkem/pinky40_v11/mst_trimmed_sem_remap/region_graph/"
        from_url = "gs://neuroglancer/pinky40_v11/watershed/"
        to_url = "gs://neuroglancer/svenmd/pinky40_v11/watershed/"
    elif dataset_name == "basil":
        cv_url = "gs://nkem/basil_4k_oldnet/region_graph/"
        from_url = "gs://neuroglancer/ranl/basil_4k_oldnet/ws/"
        to_url = "gs://neuroglancer/svenmd/basil_4k_oldnet_cg/watershed/"
    else:
        raise Exception("Dataset unknown")

    file_paths = np.sort(glob.glob(creator_utils.dir_from_layer_name(
        creator_utils.layer_name_from_cv_url(cv_url)) + "/*rg2cg*"))

    if n_units_per_thread is None:
        file_path_blocks = np.array_split(file_paths, n_threads * 3)
    else:
        n_blocks = int(np.ceil(len(file_paths) / n_units_per_thread))
        file_path_blocks = np.array_split(file_paths, n_blocks)

    multi_args = []
    for fp_block in file_path_blocks:
        multi_args.append([fp_block, from_url, to_url])

    # Run in parallel (in-process when n_threads == 1, subprocesses otherwise)
    if n_threads == 1:
        mu.multiprocess_func(_rewrite_segmentation_thread, multi_args,
                             n_threads=n_threads, verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_rewrite_segmentation_thread, multi_args,
                                n_threads=n_threads)
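A usage sketch under the same assumption that the module defining this function (and the mu multiprocessing helpers) is on the import path:

# Rewrite the pinky40 watershed segmentation; with n_threads=1 the worker
# runs in-process in debug mode, larger values spawn subprocesses.
rewrite_segmentation("pinky", n_threads=1)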
Example #3
def download_and_store_cv_files(dataset_name="basil",
                                n_threads=10,
                                olduint32=False):
    """ Downloads files from google cloud using cloud-volume

    :param dataset_name: str
    :param n_threads: int
    :param olduint32: bool
    """
    if "basil" == dataset_name:
        cv_url = "gs://nkem/basil_4k_oldnet/region_graph/"
    elif "pinky40" == dataset_name:
        cv_url = "gs://nkem/pinky40_v11/mst_trimmed_sem_remap/region_graph/"
    elif "pinky100" == dataset_name:
        cv_url = "gs://nkem/pinky100_v0/region_graph/"
    else:
        raise Exception("Could not identify region graph resource")

    with storage.SimpleStorage(cv_url) as cv_st:
        dir_path = creator_utils.dir_from_layer_name(
            creator_utils.layer_name_from_cv_url(cv_st.layer_path))

        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        file_paths = list(cv_st.list_files())

    file_chunks = np.array_split(file_paths, n_threads * 3)
    multi_args = []
    for i_file_chunk, file_chunk in enumerate(file_chunks):
        multi_args.append([i_file_chunk, cv_url, file_chunk, olduint32])

    # Run in parallel (in-process when n_threads == 1, subprocesses otherwise)
    if n_threads == 1:
        mu.multiprocess_func(_download_and_store_cv_files_thread,
                             multi_args,
                             n_threads=n_threads,
                             verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_download_and_store_cv_files_thread,
                                multi_args,
                                n_threads=n_threads)
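A usage sketch, again assuming the surrounding module (numpy as np, storage, creator_utils, mu) is available:

# Download the basil region-graph files with 10 worker threads; the
# olduint32 flag is forwarded unchanged to the download workers.
download_and_store_cv_files(dataset_name="basil", n_threads=10, olduint32=False)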
Example #4
def create_chunked_graph(table_id=None,
                         cv_url=None,
                         ws_url=None,
                         fan_out=2,
                         bbox=None,
                         chunk_size=(512, 512, 128),
                         verbose=False,
                         n_threads=1):
    """ Creates chunked graph from downloaded files

    :param table_id: str
    :param cv_url: str
    :param ws_url: str
    :param fan_out: int
    :param bbox: [[x_, y_, z_], [_x, _y, _z]]
    :param chunk_size: tuple
    :param verbose: bool
    :param n_threads: int
    """
    if cv_url is None or ws_url is None:
        if "basil" in table_id:
            cv_url = "gs://nkem/basil_4k_oldnet/region_graph/"
            ws_url = "gs://neuroglancer/svenmd/basil_4k_oldnet_cg/watershed/"
        elif "pinky40" in table_id:
            cv_url = "gs://nkem/pinky40_v11/mst_trimmed_sem_remap/region_graph/"
            ws_url = "gs://neuroglancer/svenmd/pinky40_v11/watershed/"
        elif "pinky100" in table_id:
            cv_url = "gs://nkem/pinky100_v0/region_graph/"
            ws_url = "gs://neuroglancer/nkem/pinky100_v0/ws/lost_no-random/bbox1_0/"
        else:
            raise Exception("Could not identify region graph resource")

    times = []
    time_start = time.time()

    chunk_size = np.array(list(chunk_size))

    file_paths = np.sort(
        glob.glob(
            creator_utils.dir_from_layer_name(
                creator_utils.layer_name_from_cv_url(cv_url)) + "/*"))

    file_path_blocks = np.array_split(file_paths, n_threads * 3)

    multi_args = []
    for fp_block in file_path_blocks:
        multi_args.append([fp_block, table_id, chunk_size, bbox])

    if n_threads == 1:
        results = mu.multiprocess_func(_preprocess_chunkedgraph_data_thread,
                                       multi_args,
                                       n_threads=n_threads,
                                       verbose=True,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_preprocess_chunkedgraph_data_thread,
                                          multi_args,
                                          n_threads=n_threads)

    in_chunk_connected_paths = np.array([])
    in_chunk_connected_ids = np.array([], dtype=np.uint64).reshape(-1, 3)
    in_chunk_disconnected_paths = np.array([])
    in_chunk_disconnected_ids = np.array([], dtype=np.uint64).reshape(-1, 3)
    between_chunk_paths = np.array([])
    between_chunk_ids = np.array([], dtype=np.uint64).reshape(-1, 2, 3)
    isolated_paths = np.array([])
    isolated_ids = np.array([], dtype=np.uint64).reshape(-1, 3)

    for result in results:
        in_chunk_connected_paths = np.concatenate(
            [in_chunk_connected_paths, result[0]])
        in_chunk_connected_ids = np.concatenate(
            [in_chunk_connected_ids, result[1]])
        in_chunk_disconnected_paths = np.concatenate(
            [in_chunk_disconnected_paths, result[2]])
        in_chunk_disconnected_ids = np.concatenate(
            [in_chunk_disconnected_ids, result[3]])
        between_chunk_paths = np.concatenate([between_chunk_paths, result[4]])
        between_chunk_ids = np.concatenate([between_chunk_ids, result[5]])
        isolated_paths = np.concatenate([isolated_paths, result[6]])
        isolated_ids = np.concatenate([isolated_ids, result[7]])

    assert len(in_chunk_connected_ids) == len(in_chunk_connected_paths) == \
           len(in_chunk_disconnected_ids) == len(in_chunk_disconnected_paths) == \
           len(isolated_ids) == len(isolated_paths)

    in_chunk_connected_ids, in_chunk_connected_paths = \
        _sort_arrays(in_chunk_connected_ids, in_chunk_connected_paths)

    in_chunk_disconnected_ids, in_chunk_disconnected_paths = \
        _sort_arrays(in_chunk_disconnected_ids, in_chunk_disconnected_paths)

    isolated_ids, isolated_paths = \
        _sort_arrays(isolated_ids, isolated_paths)

    times.append(["Preprocessing", time.time() - time_start])

    print("Preprocessing took %.3fs = %.2fh" %
          (times[-1][1], times[-1][1] / 3600))

    time_start = time.time()

    multi_args = []

    in_chunk_id_blocks = np.array_split(in_chunk_connected_ids,
                                        max(1, n_threads))
    cumsum = 0

    for in_chunk_id_block in in_chunk_id_blocks:
        multi_args.append([
            between_chunk_ids, between_chunk_paths, in_chunk_id_block, cumsum
        ])
        cumsum += len(in_chunk_id_block)

    # Run in parallel (in-process when n_threads == 1, subprocesses otherwise)
    if n_threads == 1:
        results = mu.multiprocess_func(_between_chunk_masks_thread,
                                       multi_args,
                                       n_threads=n_threads,
                                       verbose=True,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_between_chunk_masks_thread,
                                          multi_args,
                                          n_threads=n_threads)

    times.append(["Data sorting", time.time() - time_start])

    print("Data sorting took %.3fs = %.2fh" %
          (times[-1][1], times[-1][1] / 3600))

    time_start = time.time()

    n_layers = int(
        np.ceil(
            pychunkedgraph.backend.chunkedgraph_utils.log_n(
                np.max(in_chunk_connected_ids) + 1, fan_out))) + 2

    print("N layers: %d" % n_layers)

    cg = chunkedgraph.ChunkedGraph(table_id=table_id,
                                   n_layers=np.uint64(n_layers),
                                   fan_out=np.uint64(fan_out),
                                   chunk_size=np.array(chunk_size,
                                                       dtype=np.uint64),
                                   cv_path=ws_url,
                                   is_new=True)

    # Fill lowest layer and create first abstraction layer
    # Create arguments for the parallel jobs

    multi_args = []
    for result in results:
        offset, between_chunk_paths_out_masked, between_chunk_paths_in_masked = result

        for i_chunk in range(len(between_chunk_paths_out_masked)):
            multi_args.append([
                table_id, in_chunk_connected_paths[offset + i_chunk],
                in_chunk_disconnected_paths[offset + i_chunk],
                isolated_paths[offset + i_chunk],
                between_chunk_paths_in_masked[i_chunk],
                between_chunk_paths_out_masked[i_chunk], verbose
            ])

    random.shuffle(multi_args)

    print("%d jobs for creating layer 1 + 2" % len(multi_args))

    # Run in parallel (in-process when n_threads == 1, subprocesses otherwise)
    if n_threads == 1:
        mu.multiprocess_func(_create_atomic_layer_thread,
                             multi_args,
                             n_threads=n_threads,
                             verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_create_atomic_layer_thread,
                                multi_args,
                                n_threads=n_threads)

    times.append(["Layers 1 + 2", time.time() - time_start])

    # Fill higher abstraction layers
    child_chunk_ids = in_chunk_connected_ids.copy()
    for layer_id in range(3, n_layers + 1):

        time_start = time.time()

        print("\n\n\n --- LAYER %d --- \n\n\n" % layer_id)

        parent_chunk_ids = child_chunk_ids // cg.fan_out
        parent_chunk_ids = parent_chunk_ids.astype(int)

        u_pcids, inds = np.unique(parent_chunk_ids,
                                  axis=0,
                                  return_inverse=True)

        if len(u_pcids) > n_threads:
            n_threads_per_process = 1
        else:
            n_threads_per_process = int(np.ceil(n_threads / len(u_pcids)))

        multi_args = []
        for ind in range(len(u_pcids)):
            multi_args.append([
                table_id, layer_id,
                child_chunk_ids[inds == ind].astype(int),
                n_threads_per_process
            ])

        child_chunk_ids = u_pcids

        # Run in parallel (in-process when n_threads == 1, subprocesses otherwise)
        if n_threads == 1:
            mu.multiprocess_func(_add_layer_thread,
                                 multi_args,
                                 n_threads=n_threads,
                                 verbose=True,
                                 debug=n_threads == 1)
        else:
            mu.multisubprocess_func(_add_layer_thread,
                                    multi_args,
                                    n_threads=n_threads,
                                    suffix=str(layer_id))

        times.append(["Layer %d" % layer_id, time.time() - time_start])

    for time_entry in times:
        print("%s: %.2fs = %.2fmin = %.2fh" %
              (time_entry[0], time_entry[1], time_entry[1] / 60,
               time_entry[1] / 3600))
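A usage sketch; the table id below is hypothetical, and because it contains "basil" the default cv_url/ws_url pair above is selected:

# Build the chunked graph from the previously downloaded region-graph files.
create_chunked_graph(table_id="basil_example", chunk_size=(512, 512, 128),
                     fan_out=2, n_threads=16)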