Example #1
def run_task_bundle(settings, layer, roi):
    cgraph = chunkedgraph.ChunkedGraph(
        table_id=settings['chunkedgraph']['table_id'],
        instance_id=settings['chunkedgraph']['instance_id'])
    meshing = settings['meshing']
    mip = meshing.get('mip', 2)
    max_err = meshing.get('max_simplification_error', 40)
    mesh_dir = meshing.get('mesh_dir', None)

    base_chunk_span = int(cgraph.fan_out)**max(0, layer - 2)
    chunksize = np.array(cgraph.chunk_size, dtype=int) * base_chunk_span

    for x in range(roi[0].start, roi[0].stop, chunksize[0]):
        for y in range(roi[1].start, roi[1].stop, chunksize[1]):
            for z in range(roi[2].start, roi[2].stop, chunksize[2]):
                chunk_id = cgraph.get_chunk_id_from_coord(layer, x, y, z)

                try:
                    chunk_mesh_task(cgraph,
                                    chunk_id,
                                    cgraph._cv_path,
                                    cv_mesh_dir=mesh_dir,
                                    mip=mip,
                                    max_err=max_err)
                except EmptyVolumeException as e:
                    print("Warning: Empty segmentation encountered: %s" % e)
Example #2
def _family_consistency_test_thread(args):
    """ Helper to test family consistency """

    table_id, coord, layer_id = args

    x, y, z = coord

    cg = chunkedgraph.ChunkedGraph(table_id)

    rows = cg.range_read_chunk(layer_id, x, y, z)

    failed_node_ids = []

    time_start = time.time()
    for i_k, k in enumerate(rows.keys()):
        if i_k % 100 == 1:
            dt = time.time() - time_start
            eta = dt / i_k * len(rows) - dt
            print("%d / %d - %.3fs -> %.3fs      " % (i_k, len(rows), dt, eta),
                  end="\r")

        node_id = chunkedgraph.deserialize_uint64(k)
        parent_id = np.frombuffer(rows[k].cells["0"][b'parents'][0].value,
                                  dtype=np.uint64)
        if node_id not in cg.get_children(parent_id):
            failed_node_ids.append([node_id, parent_id])

    return failed_node_ids
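Example #21 below invokes this helper through mu.multisubprocess_func with [table_id, coord, layer_id] argument lists; a single-job sketch with placeholder values:

failed = _family_consistency_test_thread(("my_cg_table", (0, 0, 0), 2))
print("%d inconsistent (node_id, parent_id) pairs" % len(failed))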
Example #3
def _count_and_download_nodes(args):
    serialized_cg_info, chunk_coords = args

    time_start = time.time()

    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    n_nodes_per_l2_node = []
    n_l2_nodes_per_chunk = []
    n_l1_nodes_per_chunk = []
    # l1_nodes = []
    rep_l1_nodes = []
    for chunk_coord in chunk_coords:
        x, y, z = chunk_coord
        rr = cg.range_read_chunk(layer=2,
                                 x=x,
                                 y=y,
                                 z=z,
                                 columns=[column_keys.Hierarchy.Child])

        n_l2_nodes_per_chunk.append(len(rr))
        n_l1_nodes = 0

        for k in rr.keys():
            children = rr[k][column_keys.Hierarchy.Child][0].value
            rep_l1_nodes.append(children[np.random.randint(0, len(children))])
            # l1_nodes.extend(children)

            n_nodes_per_l2_node.append(len(children))
            n_l1_nodes += len(children)

        n_l1_nodes_per_chunk.append(n_l1_nodes)

    print(f"{len(chunk_coords)} took {time.time() - time_start}s")
    return n_nodes_per_l2_node, n_l2_nodes_per_chunk, n_l1_nodes_per_chunk, rep_l1_nodes
Example #4
def initialize_chunkedgraph(
    meta: ChunkedGraphMeta, cg_mesh_dir="mesh_dir", n_bits_root_counter=8, size=None
):
    """ Initalizes a chunkedgraph on BigTable """
    _check_table_existence(meta.bigtable_config, meta.graph_config)
    ws_cv = cloudvolume.CloudVolume(meta.data_source.watershed)
    if size is not None:
        size = np.array(size)
        for i in range(len(ws_cv.info["scales"])):
            original_size = ws_cv.info["scales"][i]["size"]
            size = np.min([size, original_size], axis=0)
            ws_cv.info["scales"][i]["size"] = [int(x) for x in size]
            size[:-1] //= 2

    dataset_info = ws_cv.info
    dataset_info["mesh"] = cg_mesh_dir
    dataset_info["data_dir"] = meta.data_source.watershed
    dataset_info["graph"] = {
        "chunk_size": [int(s) for s in meta.graph_config.chunk_size]
    }

    kwargs = {
        "instance_id": meta.bigtable_config.instance_id,
        "project_id": meta.bigtable_config.project_id,
        "table_id": meta.graph_config.graph_id,
        "chunk_size": meta.graph_config.chunk_size,
        "fan_out": np.uint64(meta.graph_config.fanout),
        "n_layers": np.uint64(meta.layer_count),
        "dataset_info": dataset_info,
        "use_skip_connections": meta.graph_config.use_skip_connections,
        "s_bits_atomic_layer": meta.graph_config.s_bits_atomic_layer,
        "n_bits_root_counter": n_bits_root_counter,
        "is_new": True,
    }
    return chunkedgraph.ChunkedGraph(**kwargs)
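A minimal usage sketch, assuming meta is an already-populated ChunkedGraphMeta (its construction is not part of this example); the size cap is a hypothetical value:

# Optional size caps each scale's stored extent (x/y halved per mip level)
cg = initialize_chunkedgraph(meta, cg_mesh_dir="mesh_dir",
                             size=[10240, 10240, 1024])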
Example #5
def children_test(table_id, layer, coord_list):

    cg = chunkedgraph.ChunkedGraph(table_id)

    for coords in coord_list:
        x, y, z = coords

        node_ids = cg.range_read_chunk(layer, x, y, z, row_keys=['children'])
        all_children = []
        children_chunks = []
        for node_id_b, data in node_ids.items():
            children = np.frombuffer(data.cells['0'][b'children'][0].value,
                                     dtype=np.uint64)
            for child in children:
                all_children.append(child)
                children_chunks.append(cg.get_chunk_id(child))

        u_children_chunks, c_children_chunks = np.unique(children_chunks,
                                                         return_counts=True)
        u_chunk_coords = [cg.get_chunk_coordinates(c) for c in u_children_chunks]

        print("\n--- Layer %d ---- [%d, %d, %d] ---" % (layer, x, y, z))
        print("N(all children): %d" % len(all_children))
        print("N(unique children): %d" % len(np.unique(all_children)))
        print("N(unique children chunks): %d" % len(u_children_chunks))
        print("Unique children chunk coords", u_chunk_coords)
        print("N(ids per unique children chunk):", c_children_chunks)
Example #6
def get_cg(table_id):
    if table_id not in cache:
        instance_id = current_app.config['CHUNKGRAPH_INSTANCE_ID']
        client = get_bigtable_client(current_app.config)

        # Create ChunkedGraph logging
        logger = logging.getLogger(f"{instance_id}/{table_id}")
        logger.setLevel(current_app.config['LOGGING_LEVEL'])

        # prevent duplicate logs from Flask's parent logger
        logger.propagate = False

        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(current_app.config['LOGGING_LEVEL'])
        formatter = jsonformatter.JsonFormatter(
            fmt=current_app.config['LOGGING_FORMAT'],
            datefmt=current_app.config['LOGGING_DATEFORMAT'])
        formatter.converter = time.gmtime
        handler.setFormatter(formatter)

        logger.addHandler(handler)

        # Create ChunkedGraph
        cache[table_id] = chunkedgraph.ChunkedGraph(table_id=table_id,
                                                    instance_id=instance_id,
                                                    client=client,
                                                    logger=logger)
    current_app.table_id = table_id
    return cache[table_id]
Example #7
    def cg(self):
        if self._cg is None:
            self._cg = chunkedgraph.ChunkedGraph(
                table_id=self.table_id,
                instance_id=self.instance_id,
                project_id=self.project_id)
        return self._cg
Example #8
def _write_flat_segmentation_thread(args):
    """ Helper of write_flat_segmentation """
    cg_info, start_block, end_block, from_url, to_url, mip = args

    assert 'segmentation' in to_url
    assert 'svenmd' in to_url

    from_cv = cloudvolume.CloudVolume(from_url, mip=mip)
    to_cv = cloudvolume.CloudVolume(to_url, mip=mip)

    cg = chunkedgraph.ChunkedGraph(table_id=cg_info["table_id"],
                                   instance_id=cg_info["instance_id"],
                                   project_id=cg_info["project_id"],
                                   credentials=cg_info["credentials"])

    for block_z in range(start_block[2], end_block[2]):
        z_start = block_z * cg.chunk_size[2]
        z_end = (block_z + 1) * cg.chunk_size[2]
        for block_y in range(start_block[1], end_block[1]):
            y_start = block_y * cg.chunk_size[1]
            y_end = (block_y + 1) * cg.chunk_size[1]
            for block_x in range(start_block[0], end_block[0]):
                x_start = block_x * cg.chunk_size[0]
                x_end = (block_x + 1) * cg.chunk_size[0]

                block = from_cv[x_start:x_end, y_start:y_end, z_start:z_end]

                _, remapped_block = get_sv_to_root_id_mapping_chunk(
                    cg, [x_start, y_start, z_start], block)

                to_cv[x_start:x_end, y_start:y_end,
                      z_start:z_end] = remapped_block
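The args tuple mirrors the unpacking at the top of the helper; a sketch of how a caller might assemble one job (all paths, block ranges, and credentials are placeholders):

cg_info = {"table_id": "my_cg_table", "instance_id": "my_instance",
           "project_id": "my_project", "credentials": None}  # placeholder values
args = (cg_info, [0, 0, 0], [2, 2, 1],
        "gs://bucket/ws_segmentation",
        "gs://bucket/svenmd/segmentation",  # must contain 'segmentation' and 'svenmd'
        1)
_write_flat_segmentation_thread(args)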
Example #9
def initialize_chunkedgraph(cg_table_id,
                            ws_cv_path,
                            chunk_size,
                            cg_mesh_dir,
                            fan_out=2,
                            instance_id=None,
                            project_id=None):
    """ Initalizes a chunkedgraph on BigTable

    :param cg_table_id: str
        name of chunkedgraph
    :param ws_cv_path: str
        path to watershed segmentation on Google Cloud
    :param chunk_size: np.ndarray
        array of three ints
    :param cg_mesh_dir: str
        mesh folder name
    :param fan_out: int
        fan out of chunked graph (2 == Octree)
    :param instance_id: str
        Google instance id
    :param project_id: str
        Google project id
    :return: ChunkedGraph
    """
    ws_cv = cloudvolume.CloudVolume(ws_cv_path)
    bbox = np.array(ws_cv.bounds.to_list()).reshape(2, 3)

    # assert np.all(bbox[0] == 0)
    # assert np.all((bbox[1] % chunk_size) == 0)

    n_chunks = ((bbox[1] - bbox[0]) / chunk_size).astype(int)
    n_layers = int(np.ceil(chunkedgraph_utils.log_n(np.max(n_chunks),
                                                    fan_out))) + 2

    dataset_info = ws_cv.info
    dataset_info["mesh"] = cg_mesh_dir
    dataset_info["data_dir"] = ws_cv_path
    dataset_info["graph"] = {"chunk_size": [int(s) for s in chunk_size]}

    kwargs = {
        "table_id": cg_table_id,
        "chunk_size": chunk_size,
        "fan_out": np.uint64(fan_out),
        "n_layers": np.uint64(n_layers),
        "dataset_info": dataset_info,
        "is_new": True
    }

    if instance_id is not None:
        kwargs["instance_id"] = instance_id

    if project_id is not None:
        kwargs["project_id"] = project_id

    cg = chunkedgraph.ChunkedGraph(**kwargs)

    return cg
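Following the docstring, a minimal sketch with placeholder identifiers:

cg = initialize_chunkedgraph(cg_table_id="my_cg_table",
                             ws_cv_path="gs://bucket/ws_segmentation",
                             chunk_size=np.array([512, 512, 64]),
                             cg_mesh_dir="mesh_dir",
                             fan_out=2)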
Example #10
def _remeshing(serialized_cg_info, lvl2_nodes):
    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    # TODO: stop_layer and mip should be configurable by dataset
    meshgen.remeshing(cg,
                      lvl2_nodes,
                      stop_layer=4,
                      mesh_path=None,
                      mip=1,
                      max_err=320)

    return Response(status=200)
Example #11
def get_merge_candidates(table_id,
                         save_dir=f"{HOME}/benchmarks/",
                         n_threads=1):
    cg = chunkedgraph.ChunkedGraph(table_id)

    bounds = np.array(cg.cv.bounds.to_list()).reshape(2, -1).T
    bounds -= bounds[:, 0:1]

    chunk_id_bounds = np.ceil((bounds / cg.chunk_size[:, None])).astype(int)

    chunk_coord_gen = itertools.product(*[range(*r) for r in chunk_id_bounds])
    chunk_coords = np.array(list(chunk_coord_gen), dtype=int)

    order = np.arange(len(chunk_coords))
    np.random.shuffle(order)

    n_blocks = np.min([len(order), n_threads * 3])
    blocks = np.array_split(order, n_blocks)

    cg_serialized_info = cg.get_serialized_info()
    if n_threads > 1:
        del cg_serialized_info["credentials"]

    multi_args = []
    for block in blocks:
        multi_args.append([cg_serialized_info, chunk_coords[block]])

    if n_threads == 1:
        results = mu.multiprocess_func(_get_merge_candidates,
                                       multi_args,
                                       n_threads=n_threads,
                                       verbose=False,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_get_merge_candidates,
                                          multi_args,
                                          n_threads=n_threads)
    merge_edges = []
    merge_edge_weights = []
    for result in results:
        merge_edges.extend(result[0])
        merge_edge_weights.extend(result[1])

    save_folder = f"{save_dir}/{table_id}/"

    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    with h5py.File(f"{save_folder}/merge_edge_stats.h5", "w") as f:
        f.create_dataset("merge_edges", data=merge_edges, compression="gzip")
        f.create_dataset("merge_edge_weights",
                         data=merge_edge_weights,
                         compression="gzip")
Example #12
def _mesh_lvl2_nodes(serialized_cg_info, lvl2_nodes):
    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    for lvl2_node in lvl2_nodes:
        print(lvl2_node)
        meshgen.mesh_lvl2_preview(cg, lvl2_node, supervoxel_ids=None,
                                  cv_path=None, cv_mesh_dir=None, mip=2,
                                  simplification_factor=999999,
                                  max_err=40, parallel_download=1,
                                  verbose=True,
                                  cache_control='no-cache')

    return Response(status=200)
Example #13
def _get_root_timings(args):
    serialized_cg_info, l1_ids = args
    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    timings = []
    for l1_id in l1_ids:

        time_start = time.time()
        root = cg.get_root(l1_id)
        dt = time.time() - time_start
        timings.append(dt)

    return timings
Example #14
    def cg(self):
        if self._cg is None:
            kwargs = {}

            if self._instance_id is not None:
                kwargs["instance_id"] = self._instance_id

            if self._project_id is not None:
                kwargs["project_id"] = self._project_id

            self._cg = chunkedgraph.ChunkedGraph(table_id=self._cg_table_id,
                                                 **kwargs)

        return self._cg
Example #15
def _get_subgraph_timings(args):
    serialized_cg_info, root_ids, rep_l1_chunk_ids = args
    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    timings = []
    for root_id, rep_l1_chunk_id in zip(root_ids, rep_l1_chunk_ids):
        bb = np.array([rep_l1_chunk_id, rep_l1_chunk_id + 1], dtype=int)

        time_start = time.time()
        sv_ids = cg.get_subgraph_nodes(root_id, bb, bb_is_coordinate=False)
        dt = time.time() - time_start
        timings.append(dt)

    return timings
Example #16
def _get_merge_candidates(args):
    serialized_cg_info, chunk_coords = args

    time_start = time.time()

    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    merge_edges = []
    merge_edge_weights = []
    for chunk_coord in chunk_coords:
        chunk_id = cg.get_chunk_id(layer=1,
                                   x=chunk_coord[0],
                                   y=chunk_coord[1],
                                   z=chunk_coord[2])

        rr = cg.range_read_chunk(chunk_id=chunk_id,
                                 columns=[
                                     column_keys.Connectivity.Partner,
                                     column_keys.Connectivity.Connected,
                                     column_keys.Hierarchy.Parent
                                 ])

        ps = []
        edges = []
        for it in rr.items():
            e, _, _ = cg._retrieve_connectivity(it, connected_edges=False)
            edges.extend(e)
            ps.extend([it[1][column_keys.Hierarchy.Parent][0].value] * len(e))

        if len(edges) == 0:
            continue

        edges = np.sort(np.array(edges), axis=1)
        cols = {"sv1": edges[:, 0], "sv2": edges[:, 1], "parent": ps}

        df = pd.DataFrame(data=cols)
        dfg = df.groupby(["sv1", "sv2"]).sum().reset_index()

        _, i, c = np.unique(dfg[["parent"]],
                            return_counts=True,
                            return_index=True)

        merge_edges.extend(
            np.array(dfg.loc[i][["sv1", "sv2"]], dtype=np.uint64))
        merge_edge_weights.extend(c)

    print(f"{len(chunk_coords)} took {time.time() - time_start}s")

    return merge_edges, merge_edge_weights
Example #17
def _mesh_lvl2_previews_threads(args):
    serialized_cg_info, lvl2_node_id, supervoxel_ids, \
        cv_path, cv_mesh_dir, mip, simplification_factor, \
        max_err, parallel_download, verbose, cache_control = args

    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)
    mesh_lvl2_preview(cg,
                      lvl2_node_id,
                      supervoxel_ids=supervoxel_ids,
                      cv_path=cv_path,
                      cv_mesh_dir=cv_mesh_dir,
                      mip=mip,
                      simplification_factor=simplification_factor,
                      max_err=max_err,
                      parallel_download=parallel_download,
                      verbose=verbose,
                      cache_control=cache_control)
Example #18
def count_nodes_and_edges(table_id, n_threads=1):
    cg = chunkedgraph.ChunkedGraph(table_id)

    bounds = np.array(cg.cv.bounds.to_list()).reshape(2, -1).T
    bounds -= bounds[:, 0:1]

    chunk_id_bounds = np.ceil((bounds / cg.chunk_size[:, None])).astype(int)

    chunk_coord_gen = itertools.product(*[range(*r) for r in chunk_id_bounds])
    chunk_coords = np.array(list(chunk_coord_gen), dtype=int)

    order = np.arange(len(chunk_coords))
    np.random.shuffle(order)

    n_blocks = np.min([len(order), n_threads * 3])
    blocks = np.array_split(order, n_blocks)

    cg_serialized_info = cg.get_serialized_info()
    if n_threads > 1:
        del cg_serialized_info["credentials"]

    multi_args = []
    for block in blocks:
        multi_args.append([cg_serialized_info, chunk_coords[block]])

    if n_threads == 1:
        results = mu.multiprocess_func(_count_nodes_and_edges,
                                       multi_args,
                                       n_threads=n_threads,
                                       verbose=False,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_count_nodes_and_edges,
                                          multi_args,
                                          n_threads=n_threads)

    n_edges_per_chunk = []
    n_nodes_per_chunk = []
    for result in results:
        n_nodes_per_chunk.extend(result[0])
        n_edges_per_chunk.extend(result[1])

    return n_nodes_per_chunk, n_edges_per_chunk
Example #19
def _get_root_ids_and_sv_chunks(args):
    serialized_cg_info, root_ids = args

    time_start = time.time()

    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    n_l1_nodes_per_root = []
    rep_l1_nodes = []
    rep_l1_chunk_ids = []
    for root_id in root_ids:
        l1_ids = cg.get_subgraph_nodes(root_id)

        n_l1_nodes_per_root.append(len(l1_ids))
        rep_l1_node = l1_ids[np.random.randint(0, len(l1_ids))]
        rep_l1_nodes.append(rep_l1_node)
        rep_l1_chunk_ids.append(cg.get_chunk_coordinates(rep_l1_node))

    print(f"{len(root_ids)} took {time.time() - time_start}s")
    return root_ids, n_l1_nodes_per_root, rep_l1_nodes, rep_l1_chunk_ids
Example #20
def _read_root_rows_thread(args) -> list:
    start_seg_id, end_seg_id, serialized_cg_info, time_stamp = args

    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    start_id = cg.get_node_id(segment_id=start_seg_id,
                              chunk_id=cg.root_chunk_id)
    end_id = cg.get_node_id(segment_id=end_seg_id, chunk_id=cg.root_chunk_id)

    rows = cg.read_node_id_rows(start_id=start_id,
                                end_id=end_id,
                                end_id_inclusive=False,
                                end_time=time_stamp,
                                end_time_inclusive=True)

    root_ids = [
        k for (k, v) in rows.items()
        if column_keys.Hierarchy.NewParent not in v
    ]

    return root_ids
Example #21
def family_consistency_test(table_id, n_threads=64):
    """ Runs a simple test on the WHOLE graph

    tests: id in children(parent(id))

    :param table_id: str
    :param n_threads: int
    :return: dict
        n x 2 per layer
        each failed pair: (node_id, parent_id)
    """

    cg = chunkedgraph.ChunkedGraph(table_id)

    failed_node_id_dict = {}
    for layer_id in range(1, cg.n_layers):
        print("\n\n Layer %d \n\n" % layer_id)

        step = int(cg.fan_out ** np.max([0, layer_id - 2]))
        coords = list(itertools.product(range(0, 8, step),
                                        range(0, 8, step),
                                        range(0, 4, step)))

        multi_args = []
        for coord in coords:
            multi_args.append([table_id, coord, layer_id])

        collected_failed_node_ids = mu.multisubprocess_func(
            _family_consistency_test_thread, multi_args, n_threads=n_threads)

        failed_node_ids = []
        for _failed_node_ids in collected_failed_node_ids:
            failed_node_ids.extend(_failed_node_ids)

        failed_node_id_dict[layer_id] = np.array(failed_node_ids)

        print("\n%d nodes rows failed\n" % len(failed_node_ids))

    return failed_node_id_dict
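Usage is a single call; each value in the returned dict is an n x 2 array of failed (node_id, parent_id) pairs, so empty arrays mean the whole graph passed (table name is a placeholder):

failed = family_consistency_test("my_cg_table", n_threads=64)
for layer_id, pairs in failed.items():
    print("layer %d: %d failures" % (layer_id, len(pairs)))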
Example #22
def _count_nodes_and_edges(args):
    serialized_cg_info, chunk_coords = args

    time_start = time.time()

    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    n_edges_per_chunk = []
    n_nodes_per_chunk = []
    for chunk_coord in chunk_coords:
        x, y, z = chunk_coord
        rr = cg.range_read_chunk(layer=1, x=x, y=y, z=z)

        n_nodes_per_chunk.append(len(rr))
        n_edges = 0

        for k in rr.keys():
            n_edges += len(rr[k][column_keys.Connectivity.Partner][0].value)

        n_edges_per_chunk.append(n_edges)

    print(f"{len(chunk_coords)} took {time.time() - time_start}s")
    return n_nodes_per_chunk, n_edges_per_chunk
Example #23
def _get_merge_timings(args):
    serialized_cg_info, merge_edges = args
    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    merge_timings = []
    for merge_edge in merge_edges:
        time_start = time.time()
        root_ids = cg.add_edges(user_id="ChuckNorris",
                                atomic_edges=[merge_edge]).new_root_ids
        dt = time.time() - time_start
        merge_timings.append(dt)

    split_timings = []
    for merge_edge in merge_edges:
        time_start = time.time()
        root_ids = cg.remove_edges(user_id="ChuckNorris",
                                   atomic_edges=[merge_edge],
                                   mincut=False).new_root_ids

        dt = time.time() - time_start
        split_timings.append(dt)

    return merge_timings, split_timings
Example #24
def get_cg(table_id):
    assert (
        table_id.startswith("fly")
        or table_id.startswith("golden")
        or table_id.startswith("pinky100_rv")
        or table_id.startswith("pinky100_arv")
    )

    if table_id not in CACHE:
        instance_id = current_app.config["CHUNKGRAPH_INSTANCE_ID"]
        client = get_bigtable_client(current_app.config)

        # Create ChunkedGraph logging
        logger = logging.getLogger(f"{instance_id}/{table_id}")
        logger.setLevel(current_app.config["LOGGING_LEVEL"])

        # prevent duplicate logs from Flask's parent logger
        logger.propagate = False

        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(current_app.config["LOGGING_LEVEL"])
        formatter = jsonformatter.JsonFormatter(
            fmt=current_app.config["LOGGING_FORMAT"],
            datefmt=current_app.config["LOGGING_DATEFORMAT"],
        )
        formatter.converter = time.gmtime
        handler.setFormatter(formatter)

        logger.addHandler(handler)

        # Create ChunkedGraph
        CACHE[table_id] = chunkedgraph.ChunkedGraph(
            table_id=table_id, instance_id=instance_id, client=client, logger=logger
        )

    current_app.table_id = table_id
    return CACHE[table_id]
Example #25
def _read_delta_root_rows_thread(args) -> Sequence[list]:
    start_seg_id, end_seg_id, serialized_cg_info, time_stamp_start, time_stamp_end = args

    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    start_id = cg.get_node_id(segment_id=start_seg_id,
                              chunk_id=cg.root_chunk_id)
    end_id = cg.get_node_id(segment_id=end_seg_id, chunk_id=cg.root_chunk_id)

    # apply column filters to avoid Lock columns
    rows = cg.read_node_id_rows(start_id=start_id,
                                start_time=time_stamp_start,
                                end_id=end_id,
                                end_id_inclusive=False,
                                columns=[
                                    column_keys.Hierarchy.FormerParent,
                                    column_keys.Hierarchy.NewParent
                                ],
                                end_time=time_stamp_end,
                                end_time_inclusive=True)

    # new roots are those that have no NewParent in this time window
    new_root_ids = [
        k for (k, v) in rows.items()
        if column_keys.Hierarchy.NewParent not in v
    ]

    # expired roots are the IDs stored in FormerParent cells
    # whose timestamp is before the start_time
    expired_root_ids = []
    for k, v in rows.items():
        if column_keys.Hierarchy.FormerParent in v:
            fp = v[column_keys.Hierarchy.FormerParent]
            for cell_entry in fp:
                expired_root_ids.extend(cell_entry.value)

    return new_root_ids, expired_root_ids
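A sketch of how one job's args might be assembled, assuming cg and two datetime bounds already exist; the segment-ID range is a placeholder:

args = (1, 1000,  # hypothetical segment-ID range within the root chunk
        cg.get_serialized_info(), time_stamp_start, time_stamp_end)
new_roots, expired_roots = _read_delta_root_rows_thread(args)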
Example #26
    def _cgraph(request, fan_out=2, n_layers=10):
        # setup Chunked Graph
        dataset_info = {"data_dir": ""}

        graph = chunkedgraph.ChunkedGraph(
            request.function.__name__,
            project_id="IGNORE_ENVIRONMENT_PROJECT",
            credentials=credentials.AnonymousCredentials(),
            instance_id="emulated_instance",
            dataset_info=dataset_info,
            chunk_size=np.array([512, 512, 64], dtype=np.uint64),
            is_new=True,
            fan_out=np.uint64(fan_out),
            n_layers=np.uint64(n_layers),
        )

        graph._cv = CloudVolumeMock()

        # setup Chunked Graph - Finalizer
        def fin():
            graph.table.delete()

        request.addfinalizer(fin)
        return graph
Example #27
def get_root_ids_and_sv_chunks(table_id,
                               save_dir=f"{HOME}/benchmarks/",
                               n_threads=1):
    cg = chunkedgraph.ChunkedGraph(table_id)

    save_folder = f"{save_dir}/{table_id}/"

    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    if not os.path.exists(f"{save_folder}/root_ids.h5"):
        root_ids = chunkedgraph_comp.get_latest_roots(cg, n_threads=n_threads)

        with h5py.File(f"{save_folder}/root_ids.h5", "w") as f:
            f.create_dataset("root_ids", data=root_ids)
    else:
        with h5py.File(f"{save_folder}/root_ids.h5", "r") as f:
            root_ids = f["root_ids"][()]

    cg_serialized_info = cg.get_serialized_info()
    if n_threads > 1:
        del cg_serialized_info["credentials"]

    order = np.arange(len(root_ids))
    np.random.shuffle(order)

    n_blocks = np.min([len(order), n_threads * 3])
    blocks = np.array_split(order, n_blocks)

    multi_args = []
    for block in blocks:
        multi_args.append([cg_serialized_info, root_ids[block]])

    if n_threads == 1:
        results = mu.multiprocess_func(_get_root_ids_and_sv_chunks,
                                       multi_args,
                                       n_threads=n_threads,
                                       verbose=False,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_get_root_ids_and_sv_chunks,
                                          multi_args,
                                          n_threads=n_threads)

    root_ids = []
    n_l1_nodes_per_root = []
    rep_l1_nodes = []
    rep_l1_chunk_ids = []
    for result in results:
        root_ids.extend(result[0])
        n_l1_nodes_per_root.extend(result[1])
        rep_l1_nodes.extend(result[2])
        rep_l1_chunk_ids.extend(result[3])

    save_folder = f"{save_dir}/{table_id}/"

    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    with h5py.File(f"{save_folder}/root_stats.h5", "w") as f:
        f.create_dataset("root_ids", data=root_ids, compression="gzip")
        f.create_dataset("n_l1_nodes_per_root",
                         data=n_l1_nodes_per_root,
                         compression="gzip")
        f.create_dataset("rep_l1_nodes", data=rep_l1_nodes, compression="gzip")
        f.create_dataset("rep_l1_chunk_ids",
                         data=rep_l1_chunk_ids,
                         compression="gzip")
Example #28
def get_cg():
    if 'cg' not in g:
        table_id = current_app.config['CHUNKGRAPH_TABLE_ID']
        client = get_client(current_app.config)
        g.cg = chunkedgraph.ChunkedGraph(table_id=table_id, client=client)
    return g.cg
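A sketch of how a Flask view might use this accessor, assuming an app and the config keys above exist; the route and get_root call are illustrative (get_root appears in other examples here):

@app.route("/root/<int:node_id>")  # hypothetical endpoint
def root_handler(node_id):
    cg = get_cg()
    return str(cg.get_root(np.uint64(node_id)))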
Example #29
def _add_layer_thread(args):
    """ Creates abstraction layer """
    table_id, layer_id, chunk_coords, n_threads_per_process = args

    cg = chunkedgraph.ChunkedGraph(table_id=table_id)
    cg.add_layer(layer_id, chunk_coords, n_threads=n_threads_per_process)
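The args tuple matches the unpacking on the first line; a single-job sketch with placeholder inputs:

_add_layer_thread(("my_cg_table", 3,          # build layer 3
                   [(0, 0, 0), (1, 0, 0)],    # hypothetical child chunk coords
                   1))                        # threads per process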
Example #30
def get_merge_split_timings(table_id,
                            save_dir=f"{HOME}/benchmarks/",
                            job_size=500,
                            n_threads=1):
    save_folder = f"{save_dir}/{table_id}/"

    merge_edges, merge_edge_weights = load_merge_stats(save_folder)

    probs = merge_edge_weights / np.sum(merge_edge_weights)

    n_jobs = n_threads * 3

    cg = chunkedgraph.ChunkedGraph(table_id)
    cg_serialized_info = cg.get_serialized_info()
    if n_threads > 1:
        del cg_serialized_info["credentials"]

    time_start = time.time()
    order = np.arange(len(merge_edges))

    np.random.seed(int(time.time()))

    replace = False

    blocks = np.random.choice(order,
                              job_size * n_jobs,
                              p=probs,
                              replace=replace).reshape(n_jobs, job_size)

    multi_args = []
    for block in blocks:
        multi_args.append([cg_serialized_info, merge_edges[block]])
    print(f"Building jobs took {time.time()-time_start}s")

    time_start = time.time()
    if n_threads == 1:
        results = mu.multiprocess_func(_get_merge_timings,
                                       multi_args,
                                       n_threads=n_threads,
                                       verbose=False,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_get_merge_timings,
                                          multi_args,
                                          n_threads=n_threads)
    dt = time.time() - time_start

    timings = []
    for result in results:
        timings.extend(result[0])

    percentiles = [np.percentile(timings, k) for k in range(1, 100, 1)]
    mean = np.mean(timings)
    std = np.std(timings)
    median = np.median(timings)

    merge_results = {
        "percentiles": percentiles,
        "p01": percentiles[0],
        "p05": percentiles[4],
        "p95": percentiles[94],
        "p99": percentiles[98],
        "mean": mean,
        "std": std,
        "median": median,
        "total_time_s": dt,
        "job_size": job_size,
        "n_jobs": n_jobs,
        "n_threads": n_threads,
        "replace": replace,
        "requests_per_s": job_size * n_jobs / dt
    }

    timings = []
    for result in results:
        timings.extend(result[1])

    percentiles = [np.percentile(timings, k) for k in range(1, 100, 1)]
    mean = np.mean(timings)
    std = np.std(timings)
    median = np.median(timings)

    split_results = {
        "percentiles": percentiles,
        "p01": percentiles[0],
        "p05": percentiles[4],
        "p95": percentiles[94],
        "p99": percentiles[98],
        "mean": mean,
        "std": std,
        "median": median,
        "total_time_s": dt,
        "job_size": job_size,
        "n_jobs": n_jobs,
        "n_threads": n_threads,
        "replace": replace,
        "requests_per_s": job_size * n_jobs / dt
    }

    return merge_results, split_results