Пример #1
0
def read_hierarchy(path):
    """
    Rebuild a SampleIndex tree from the ``sample_index.txt`` file in ``path``.

    Each line of the index file describes one child and is parsed as four
    tab/colon separated parts: ``TYPE:ID``, ``name:NAME`` and
    ``samples:[MIN,MAX)``, where MIN and MAX are integers.

    * ``DIR`` entries are read recursively, using NAME as the path of the
      sub-hierarchy; the resulting node's ``address`` is set to ID.
    * ``BUNDLE`` entries become childless SampleIndex nodes named NAME with
      address ID, and each counts as one bundle.

    NOTE(review): ``cd`` is assumed to be a context manager that switches
    the working directory to ``path`` for the duration of the block, which
    is what lets the recursive calls use names relative to it - confirm.

    :param path: Directory that contains the ``sample_index.txt`` to read.
    :return: A SampleIndex (``leafid=-1``) whose children are keyed by ID,
        whose sample range is the smallest MIN and largest MAX seen across
        all lines, and whose ``num_bundles`` is the total number of bundles
        below it. If the file has no lines, the range stays at its initial
        values of ``MAX_SAMPLE`` and ``-MAX_SAMPLE``.
    """
    line_format = "{type}:{ID}\tname:{name}\tsamples:[{min:d},{max:d})\n"

    nodes = {}
    bundle_total = 0
    lowest = MAX_SAMPLE
    highest = -MAX_SAMPLE

    with cd(path), open("sample_index.txt", "r") as index_file:
        for line in index_file:
            entry = parse(line_format, line)
            kind = entry["type"]

            if kind == "DIR":
                # Descend into the sub-directory named by this entry.
                subtree = read_hierarchy(entry["name"])
                subtree.address = entry["ID"]
                bundle_total += subtree.num_bundles
                nodes[entry["ID"]] = subtree
            elif kind == "BUNDLE":
                nodes[entry["ID"]] = SampleIndex(
                    entry["min"],
                    entry["max"],
                    {},
                    entry["name"],
                    address=entry["ID"],
                )
                bundle_total += 1

            # Every line widens the overall range, whatever its type.
            lowest = min(lowest, entry["min"])
            highest = max(highest, entry["max"])

    return SampleIndex(
        lowest,
        highest,
        nodes,
        path,
        leafid=-1,
        num_bundles=bundle_total,
    )
Пример #2
0
def create_hierarchy_from_max_sample(
    max_sample,
    bundle_size,
    directory_sizes=None,
    root=".",
    start_bundle_id=0,
    min_sample=0,
    address="",
    n_digits=1,
):
    """
    Build a SampleIndex tree from a sample range and per-level chunk sizes.

    The range from min_sample up to max_sample is cut into consecutive
    chunks. While directory_sizes still has entries, each chunk has
    directory_sizes[0] samples and becomes a nested SampleIndex built
    recursively from the remaining sizes. Once directory_sizes is empty,
    each chunk has bundle_size samples and becomes a bundle-file leaf named
    "samples<min>-<max>.ext". The last chunk is clipped to max_sample.

    :param max_sample: The max Sample ID this hierarchy is responsible for.
    :param bundle_size: The max number of samples a bundle file is
        responsible for.
    :param directory_sizes: The number of samples each directory is
        responsible for - a list, one value for each level in the directory
        hierarchy. None is treated as an empty list.
    :param root: The name given to the returned SampleIndex node.
    :param start_bundle_id: The leaf id assigned to the first bundle created
        under this node; later bundles count up from it.
    :param min_sample: The start of the sample count.
    :param address: The address of the returned node. Child addresses are
        this address, a ".", and the child's zero-padded position (no "."
        when this address is empty).
    :param n_digits: The number of digits to pad the directories with.
    :return: A SampleIndex spanning min_sample to max_sample whose children
        are keyed by address and whose num_bundles is the number of bundles
        created beneath it.
    """
    levels = [] if directory_sizes is None else directory_sizes

    # Chunk size at this depth: a directory's span, or a bundle's at a leaf.
    step = levels[0] if levels else bundle_size
    prefix = address + "." if address else address

    nodes = {}
    next_bundle_id = start_bundle_id

    for position, lower in enumerate(range(min_sample, max_sample, step)):
        upper = min(lower + step, max_sample)
        dir_name = str(position).zfill(n_digits)
        node_address = prefix + dir_name

        if not levels:
            # Deepest level: this chunk is a bundle file.
            nodes[node_address] = SampleIndex(
                lower,
                upper,
                {},
                f"samples{lower}-{upper}.ext",
                leafid=next_bundle_id,
                address=node_address,
            )
            next_bundle_id += 1
            continue

        # Otherwise this chunk is a directory holding its own sub-hierarchy.
        subtree = create_hierarchy_from_max_sample(
            upper,
            bundle_size,
            levels[1:],
            root=dir_name,
            min_sample=lower,
            start_bundle_id=next_bundle_id,
            address=node_address,
            n_digits=n_digits,
        )
        nodes[node_address] = subtree
        # Keep leaf ids contiguous across sibling directories.
        next_bundle_id += subtree.num_bundles

    return SampleIndex(
        min_sample,
        max_sample,
        nodes,
        root,
        num_bundles=next_bundle_id - start_bundle_id,
        address=address,
    )