Пример #1
0
def test_save_and_load_dataset__mutable_mapping():
    """Round-trip a simulated dataset through in-memory mapping stores.

    The dataset is saved and loaded twice; both loaded copies must be
    identical to the simulated original.
    """
    original = simulate_genotype_call_dataset(n_variant=10, n_sample=10)

    first_store: MutableMapping[str, bytes] = {}
    save_dataset(original, first_store)
    reloaded = load_dataset(first_store)
    assert_identical(original, reloaded)

    # save and load again to test https://github.com/pydata/xarray/issues/4386
    second_store: MutableMapping[str, bytes] = {}
    save_dataset(reloaded, second_store)
    reloaded_again = load_dataset(second_store)
    assert_identical(original, reloaded_again)
Пример #2
0
def test_save_and_load_dataset(tmp_path, is_path):
    """Round-trip a simulated dataset through on-disk stores under ``tmp_path``.

    When ``is_path`` is falsy the store location is passed as a ``str``
    instead of the path object built from ``tmp_path``.
    """

    def _location(name):
        # Hand back either the path object or its string form.
        target = tmp_path / name
        return target if is_path else str(target)

    original = simulate_genotype_call_dataset(n_variant=10, n_sample=10)

    first_location = _location("ds.zarr")
    save_dataset(original, first_location)
    reloaded = load_dataset(first_location)
    assert_identical(original, reloaded)

    # save and load again to test https://github.com/pydata/xarray/issues/4386
    second_location = _location("ds2.zarr")
    save_dataset(reloaded, second_location)
    reloaded_again = load_dataset(second_location)
    assert_identical(original, reloaded_again)
Пример #3
0
def test_DP_field(shared_datadir, tmpdir):
    """Check that the scikit-allel and sgkit conversions agree on DP fields.

    The scikit-allel store is read with ``sg.read_vcfzarr``; the sgkit store
    is loaded with ``sg.load_dataset`` and has ``call_genotype_phased``
    dropped before the comparison.
    """
    core_fields = [
        "variants/CHROM",
        "variants/POS",
        "variants/ID",
        "variants/REF",
        "variants/ALT",
        "calldata/GT",
        "samples",
    ]
    extra_fields = ["calldata/DP", "variants/DP"]
    type_overrides = {"calldata/DP": "i4"}  # override default of i2

    allel_path = create_allel_vcfzarr(
        shared_datadir,
        tmpdir,
        fields=core_fields + extra_fields,
        types=type_overrides,
    )
    expected = sg.read_vcfzarr(allel_path)

    sg_path = create_sg_vcfzarr(
        shared_datadir,
        tmpdir,
        fields=["INFO/DP", "FORMAT/DP"],
    )
    loaded = sg.load_dataset(str(sg_path))
    # call_genotype_phased is not included in scikit-allel
    actual = loaded.drop_vars("call_genotype_phased")

    assert_identical(expected, actual)
Пример #4
0
def test_DP_field(shared_datadir, tmpdir):
    """Check that the scikit-allel and sgkit conversions agree on DP fields.

    The scikit-allel store is read with ``sg.read_scikit_allel_vcfzarr``; the
    sgkit store is loaded with ``sg.load_dataset`` and passed through
    ``fix_missing_fields`` before the comparison.
    """
    core_fields = [
        "variants/CHROM",
        "variants/POS",
        "variants/ID",
        "variants/REF",
        "variants/ALT",
        "variants/QUAL",
        "calldata/GT",
        "samples",
    ]
    extra_fields = ["calldata/DP", "variants/DP"]
    type_overrides = {"calldata/DP": "i4"}  # override default of i2

    allel_path = create_allel_vcfzarr(
        shared_datadir,
        tmpdir,
        fields=core_fields + extra_fields,
        types=type_overrides,
    )
    expected = sg.read_scikit_allel_vcfzarr(allel_path)

    sg_path = create_sg_vcfzarr(
        shared_datadir,
        tmpdir,
        fields=["INFO/DP", "FORMAT/DP", "FORMAT/GT"],
    )
    actual = fix_missing_fields(sg.load_dataset(str(sg_path)))

    assert_identical(expected, actual)
Пример #5
0
def test_vcf_to_zarr__mixed_ploidy_vcf(
    shared_datadir, tmp_path, ploidy, mixed_ploidy, truncate_calls, regions
):
    """Convert ``mixed.vcf.gz`` to Zarr and verify the resulting variables.

    Checks contigs, positions, alleles, ids, sample ids and the genotype
    calls (including their mask and, for mixed ploidy, the non-allele mask)
    for the requested ``ploidy`` / ``mixed_ploidy`` combination.
    """
    vcf_path = path_for_test(shared_datadir, "mixed.vcf.gz")
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    conversion_options = dict(
        regions=regions,
        chunk_length=5,
        chunk_width=2,
        ploidy=ploidy,
        mixed_ploidy=mixed_ploidy,
        truncate_calls=truncate_calls,
    )
    vcf_to_zarr(vcf_path, zarr_path, **conversion_options)
    ds = load_dataset(zarr_path)

    # The expected string dtype depends on whether regions were supplied.
    if regions:
        str_dtype = "|S1"
    else:
        str_dtype = "O"

    assert ds.attrs["contigs"] == ["CHR1", "CHR2", "CHR3"]
    assert_array_equal(ds["variant_contig"], [0, 0])
    assert_array_equal(ds["variant_position"], [2, 7])

    expected_alleles = np.array(
        [["A", "T", "", ""], ["A", "C", "", ""]],
        dtype=str_dtype,
    )
    assert_array_equal(ds["variant_allele"], expected_alleles)
    assert ds["variant_allele"].dtype == str_dtype

    expected_ids = np.array([".", "."], dtype=str_dtype)
    assert_array_equal(ds["variant_id"], expected_ids)
    assert ds["variant_id"].dtype == str_dtype
    assert_array_equal(ds["variant_id_mask"], [True, True])
    assert_array_equal(ds["sample_id"], ["SAMPLE1", "SAMPLE2", "SAMPLE3"])

    assert ds["call_genotype"].attrs["mixed_ploidy"] == mixed_ploidy
    if mixed_ploidy:
        pad = -2  # -2 indicates a non-allele
    else:
        pad = -1
    full_genotypes = np.array(
        [
            [[0, 0, 1, 1, pad], [0, 0, pad, pad, pad], [0, 0, 0, 1, pad]],
            [[0, 0, 1, 1, pad], [0, 1, pad, pad, pad], [0, 1, -1, -1, pad]],
        ],
        dtype="i1",
    )
    # truncate row vectors if lower ploidy
    expected_genotypes = full_genotypes[:, :, :ploidy]

    assert_array_equal(ds["call_genotype"], expected_genotypes)
    assert_array_equal(ds["call_genotype_mask"], expected_genotypes < 0)
    if mixed_ploidy:
        assert_array_equal(
            ds["call_genotype_non_allele"], expected_genotypes < -1
        )
Пример #6
0
def test_default_fields(shared_datadir, tmpdir):
    """Check that both conversions agree when no fields are specified.

    The scikit-allel store is read with ``sg.read_scikit_allel_vcfzarr``; the
    sgkit store is loaded and passed through ``fix_missing_fields``.
    """
    allel_path = create_allel_vcfzarr(shared_datadir, tmpdir)
    sg_path = create_sg_vcfzarr(shared_datadir, tmpdir)

    expected = sg.read_scikit_allel_vcfzarr(allel_path)
    actual = fix_missing_fields(sg.load_dataset(str(sg_path)))

    assert_identical(expected, actual)
Пример #7
0
def test_default_fields(shared_datadir, tmpdir):
    """Check that both conversions agree when no fields are specified.

    The scikit-allel store is read with ``sg.read_vcfzarr``; the sgkit store
    is loaded and has ``call_genotype_phased`` dropped before comparing.
    """
    allel_path = create_allel_vcfzarr(shared_datadir, tmpdir)
    expected = sg.read_vcfzarr(allel_path)

    sg_path = create_sg_vcfzarr(shared_datadir, tmpdir)
    loaded = sg.load_dataset(str(sg_path))
    # call_genotype_phased is not included in scikit-allel
    actual = loaded.drop_vars("call_genotype_phased")

    assert_identical(expected, actual)
Пример #8
0
def test_vcf_to_zarr__call_genotype_dtype(shared_datadir, tmp_path,
                                          max_alt_alleles, dtype, warning):
    """Verify the dtype and value range of ``call_genotype`` after conversion.

    When ``warning`` is truthy the conversion is expected to emit
    ``MaxAltAllelesExceededWarning``.
    """
    vcf_path = path_for_test(shared_datadir, "allele_overflow.vcf.gz")
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    def _convert():
        # Single place for the conversion call shared by both branches.
        vcf_to_zarr(vcf_path, zarr_path, max_alt_alleles=max_alt_alleles)

    if not warning:
        _convert()
    else:
        with pytest.warns(MaxAltAllelesExceededWarning):
            _convert()

    genotypes = load_dataset(zarr_path).call_genotype
    assert genotypes.dtype == dtype
    assert genotypes.values.max() <= max_alt_alleles
Пример #9
0
def test_all_fields(shared_datadir, tmpdir, vcf_file, allel_exclude_fields,
                    sgkit_exclude_fields):
    """Check that both conversions agree when all fields are requested.

    The ``contigs`` attribute is removed from both datasets before comparing
    (after checking the scikit-allel contigs are a subset of the sgkit ones),
    and ``variant_contig`` is removed as well when the contig sets differ.
    """
    # change scikit-allel type defaults back to the VCF default
    type_overrides = dict.fromkeys(
        ("calldata/DP", "calldata/GQ", "calldata/HQ", "calldata/AD"),
        "i4",
    )
    allel_path = create_allel_vcfzarr(
        shared_datadir,
        tmpdir,
        vcf_file=vcf_file,
        fields=["*"],
        exclude_fields=allel_exclude_fields,
        types=type_overrides,
    )

    field_defs = {
        "INFO/AF": {"Number": "A"},
        "INFO/AC": {"Number": "A"},
        "FORMAT/AD": {"Number": "R"},
        "FORMAT/HQ": {"dimension": "haplotypes"},
        "FORMAT/SB": {"dimension": "strand_biases"},
    }
    allel_ds = sg.read_vcfzarr(allel_path, field_defs=field_defs)

    sg_path = create_sg_vcfzarr(
        shared_datadir,
        tmpdir,
        vcf_file=vcf_file,
        fields=["INFO/*", "FORMAT/*"],
        exclude_fields=sgkit_exclude_fields,
        field_defs=field_defs,
        truncate_calls=True,
    )
    # call_genotype_phased is not included in scikit-allel
    sg_ds = sg.load_dataset(str(sg_path)).drop_vars("call_genotype_phased")

    # scikit-allel only records contigs for which there are actual variants,
    # whereas sgkit records contigs from the header
    allel_contigs = set(allel_ds.attrs["contigs"])
    sg_contigs = set(sg_ds.attrs["contigs"])
    assert allel_contigs <= sg_contigs
    for dataset in (allel_ds, sg_ds):
        del dataset.attrs["contigs"]

    if allel_contigs < sg_contigs:
        # variant_contig variables are not comparable, so remove them before comparison
        for dataset in (allel_ds, sg_ds):
            del dataset["variant_contig"]

    assert_identical(allel_ds, sg_ds)
Пример #10
0
def zarr_to_vcf(
    input: Union[PathType, MutableMapping[str, bytes]],
    output: PathType,
) -> None:
    """Convert a Zarr file to VCF. For test purposes only.

    The dataset is loaded fully into memory, then one ``VcfVariant`` is
    built and written per entry along the ``variants`` dimension.

    Parameters
    ----------
    input
        Zarr store location or mapping, passed to ``load_dataset``.
    output
        Path of the VCF file to create; it is opened in text write mode.
    """
    ds = load_dataset(input)
    ds = ds.load()

    header_str = ds.attrs["vcf_header"]
    contigs = ds.attrs["contigs"]
    filters = ds.attrs["filters"]

    # ``Dataset.sizes`` is the supported name -> length mapping; indexing
    # ``Dataset.dims`` like a mapping is deprecated in xarray.
    n_variants = ds.sizes["variants"]
    n_samples = ds.sizes["samples"]

    with open(output, mode="w") as out:
        vcf_writer = VcfWriter(out, header_str)

        info_fields = _info_fields(header_str)
        format_fields = _format_fields(header_str)

        for i in range(n_variants):
            chrom = ds.variant_contig[i].values.item()
            pos = ds.variant_position[i].values.item()
            variant_id = ds.variant_id[i].values.item()
            _, ref_alt = array_to_values(ds.variant_allele[i].values)
            ref = ref_alt[0]
            alt = ref_alt[1:]
            _, qual = array_to_values(ds.variant_quality[i].values)
            _, filter_ = array_to_values(ds.variant_filter[i].values)
            if isinstance(filter_, bool):
                # Wrap a lone flag so it can be treated like the array case.
                filter_ = np.array([filter_])
            if np.all(~filter_):
                filter_ = None
            else:
                # Keep the names of the filters whose flag is set.
                filter_ = [
                    filters[k] for k, flagged in enumerate(filter_) if flagged
                ]

            info = {}
            samples = [{} for _ in range(n_samples)]  # type: ignore

            for key in info_fields:
                variable_name = f"variant_{key}"
                if variable_name in ds:
                    arr = ds[variable_name][i].values
                    present, val = array_to_values(arr, variable_name)
                    if present:
                        info[key] = val

            for key in format_fields:
                if key == "GT":
                    variable_name = "call_genotype"
                else:
                    variable_name = f"call_{key}"
                if variable_name in ds:
                    arr = ds[variable_name][i].values
                    assert len(arr) == n_samples
                    if key == "GT":
                        phased = ds["call_genotype_phased"][i].values
                    for j, sample_arr in enumerate(arr):
                        present, val = array_to_values(sample_arr,
                                                       variable_name)
                        if not present:
                            break  # samples should all be present or none are
                        if key == "GT":
                            # Missing alleles become "." and are joined with
                            # "|" when phased, "/" otherwise.
                            lst = [(str(v) if v is not None else ".")
                                   for v in val]
                            val = ("|" if phased[j] else "/").join(lst)
                        samples[j][key] = val

            variant = VcfVariant(contigs[chrom], pos, variant_id, ref, alt,
                                 qual, filter_, info, samples)

            vcf_writer.write(variant)