Exemplo n.º 1
0
def test_vcf_to_zarr__field_defs(shared_datadir, tmp_path):
    """Check how a ``Description`` override in ``field_defs`` affects ``comment``.

    A non-empty description must be stored as the variable's ``comment``
    attribute; a blank one must leave the attribute out entirely.
    """
    vcf_path = path_for_test(shared_datadir, "sample.vcf.gz")
    zarr_path = (tmp_path / "vcf.zarr").as_posix()
    expected_dp = [-1, -1, 14, 11, 10, 13, 9, -1, -1]

    def convert(description):
        # Convert INFO/DP with the given Description and reopen the store.
        vcf_to_zarr(
            vcf_path,
            zarr_path,
            fields=["INFO/DP"],
            field_defs={"INFO/DP": {"Description": description}},
        )
        return xr.open_zarr(zarr_path)

    ds = convert("Combined depth across samples")
    assert_array_equal(ds["variant_DP"], expected_dp)
    assert ds["variant_DP"].attrs["comment"] == "Combined depth across samples"

    # blank description
    ds = convert("")
    assert_array_equal(ds["variant_DP"], expected_dp)
    assert "comment" not in ds["variant_DP"].attrs
Exemplo n.º 2
0
def test_vcf_to_zarr__field_number_fixed(shared_datadir, tmp_path):
    """Convert FORMAT/HQ and check its values, dimensions and comment."""
    vcf_path = path_for_test(shared_datadir, "sample.vcf.gz")
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    # HQ Number is 2, and a dimension is automatically assigned (FORMAT_HQ_dim)
    vcf_to_zarr(vcf_path, zarr_path, fields=["FORMAT/HQ"])
    hq = xr.open_zarr(zarr_path)["call_HQ"]

    no_value = [INT_MISSING, INT_MISSING]
    filled_row = [[INT_FILL, INT_FILL]] * 3
    expected = [
        [[10, 15], [10, 10], [3, 3]],
        [[10, 10], [10, 10], [3, 3]],
        [[51, 51], [51, 51], no_value],
        [[58, 50], [65, 3], no_value],
        [[23, 27], [18, 2], no_value],
        [[56, 60], [51, 51], no_value],
        filled_row,
        filled_row,
        filled_row,
    ]
    assert_array_equal(hq, expected)
    assert hq.dims == ("variants", "samples", "FORMAT_HQ_dim")
    assert hq.attrs["comment"] == "Haplotype Quality"
Exemplo n.º 3
0
def test_vcf_to_zarr__field_number_R(shared_datadir, tmp_path):
    """Convert FORMAT/AD with ``Number`` overridden to ``"R"`` and spot-check it."""
    vcf_path = path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz")
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    overrides = {"FORMAT/AD": {"Number": "R"}}
    vcf_to_zarr(vcf_path, zarr_path, fields=["FORMAT/AD"], field_defs=overrides)

    # select a small region to check
    by_position = xr.open_zarr(zarr_path).set_index(variants="variant_position")
    window = by_position.sel(variants=slice(10002764, 10002793))

    expected_ad = np.array(
        [
            [[40, 14, 0, -1]],
            [[-1, -1, -1, -1]],
            [[65, 8, 5, 0]],
            [[-1, -1, -1, -1]],
        ]
    )
    call_ad = window["call_AD"]
    assert_array_equal(call_ad, expected_ad)
    assert (
        call_ad.attrs["comment"]
        == "Allelic depths for the ref and alt alleles in the order listed"
    )
Exemplo n.º 4
0
def test_vcf_to_zarr__parallel(shared_datadir, is_path, output_is_path,
                               concat_algorithm, tmp_path):
    """Convert two regions in one call and check the resulting shapes and dtypes."""
    vcf_path = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path
    )
    zarr_path = tmp_path / "vcf_concat.zarr"
    # The output is passed either as a path object or as a POSIX string.
    output = zarr_path if output_is_path else zarr_path.as_posix()

    vcf_to_zarr(
        vcf_path,
        output,
        regions=["20", "21"],
        chunk_length=5_000,
        concat_algorithm=concat_algorithm,
    )
    ds = xr.open_zarr(output)

    expected_shapes = {
        "sample_id": (1,),
        "call_genotype": (19910, 1, 2),
        "call_genotype_mask": (19910, 1, 2),
        "call_genotype_phased": (19910, 1),
        "variant_allele": (19910, 4),
        "variant_contig": (19910,),
        "variant_id": (19910,),
        "variant_id_mask": (19910,),
        "variant_position": (19910,),
    }
    for name, shape in expected_shapes.items():
        assert ds[name].shape == shape

    if concat_algorithm is not None:
        allele_dtype, id_dtype = "S48", "S1"
    else:
        allele_dtype = id_dtype = "O"
    assert ds["variant_allele"].dtype == allele_dtype
    assert ds["variant_id"].dtype == id_dtype
Exemplo n.º 5
0
def test_vcf_to_zarr__field_number_fixed(shared_datadir, tmp_path):
    """Convert FORMAT/HQ with an explicit ``dimension`` and check values and comment."""
    vcf_path = path_for_test(shared_datadir, "sample.vcf.gz")
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    # HQ Number is 2
    hq_def = {"dimension": "haplotypes"}
    vcf_to_zarr(
        vcf_path,
        zarr_path,
        fields=["FORMAT/HQ"],
        field_defs={"FORMAT/HQ": hq_def},
    )
    hq = xr.open_zarr(zarr_path)["call_HQ"]

    blank = [-1, -1]
    blank_row = [blank, blank, blank]
    expected = [
        [[10, 15], [10, 10], [3, 3]],
        [[10, 10], [10, 10], [3, 3]],
        [[51, 51], [51, 51], blank],
        [[58, 50], [65, 3], blank],
        [[23, 27], [18, 2], blank],
        [[56, 60], [51, 51], blank],
        blank_row,
        blank_row,
        blank_row,
    ]
    assert_array_equal(hq, expected)
    assert hq.attrs["comment"] == "Haplotype Quality"
Exemplo n.º 6
0
def test_vcf_to_zarr__parallel_with_fields(shared_datadir, tmp_path):
    """Convert two regions with extra INFO/FORMAT fields and spot-check them."""
    vcf_path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz")
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    options = {
        "regions": ["20", "21"],
        "chunk_length": 5_000,
        "temp_chunk_length": 2_500,
        "fields": ["INFO/MQ", "FORMAT/PGT"],
    }
    vcf_to_zarr(vcf_path, zarr_path, **options)

    # select a small region to check
    indexed = xr.open_zarr(zarr_path).set_index(
        variants=("variant_contig", "variant_position")
    )
    window = indexed.sel(variants=slice((0, 10001661), (0, 10001670)))

    mapping_quality = window["variant_MQ"]
    assert_allclose(mapping_quality, [58.33, np.nan, 57.45])
    assert mapping_quality.attrs["comment"] == "RMS Mapping Quality"

    phasing = window["call_PGT"]
    assert_array_equal(phasing, [["0|1"], [""], ["0|1"]])
    assert phasing.attrs["comment"] == (
        "Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another"
    )
Exemplo n.º 7
0
def test_vcf_to_zarr__parallel_temp_chunk_length(shared_datadir, is_path,
                                                 tmp_path):
    """Convert two regions with a smaller ``temp_chunk_length`` and check the result."""
    vcf_path = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path
    )
    zarr_path = (tmp_path / "vcf_concat.zarr").as_posix()

    # Use a temp_chunk_length that is smaller than chunk_length
    vcf_to_zarr(
        vcf_path,
        zarr_path,
        regions=["20", "21"],
        chunk_length=5_000,
        temp_chunk_length=2_500,
    )
    ds = xr.open_zarr(zarr_path)  # type: ignore[no-untyped-call]

    expected_shapes = {
        "sample_id": (1,),
        "call_genotype": (19910, 1, 2),
        "call_genotype_mask": (19910, 1, 2),
        "call_genotype_phased": (19910, 1),
        "variant_allele": (19910, 4),
        "variant_contig": (19910,),
        "variant_id": (19910,),
        "variant_id_mask": (19910,),
        "variant_position": (19910,),
    }
    for name, shape in expected_shapes.items():
        assert ds[name].shape == shape

    assert ds["variant_allele"].dtype == "S48"
    assert ds["variant_id"].dtype == "S1"
Exemplo n.º 8
0
def test_vcf_to_zarr__mixed_ploidy_vcf(
    shared_datadir, tmp_path, ploidy, mixed_ploidy, truncate_calls, regions
):
    """Convert ``mixed.vcf.gz`` under the given ploidy settings and check the output."""
    vcf_path = path_for_test(shared_datadir, "mixed.vcf.gz")
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    conversion_options = {
        "regions": regions,
        "chunk_length": 5,
        "chunk_width": 2,
        "ploidy": ploidy,
        "mixed_ploidy": mixed_ploidy,
        "truncate_calls": truncate_calls,
    }
    vcf_to_zarr(vcf_path, zarr_path, **conversion_options)
    ds = load_dataset(zarr_path)

    # String variables are expected as fixed-width bytes when regions are
    # given, and as objects otherwise.
    if regions:
        string_dtype = "|S1"
    else:
        string_dtype = "O"

    assert ds.attrs["contigs"] == ["CHR1", "CHR2", "CHR3"]
    assert_array_equal(ds["variant_contig"], [0, 0])
    assert_array_equal(ds["variant_position"], [2, 7])

    expected_alleles = np.array(
        [["A", "T", "", ""], ["A", "C", "", ""]],
        dtype=string_dtype,
    )
    assert_array_equal(ds["variant_allele"], expected_alleles)
    assert ds["variant_allele"].dtype == string_dtype

    expected_ids = np.array([".", "."], dtype=string_dtype)
    assert_array_equal(ds["variant_id"], expected_ids)
    assert ds["variant_id"].dtype == string_dtype
    assert_array_equal(ds["variant_id_mask"], [True, True])
    assert_array_equal(ds["sample_id"], ["SAMPLE1", "SAMPLE2", "SAMPLE3"])

    assert ds["call_genotype"].attrs["mixed_ploidy"] == mixed_ploidy

    # -2 indicates a non-allele
    pad = -1
    if mixed_ploidy:
        pad = -2
    full_calls = np.array(
        [
            [[0, 0, 1, 1, pad], [0, 0, pad, pad, pad], [0, 0, 0, 1, pad]],
            [[0, 0, 1, 1, pad], [0, 1, pad, pad, pad], [0, 1, -1, -1, pad]],
        ],
        dtype="i1",
    )
    # truncate row vectors if lower ploidy
    expected_calls = full_calls[..., :ploidy]

    assert_array_equal(ds["call_genotype"], expected_calls)
    assert_array_equal(ds["call_genotype_mask"], expected_calls < 0)
    if mixed_ploidy:
        assert_array_equal(ds["call_genotype_non_allele"], expected_calls < -1)
Exemplo n.º 9
0
def test_vcf_to_zarr__parallel_compressor_and_filters(shared_datadir, is_path,
                                                      tmp_path):
    """Convert two regions with custom compressor/encoding and inspect the store."""
    vcf_path = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path
    )
    zarr_path = (tmp_path / "vcf_concat.zarr").as_posix()

    fallback = Blosc("zlib", 1, Blosc.NOSHUFFLE)
    id_compressor = Blosc("zlib", 2, Blosc.NOSHUFFLE)
    per_variable = {
        "variant_id": {"compressor": id_compressor},
        "variant_id_mask": {"filters": None},
    }
    vcf_to_zarr(
        vcf_path,
        zarr_path,
        regions=["20", "21"],
        chunk_length=5_000,
        compressor=fallback,
        encoding=per_variable,
    )

    # look at actual Zarr store to check compressor and filters
    store = zarr.open(zarr_path)
    genotype = store["call_genotype"]
    assert genotype.compressor == fallback
    assert genotype.filters is None
    assert store["call_genotype_mask"].filters == [PackBits()]

    assert store["variant_id"].compressor == id_compressor
    assert store["variant_id_mask"].filters is None
Exemplo n.º 10
0
def test_vcf_to_zarr__compressor_and_filters(shared_datadir, is_path,
                                             tmp_path):
    """Convert with a custom compressor and per-variable encoding, then inspect the store."""
    vcf_path = path_for_test(shared_datadir, "sample.vcf.gz", is_path)
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    fallback = Blosc("zlib", 1, Blosc.NOSHUFFLE)
    id_compressor = Blosc("zlib", 2, Blosc.NOSHUFFLE)
    per_variable = {
        "variant_id": {"compressor": id_compressor},
        "variant_id_mask": {"filters": None},
    }
    vcf_to_zarr(
        vcf_path,
        zarr_path,
        chunk_length=5,
        chunk_width=2,
        compressor=fallback,
        encoding=per_variable,
    )

    # look at actual Zarr store to check compressor and filters
    store = zarr.open(zarr_path)
    genotype = store["call_genotype"]
    assert genotype.compressor == fallback
    assert genotype.filters is None
    assert store["call_genotype_mask"].filters == [PackBits()]

    assert store["variant_id"].compressor == id_compressor
    assert store["variant_id_mask"].filters is None
Exemplo n.º 11
0
def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path):
    """Convert with ``max_alt_alleles=1`` and check the truncated output.

    The conversion must emit ``MaxAltAllelesExceededWarning``. The stored
    alleles and genotype calls are then checked against the limit, along with
    the ``max_alt_alleles_seen`` dataset attribute.
    """
    path = path_for_test(shared_datadir, "sample.vcf.gz", is_path)
    output = tmp_path.joinpath("vcf.zarr").as_posix()
    max_alt_alleles = 1

    # Only the conversion is expected to warn, so the ``pytest.warns`` block
    # is limited to that single call; the assertions run outside it.
    with pytest.warns(MaxAltAllelesExceededWarning):
        vcf_to_zarr(
            path,
            output,
            chunk_length=5,
            chunk_width=2,
            max_alt_alleles=max_alt_alleles,
        )

    ds = xr.open_zarr(output)

    # extra alt alleles are dropped
    assert_array_equal(
        ds["variant_allele"],
        [
            ["A", "C"],
            ["A", "G"],
            ["G", "A"],
            ["T", "A"],
            ["A", "G"],
            ["T", ""],
            ["G", "GA"],
            ["T", ""],
            ["AC", "A"],
        ],
    )

    # genotype calls are truncated
    assert np.all(ds["call_genotype"].values <= max_alt_alleles)

    # the maximum number of alt alleles actually seen is stored as an attribute
    assert ds.attrs["max_alt_alleles_seen"] == 3
Exemplo n.º 12
0
def test_vcf_to_zarr__field_number_A(shared_datadir, tmp_path):
    """Convert INFO/AC with ``Number`` overridden to ``"A"`` and check values and comment."""
    vcf_path = path_for_test(shared_datadir, "sample.vcf.gz")
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    overrides = {"INFO/AC": {"Number": "A"}}
    vcf_to_zarr(
        vcf_path,
        zarr_path,
        max_alt_alleles=2,
        fields=["INFO/AC"],
        field_defs=overrides,
    )
    allele_count = xr.open_zarr(zarr_path)["variant_AC"]

    # Every row is [-1, -1] except the seventh, which is [3, 1].
    absent = [-1, -1]
    expected = [absent] * 9
    expected[6] = [3, 1]
    assert_array_equal(allele_count, expected)

    assert allele_count.attrs["comment"] == (
        "Allele count in genotypes, for each ALT allele, in the same order as listed"
    )
Exemplo n.º 13
0
def test_vcf_to_zarr__filter_not_defined_in_header(shared_datadir, tmp_path):
    """Converting ``no_filter_defined.vcf`` must raise the undefined-filter ``ValueError``."""
    vcf_path = path_for_test(shared_datadir, "no_filter_defined.vcf")
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    expected_message = r"Filter 'FAIL' is not defined in the header."
    with pytest.raises(ValueError, match=expected_message):
        vcf_to_zarr(vcf_path, zarr_path)
Exemplo n.º 14
0
def test_vcf_to_zarr__no_genotypes(shared_datadir, tmp_path):
    """Converting ``no_genotypes.vcf`` must raise the missing-genotype ``ValueError``."""
    vcf_path = path_for_test(shared_datadir, "no_genotypes.vcf")
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    expected_message = r"Genotype information missing from VCF."
    with pytest.raises(ValueError, match=expected_message):
        vcf_to_zarr(vcf_path, zarr_path)
Exemplo n.º 15
0
def test_vcf_to_zarr__contig_not_defined_in_header(shared_datadir, tmp_path):
    """Converting ``sample.vcf`` must raise the undefined-contig ``ValueError``."""
    # sample.vcf does not define the contigs in the header, and isn't indexed
    vcf_path = path_for_test(shared_datadir, "sample.vcf")
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    expected_message = r"Contig '19' is not defined in the header."
    with pytest.raises(ValueError, match=expected_message):
        vcf_to_zarr(vcf_path, zarr_path)
Exemplo n.º 16
0
def test_vcf_to_zarr__plain_vcf_with_no_index(shared_datadir, tmp_path):
    """Convert ``mixed.vcf`` with ``truncate_calls=True`` and check the sample count."""
    vcf_path = path_for_test(shared_datadir, "mixed.vcf")
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    vcf_to_zarr(vcf_path, zarr_path, truncate_calls=True)

    ds = xr.open_zarr(zarr_path)  # type: ignore[no-untyped-call]
    sample_ids = ds["sample_id"]
    assert sample_ids.shape == (3,)
Exemplo n.º 17
0
def test_vcf_to_zarr__large_number_of_contigs(shared_datadir, tmp_path):
    """Convert a header-only VCF and check the contig count and ``variant_contig`` dtype."""
    vcf_path = path_for_test(
        shared_datadir, "Homo_sapiens_assembly38.headerOnly.vcf.gz"
    )
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    vcf_to_zarr(vcf_path, zarr_path)
    ds = xr.open_zarr(zarr_path)

    contigs = ds.attrs["contigs"]
    assert len(contigs) == 3366
    # needs larger dtype than np.int8
    assert ds["variant_contig"].dtype == np.int16
Exemplo n.º 18
0
def test_vcf_to_zarr__call_genotype_dtype(shared_datadir, tmp_path,
                                          max_alt_alleles, dtype, warning):
    """Check the ``call_genotype`` dtype and value range for a given ``max_alt_alleles``."""
    vcf_path = path_for_test(shared_datadir, "allele_overflow.vcf.gz")
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    if not warning:
        vcf_to_zarr(vcf_path, zarr_path, max_alt_alleles=max_alt_alleles)
    else:
        # This parametrization expects the conversion itself to warn.
        with pytest.warns(MaxAltAllelesExceededWarning):
            vcf_to_zarr(vcf_path, zarr_path, max_alt_alleles=max_alt_alleles)

    genotypes = load_dataset(zarr_path).call_genotype
    assert genotypes.dtype == dtype
    assert genotypes.values.max() <= max_alt_alleles
Exemplo n.º 19
0
def create_sg_vcfzarr(
    shared_datadir: Path,
    tmpdir: Path,
    *,
    vcf_file: str = "sample.vcf.gz",
    **kwargs: Any,
) -> Path:
    """Convert a VCF from ``shared_datadir`` to Zarr under ``tmpdir`` using sgkit.

    The store is written to ``tmpdir / "sg_<vcf_file>.zarr"``. Extra keyword
    arguments are forwarded unchanged to ``vcf_to_zarr``. Returns the path of
    the store that was written.
    """
    source = shared_datadir / vcf_file
    destination = tmpdir / f"sg_{vcf_file}.zarr"
    # The output location is handed to vcf_to_zarr as a string.
    vcf_to_zarr(source, str(destination), **kwargs)
    return destination
Exemplo n.º 20
0
def test_vcf_to_zarr__parallel_partitioned(shared_datadir, is_path, tmp_path):
    """Convert a VCF split into four regions and check sample and variant counts."""
    vcf_path = path_for_test(
        shared_datadir,
        "1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz",
        is_path,
    )
    zarr_path = (tmp_path / "vcf_concat.zarr").as_posix()

    parts = partition_into_regions(vcf_path, num_parts=4)
    vcf_to_zarr(
        vcf_path,
        zarr_path,
        regions=parts,
        chunk_length=1_000,
        chunk_width=1_000,
    )
    ds = xr.open_zarr(zarr_path)  # type: ignore[no-untyped-call]

    for name, shape in (("sample_id", (2535,)), ("variant_id", (1406,))):
        assert ds[name].shape == shape
Exemplo n.º 21
0
def test_vcf_to_zarr__mixed_ploidy_vcf_exception(
    shared_datadir, tmp_path, ploidy, mixed_ploidy, truncate_calls
):
    """Converting ``mixed.vcf.gz`` with these settings must raise ``ValueError``."""
    vcf_path = path_for_test(shared_datadir, "mixed.vcf.gz")
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    ploidy_options = {
        "ploidy": ploidy,
        "mixed_ploidy": mixed_ploidy,
        "truncate_calls": truncate_calls,
    }
    with pytest.raises(ValueError) as raised:
        vcf_to_zarr(vcf_path, zarr_path, **ploidy_options)

    # The message must match exactly, not merely contain the text.
    message = str(raised.value)
    assert message == "Genotype call longer than ploidy."
Exemplo n.º 22
0
def test_vcf_to_zarr__parallel_temp_chunk_length_not_divisible(
    shared_datadir, tmp_path
):
    """A ``temp_chunk_length`` that does not divide ``chunk_length`` must raise ``ValueError``."""
    vcf_path = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", False
    )
    zarr_path = (tmp_path / "vcf_concat.zarr").as_posix()

    expected_message = r"Temporary chunk length in variant dimension \(4000\) must evenly divide target chunk length 5000"
    with pytest.raises(ValueError, match=expected_message):
        # Use a temp_chunk_length that does not divide into chunk_length
        vcf_to_zarr(
            vcf_path,
            zarr_path,
            regions=["20", "21"],
            chunk_length=5_000,
            temp_chunk_length=4_000,
        )
Exemplo n.º 23
0
def test_vcf_to_zarr__field_number_G_non_diploid(shared_datadir, tmp_path):
    """Convert FORMAT/GL with ploidy 4 and check the array shape and comment."""
    vcf_path = path_for_test(
        shared_datadir, "simple.output.mixed_depth.likelihoods.vcf"
    )
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    vcf_to_zarr(
        vcf_path,
        zarr_path,
        ploidy=4,
        max_alt_alleles=3,
        fields=["FORMAT/GL"],
    )
    likelihoods = xr.open_zarr(zarr_path)["call_GL"]

    # comb(n_alleles + ploidy - 1, ploidy) = comb(4 + 4 - 1, 4) = comb(7, 4) = 35
    assert_array_equal(likelihoods.shape, (4, 3, 35))
    assert likelihoods.attrs["comment"] == "Genotype likelihoods"
Exemplo n.º 24
0
def test_vcf_to_zarr__mutiple_partitioned_invalid_regions(
    shared_datadir, is_path, tmp_path
):
    """Passing one file's regions for several input files must raise ``ValueError``."""
    file_names = ("CEUTrio.20.gatk3.4.g.vcf.bgz", "CEUTrio.21.gatk3.4.g.vcf.bgz")
    vcf_paths = [path_for_test(shared_datadir, name, is_path) for name in file_names]
    zarr_path = (tmp_path / "vcf_concat.zarr").as_posix()

    # invalid regions, should be a sequence of sequences
    flat_regions = partition_into_regions(vcf_paths[0], num_parts=2)

    expected_message = (
        r"multiple input regions must be a sequence of sequence of strings"
    )
    with pytest.raises(ValueError, match=expected_message):
        vcf_to_zarr(vcf_paths, zarr_path, regions=flat_regions, chunk_length=5_000)
Exemplo n.º 25
0
def test_vcf_to_zarr__fields(shared_datadir, tmp_path):
    """Convert several INFO/FORMAT fields and check their values and comments."""
    vcf_path = path_for_test(shared_datadir, "sample.vcf.gz")
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    requested = ["INFO/DP", "INFO/AA", "INFO/DB", "FORMAT/DP"]
    vcf_to_zarr(
        vcf_path,
        zarr_path,
        chunk_length=5,
        chunk_width=2,
        fields=requested,
    )
    ds = xr.open_zarr(zarr_path)

    variant_dp = ds["variant_DP"]
    expected_variant_dp = [INT_FILL] * 2 + [14, 11, 10, 13, 9] + [INT_FILL] * 2
    assert_array_equal(variant_dp, expected_variant_dp)
    assert variant_dp.attrs["comment"] == "Total Depth"

    variant_aa = ds["variant_AA"]
    expected_aa = np.array(["", "", "", "", "T", "T", "G", "", ""], dtype="O")
    assert_array_equal(variant_aa, expected_aa)
    assert variant_aa.attrs["comment"] == "Ancestral Allele"

    variant_db = ds["variant_DB"]
    expected_db = [False, False, True, False, True, False, False, False, False]
    assert_array_equal(variant_db, expected_db)
    assert variant_db.attrs["comment"] == "dbSNP membership, build 129"

    call_dp = ds["call_DP"]
    filled_row = [INT_FILL] * 3
    expected_call_dp = np.array(
        [
            filled_row,
            filled_row,
            [1, 8, 5],
            [3, 5, 3],
            [6, 0, 4],
            [INT_MISSING, 4, 2],
            [4, 2, 3],
            filled_row,
            filled_row,
        ],
        dtype="i4",
    )
    assert_array_equal(call_dp, expected_call_dp)
    assert call_dp.attrs["comment"] == "Read Depth"
Exemplo n.º 26
0
def test_vcf_to_zarr__empty_region(shared_datadir, is_path, tmp_path):
    """Convert region ``"23"`` and check that every variant-indexed array is empty."""
    vcf_path = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path
    )
    zarr_path = (tmp_path / "vcf_concat.zarr").as_posix()

    vcf_to_zarr(vcf_path, zarr_path, regions="23")
    ds = xr.open_zarr(zarr_path)

    expected_shapes = {
        "sample_id": (1,),
        "call_genotype": (0, 1, 2),
        "call_genotype_mask": (0, 1, 2),
        "call_genotype_phased": (0, 1),
        "variant_allele": (0, 4),
        "variant_contig": (0,),
        "variant_id": (0,),
        "variant_id_mask": (0,),
        "variant_position": (0,),
    }
    for name, shape in expected_shapes.items():
        assert ds[name].shape == shape
Exemplo n.º 27
0
def test_vcf_to_zarr__parallel_partitioned_by_size(shared_datadir, is_path,
                                                   tmp_path):
    """Convert with ``target_part_size="4MB"`` and check sample and variant counts."""
    vcf_path = path_for_test(
        shared_datadir,
        "1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz",
        is_path,
    )
    zarr_path = (tmp_path / "vcf_concat.zarr").as_posix()

    options = {
        "target_part_size": "4MB",
        "chunk_length": 1_000,
        "chunk_width": 1_000,
    }
    vcf_to_zarr(vcf_path, zarr_path, **options)
    ds = xr.open_zarr(zarr_path)

    for name, shape in (("sample_id", (2535,)), ("variant_id", (1406,))):
        assert ds[name].shape == shape
Exemplo n.º 28
0
def test_vcf_to_zarr__no_genotypes(shared_datadir, tmp_path):
    """Convert ``no_genotypes.vcf``; genotype variables must be absent from the output."""
    vcf_path = path_for_test(shared_datadir, "no_genotypes.vcf")
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    vcf_to_zarr(vcf_path, zarr_path)
    ds = xr.open_zarr(zarr_path)

    for absent in ("call_genotype", "call_genotype_mask", "call_genotype_phased"):
        assert absent not in ds

    expected_shapes = {
        "sample_id": (0,),
        "variant_allele": (26, 4),
        "variant_contig": (26,),
        "variant_id": (26,),
        "variant_id_mask": (26,),
        "variant_position": (26,),
    }
    for name, shape in expected_shapes.items():
        assert ds[name].shape == shape
Exemplo n.º 29
0
def test_vcf_to_zarr__no_genotypes_with_gt_header(shared_datadir, tmp_path):
    """Convert ``no_genotypes_with_gt_header.vcf`` and check genotype and variant variables."""
    vcf_path = path_for_test(shared_datadir, "no_genotypes_with_gt_header.vcf")
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    vcf_to_zarr(vcf_path, zarr_path)
    ds = xr.open_zarr(zarr_path)

    # Each genotype variable is compared against a single scalar value.
    genotype_values = (
        ("call_genotype", -1),
        ("call_genotype_mask", 1),
        ("call_genotype_phased", 0),
    )
    for name, value in genotype_values:
        assert_array_equal(ds[name], value)

    expected_shapes = {
        "sample_id": (0,),
        "variant_allele": (26, 4),
        "variant_contig": (26,),
        "variant_id": (26,),
        "variant_id_mask": (26,),
        "variant_position": (26,),
    }
    for name, shape in expected_shapes.items():
        assert ds[name].shape == shape
Exemplo n.º 30
0
def test_vcf_to_zarr__large_vcf(shared_datadir, is_path, tmp_path):
    """Convert the CEUTrio 20/21 VCF in one pass and check shapes and string dtypes."""
    vcf_path = path_for_test(
        shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path
    )
    zarr_path = (tmp_path / "vcf.zarr").as_posix()

    vcf_to_zarr(vcf_path, zarr_path, chunk_length=5_000)
    ds = xr.open_zarr(zarr_path)  # type: ignore[no-untyped-call]

    expected_shapes = {
        "sample_id": (1,),
        "call_genotype": (19910, 1, 2),
        "call_genotype_mask": (19910, 1, 2),
        "call_genotype_phased": (19910, 1),
        "variant_allele": (19910, 4),
        "variant_contig": (19910,),
        "variant_id": (19910,),
        "variant_id_mask": (19910,),
        "variant_position": (19910,),
    }
    for name, shape in expected_shapes.items():
        assert ds[name].shape == shape

    for name in ("variant_allele", "variant_id"):
        assert ds[name].dtype == "O"