Example #1
def test_region_partitioned_read():
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")

    cfg = tiledbvcf.ReadConfig(region_partition=(0, 2))
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
    df = ds.read(
        attrs=["sample_name", "pos_start", "pos_end"],
        regions=["1:12000-13000", "1:17000-18000"],
    )
    assert len(df) == 4

    cfg = tiledbvcf.ReadConfig(region_partition=(1, 2))
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
    df = ds.read(
        attrs=["sample_name", "pos_start", "pos_end"],
        regions=["1:12000-13000", "1:17000-18000"],
    )
    assert len(df) == 2

    # Too many partitions still produces results
    cfg = tiledbvcf.ReadConfig(region_partition=(1, 3))
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
    df = ds.read(
        attrs=["sample_name", "pos_start", "pos_end"],
        regions=["1:12000-13000", "1:17000-18000"],
    )
    assert len(df) == 2

    # Error: index >= num partitions
    cfg = tiledbvcf.ReadConfig(region_partition=(2, 2))
    with pytest.raises(RuntimeError):
        ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
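All of these examples assume a common pytest preamble that is not shown: standard imports plus a TESTS_INPUT_DIR constant pointing at the checked-in test data. A minimal sketch of that assumed setup (the actual path and fixture layout depend on the repository):

# Assumed (not shown in the examples): shared test imports and input directory.
import os

import numpy as np
import pandas as pd
import pytest

import tiledbvcf

# Hypothetical location of the test arrays; adjust to the actual repo layout.
TESTS_INPUT_DIR = os.path.abspath("tests/inputs")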
Example #2
def test_sample_partitioned_read():
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')

    cfg = tiledbvcf.ReadConfig(sample_partition=(0, 2))
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
    df = ds.read(attrs=['sample_name', 'pos_start', 'pos_end'],
                 regions=['1:12000-18000'])
    assert len(df) == 11
    assert (df.sample_name == 'HG00280').all()

    cfg = tiledbvcf.ReadConfig(sample_partition=(1, 2))
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
    df = ds.read(attrs=['sample_name', 'pos_start', 'pos_end'],
                 regions=['1:12000-18000'])
    assert len(df) == 3
    assert (df.sample_name == 'HG01762').all()

    # Error: too many partitions
    cfg = tiledbvcf.ReadConfig(sample_partition=(1, 3))
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
    with pytest.raises(RuntimeError):
        df = ds.read(attrs=['sample_name', 'pos_start', 'pos_end'],
                     regions=['1:12000-18000'])

    # Error: index >= num partitions
    cfg = tiledbvcf.ReadConfig(sample_partition=(2, 2))
    with pytest.raises(RuntimeError):
        ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
Example #3
def test_sample_partitioned_read():
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")

    cfg = tiledbvcf.ReadConfig(sample_partition=(0, 2))
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
    df = ds.read(attrs=["sample_name", "pos_start", "pos_end"],
                 regions=["1:12000-18000"])
    assert len(df) == 11
    assert (df.sample_name == "HG00280").all()

    cfg = tiledbvcf.ReadConfig(sample_partition=(1, 2))
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
    df = ds.read(attrs=["sample_name", "pos_start", "pos_end"],
                 regions=["1:12000-18000"])
    assert len(df) == 3
    assert (df.sample_name == "HG01762").all()

    # Error: too many partitions
    cfg = tiledbvcf.ReadConfig(sample_partition=(1, 3))
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
    with pytest.raises(RuntimeError):
        df = ds.read(attrs=["sample_name", "pos_start", "pos_end"],
                     regions=["1:12000-18000"])

    # Error: index >= num partitions
    cfg = tiledbvcf.ReadConfig(sample_partition=(2, 2))
    with pytest.raises(RuntimeError):
        ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
Example #4
def test_region_partitioned_read():
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')

    cfg = tiledbvcf.ReadConfig(region_partition=(0, 2))
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
    df = ds.read(attrs=['sample_name', 'pos_start', 'pos_end'],
                 regions=['1:12000-13000', '1:17000-18000'])
    assert len(df) == 4

    cfg = tiledbvcf.ReadConfig(region_partition=(1, 2))
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
    df = ds.read(attrs=['sample_name', 'pos_start', 'pos_end'],
                 regions=['1:12000-13000', '1:17000-18000'])
    assert len(df) == 2

    # Too many partitions still produces results
    cfg = tiledbvcf.ReadConfig(region_partition=(1, 3))
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
    df = ds.read(attrs=['sample_name', 'pos_start', 'pos_end'],
                 regions=['1:12000-13000', '1:17000-18000'])
    assert len(df) == 2

    # Error: index >= num partitions
    cfg = tiledbvcf.ReadConfig(region_partition=(2, 2))
    with pytest.raises(RuntimeError):
        ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
Example #5
def test_tbb_threads_config():
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    cfg = tiledbvcf.ReadConfig(tiledb_config=["sm.num_tbb_threads=3"])
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)

    cfg = tiledbvcf.ReadConfig(tiledb_config=["sm.num_tbb_threads=4"])
    with pytest.raises(RuntimeError):
        ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
Example #6
def test_tbb_threads_config():
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(tiledb_config=['sm.num_tbb_threads=3'])
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)

    cfg = tiledbvcf.ReadConfig(tiledb_config=['sm.num_tbb_threads=4'])
    with pytest.raises(RuntimeError):
        ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
Example #7
def test_read_config():
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    cfg = tiledbvcf.ReadConfig()
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)

    cfg = tiledbvcf.ReadConfig(memory_budget_mb=512,
                               region_partition=(0, 3),
                               tiledb_config=['sm.tile_cache_size=0',
                                              'sm.num_reader_threads=1'])
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)

    with pytest.raises(TypeError):
        cfg = tiledbvcf.ReadConfig(abc=123)
Example #8
def test_read_config():
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    cfg = tiledbvcf.ReadConfig()
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)

    cfg = tiledbvcf.ReadConfig(
        memory_budget_mb=512,
        region_partition=(0, 3),
        tiledb_config=["sm.tile_cache_size=0", "sm.compute_concurrency_level=1"],
    )
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)

    with pytest.raises(TypeError):
        cfg = tiledbvcf.ReadConfig(abc=123)
Example #9
def test_incomplete_reads():
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)

    df = test_ds.read(attrs=["pos_end"], regions=["1:12700-13400"])
    assert not test_ds.read_completed()
    assert len(df) == 2
    _check_dfs(
        pd.DataFrame.from_dict(
            {"pos_end": np.array([12771, 12771], dtype=np.int32)}),
        df,
    )

    df = test_ds.continue_read()
    assert not test_ds.read_completed()
    assert len(df) == 2
    _check_dfs(
        pd.DataFrame.from_dict(
            {"pos_end": np.array([13374, 13389], dtype=np.int32)}),
        df,
    )

    df = test_ds.continue_read()
    assert test_ds.read_completed()
    assert len(df) == 2
    _check_dfs(
        pd.DataFrame.from_dict(
            {"pos_end": np.array([13395, 13413], dtype=np.int32)}),
        df,
    )
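Several examples compare results through a _check_dfs helper that is not shown. A plausible minimal sketch, assuming it simply asserts column-wise equality of two DataFrames (the real helper in the test suite may differ):

# Hypothetical helper assumed by these examples: compare two DataFrames column
# by column so a failure reports the offending attribute.
def _check_dfs(expected, actual):
    assert set(expected.columns) == set(actual.columns)
    for col in expected.columns:
        pd.testing.assert_series_equal(expected[col], actual[col], check_names=False)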
Example #10
def test_read_limit():
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(limit=3)
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
    df = ds.read(attrs=['sample_name', 'pos_start', 'pos_end',
                        'fmt_DP', 'fmt_PL'],
                 regions=['1:12100-13360', '1:13500-17350'])
    assert len(df) == 3
Example #11
def test_read_limit():
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    cfg = tiledbvcf.ReadConfig(limit=3)
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
    df = ds.read(
        attrs=["sample_name", "pos_start", "pos_end", "fmt_DP", "fmt_PL"],
        regions=["1:12100-13360", "1:13500-17350"],
    )
    assert len(df) == 3
Example #12
def test_sample_and_region_partitioned_read():
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")

    cfg = tiledbvcf.ReadConfig(region_partition=(0, 2),
                               sample_partition=(0, 2))
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
    df = ds.read(
        attrs=["sample_name", "pos_start", "pos_end"],
        regions=["1:12000-13000", "1:17000-18000"],
    )
    assert len(df) == 2
    assert (df.sample_name == "HG00280").all()

    cfg = tiledbvcf.ReadConfig(region_partition=(0, 2),
                               sample_partition=(1, 2))
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
    df = ds.read(
        attrs=["sample_name", "pos_start", "pos_end"],
        regions=["1:12000-13000", "1:17000-18000"],
    )
    assert len(df) == 2
    assert (df.sample_name == "HG01762").all()

    cfg = tiledbvcf.ReadConfig(region_partition=(1, 2),
                               sample_partition=(0, 2))
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
    df = ds.read(
        attrs=["sample_name", "pos_start", "pos_end"],
        regions=["1:12000-13000", "1:17000-18000"],
    )
    assert len(df) == 2
    assert (df.sample_name == "HG00280").all()

    cfg = tiledbvcf.ReadConfig(region_partition=(1, 2),
                               sample_partition=(1, 2))
    ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)
    df = ds.read(
        attrs=["sample_name", "pos_start", "pos_end"],
        regions=["1:12000-13000", "1:17000-18000"],
    )
    assert len(df) == 0
Example #13
def test_incomplete_reads():
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/v3/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.Dataset(uri, mode='r', cfg=cfg)

    expected_df = pd.DataFrame(
        {'sample_name': pd.Series(
            ['HG00280', 'HG01762', 'HG00280', 'HG01762', 'HG00280',
             'HG01762', 'HG00280', 'HG00280', 'HG00280', 'HG00280',
             'HG00280', 'HG00280', 'HG00280', 'HG00280']),
            'pos_start': pd.Series(
                [12141, 12141, 12546, 12546, 13354, 13354, 13375, 13396,
                 13414, 13452, 13520, 13545, 17319, 17480], dtype=np.int32),
            'pos_end': pd.Series(
                [12277, 12277, 12771, 12771, 13374, 13389, 13395, 13413,
                 13451, 13519, 13544, 13689, 17479, 17486], dtype=np.int32)})

    # Region partitions
    dask_df = test_ds.read_dask(attrs=['sample_name', 'pos_start', 'pos_end'],
                                region_partitions=10)
    df = dask_df.compute()
    _check_dfs(expected_df, df)

    # Sample partitions (we have to sort to check the result)
    dask_df = test_ds.read_dask(attrs=['sample_name', 'pos_start', 'pos_end'],
                                sample_partitions=2)
    df = dask_df.compute().sort_values('sample_name').reset_index(drop=True)
    _check_dfs(expected_df.sort_values('sample_name').reset_index(drop=True),
               df)

    # Both
    dask_df = test_ds.read_dask(attrs=['sample_name', 'pos_start', 'pos_end'],
                                region_partitions=10, sample_partitions=2)
    df = dask_df.compute().sort_values('sample_name').reset_index(drop=True)
    _check_dfs(expected_df.sort_values('sample_name').reset_index(drop=True),
               df)

    # No partitioning
    dask_df = test_ds.read_dask(attrs=['sample_name', 'pos_start', 'pos_end'])
    df = dask_df.compute()
    _check_dfs(expected_df, df)

    # Subset of partitions (limit_partitions)
    dask_df = test_ds.read_dask(attrs=['sample_name', 'pos_start', 'pos_end'],
                                region_partitions=10, sample_partitions=2,
                                limit_partitions=2)
    assert dask_df.npartitions == 2
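The Dask-based examples call compute() directly, which uses whatever scheduler Dask has active. To run them against an explicit local cluster instead, a minimal sketch (assuming dask[distributed] is installed; none of this is required by the tests themselves):

# Optional local Dask cluster for the read_dask/map_dask examples.
from dask.distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=2, threads_per_worker=1)
client = Client(cluster)  # subsequent compute() calls run on this cluster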
Example #14
def test_incomplete_read_generator():
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)

    overall_df = None
    for df in test_ds.read_iter(attrs=['pos_end'], regions=['1:12700-13400']):
        if overall_df is None:
            overall_df = df
        else:
            # DataFrame.append was removed in pandas 2.0; concatenate instead.
            overall_df = pd.concat([overall_df, df], ignore_index=True)

    assert len(overall_df) == 6
    _check_dfs(pd.DataFrame.from_dict(
        {'pos_end': np.array([12771, 12771, 13374, 13389, 13395, 13413],
                             dtype=np.int32)}), overall_df)
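Because read_iter yields ordinary pandas DataFrames, the accumulation loop above can also be written as a single concatenation over the generator; a short equivalent sketch under the same assumptions:

# Equivalent one-shot aggregation: pd.concat accepts any iterable of DataFrames.
overall_df = pd.concat(
    test_ds.read_iter(attrs=['pos_end'], regions=['1:12700-13400']),
    ignore_index=True,
)
assert len(overall_df) == 6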
Example #15
def test_map_incomplete():
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)

    expected_df = pd.DataFrame(
        {'sample_name': pd.Series(['HG00280', 'HG01762']),
         'pos_start': pd.Series([12141, 12141], dtype=np.int32),
         'pos_end': pd.Series([12277, 12277], dtype=np.int32)})

    # Region partitions
    dask_df = test_ds.map_dask(lambda df: df[df.pos_start * 2 < 25000],
                               attrs=['sample_name', 'pos_start', 'pos_end'],
                               region_partitions=10)
    df = dask_df.compute()
    _check_dfs(expected_df, df)
Example #16
def test_map_incomplete():
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)

    expected_df = pd.DataFrame(
        {
            "sample_name": pd.Series(["HG00280", "HG01762"]),
            "pos_start": pd.Series([12141, 12141], dtype=np.int32),
            "pos_end": pd.Series([12277, 12277], dtype=np.int32),
        }
    )

    # Region partitions
    dask_df = test_ds.map_dask(
        lambda df: df[df.pos_start * 2 < 25000],
        attrs=["sample_name", "pos_start", "pos_end"],
        region_partitions=10,
    )
    df = dask_df.compute()
    _check_dfs(expected_df, df)
Example #17
def test_ingest_disable_merging(tmp_path):
    # Create the dataset
    uri = os.path.join(tmp_path, "dataset_disable_merging")

    cfg = tiledbvcf.ReadConfig(memory_budget_mb=1024)
    attrs = ["sample_name", "contig", "pos_start", "pos_end"]

    ds = tiledbvcf.Dataset(uri, mode="w")
    samples = [
        os.path.join(TESTS_INPUT_DIR, s) for s in ["v2-DjrIAzkP-downsampled.vcf.gz"]
    ]
    ds.create_dataset()
    ds.ingest_samples(samples, contig_fragment_merging=False)

    # Open it back in read mode and check some queries
    ds = tiledbvcf.Dataset(uri, cfg=cfg, mode="r", verbose=False)
    df = ds.read(attrs=attrs)
    assert ds.count() == 246
    assert ds.count(regions=["chrX:9032893-9032893"]) == 1

    # Create a second dataset, keeping chr1 in its own fragments during ingestion
    uri = os.path.join(tmp_path, "dataset_merging_separate")
    ds2 = tiledbvcf.Dataset(uri, mode="w", verbose=True)
    samples = [
        os.path.join(TESTS_INPUT_DIR, s) for s in ["v2-DjrIAzkP-downsampled.vcf.gz"]
    ]
    ds2.create_dataset()
    ds2.ingest_samples(samples, contigs_to_keep_separate=["chr1"])

    # Open it back in read mode and check some queries
    ds2 = tiledbvcf.Dataset(uri, cfg=cfg, mode="r", verbose=True)
    df2 = ds2.read(attrs=attrs)
    print(df.equals(df2))
    assert df.equals(df2)

    assert ds.count() == 246
    assert ds.count(regions=["chrX:9032893-9032893"]) == 1
Example #18
def test_incomplete_reads():
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, "arrays/v3/ingested_2samples")
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.Dataset(uri, mode="r", cfg=cfg)

    expected_df = pd.DataFrame({
        "sample_name": pd.Series([
            "HG00280", "HG01762", "HG00280", "HG01762", "HG00280", "HG01762",
            "HG00280", "HG00280", "HG00280", "HG00280", "HG00280", "HG00280",
            "HG00280", "HG00280",
        ]),
        "pos_start": pd.Series([
            12141, 12141, 12546, 12546, 13354, 13354, 13375, 13396, 13414,
            13452, 13520, 13545, 17319, 17480,
        ], dtype=np.int32),
        "pos_end": pd.Series([
            12277, 12277, 12771, 12771, 13374, 13389, 13395, 13413, 13451,
            13519, 13544, 13689, 17479, 17486,
        ], dtype=np.int32),
    })

    # Region partitions
    dask_df = test_ds.read_dask(  # pylint:disable=no-member
        attrs=["sample_name", "pos_start", "pos_end"],
        region_partitions=10)
    df = dask_df.compute()
    _check_dfs(expected_df, df)

    # Sample partitions (we have to sort to check the result)
    dask_df = test_ds.read_dask(attrs=["sample_name", "pos_start", "pos_end"],
                                sample_partitions=2)  # pylint:disable=no-member
    df = dask_df.compute().sort_values("sample_name").reset_index(drop=True)
    _check_dfs(
        expected_df.sort_values("sample_name").reset_index(drop=True), df)

    # Both
    dask_df = test_ds.read_dask(
        attrs=["sample_name", "pos_start", "pos_end"],
        region_partitions=10,
        sample_partitions=2,
    )  # pylint:disable=no-member
    df = dask_df.compute().sort_values("sample_name").reset_index(drop=True)
    _check_dfs(
        expected_df.sort_values("sample_name").reset_index(drop=True), df)

    # No partitioning
    dask_df = test_ds.read_dask(attrs=["sample_name", "pos_start", "pos_end"])  # pylint:disable=no-member
    df = dask_df.compute()
    _check_dfs(expected_df, df)

    # Subset of partitions (limit_partitions)
    dask_df = test_ds.read_dask(
        attrs=["sample_name", "pos_start", "pos_end"],
        region_partitions=10,
        sample_partitions=2,
        limit_partitions=2,
    )  # pylint:disable=no-member
    assert dask_df.npartitions == 2
Example #19
def test_sample_and_region_partitioned_read():
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')

    cfg = tiledbvcf.ReadConfig(region_partition=(0, 2),
                               sample_partition=(0, 2))
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
    df = ds.read(attrs=['sample_name', 'pos_start', 'pos_end'],
                 regions=['1:12000-13000', '1:17000-18000'])
    assert len(df) == 2
    assert (df.sample_name == 'HG00280').all()

    cfg = tiledbvcf.ReadConfig(region_partition=(0, 2),
                               sample_partition=(1, 2))
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
    df = ds.read(attrs=['sample_name', 'pos_start', 'pos_end'],
                 regions=['1:12000-13000', '1:17000-18000'])
    assert len(df) == 2
    assert (df.sample_name == 'HG01762').all()

    cfg = tiledbvcf.ReadConfig(region_partition=(1, 2),
                               sample_partition=(0, 2))
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
    df = ds.read(attrs=['sample_name', 'pos_start', 'pos_end'],
                 regions=['1:12000-13000', '1:17000-18000'])
    assert len(df) == 2
    assert (df.sample_name == 'HG00280').all()

    cfg = tiledbvcf.ReadConfig(region_partition=(1, 2),
                               sample_partition=(1, 2))
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
    df = ds.read(attrs=['sample_name', 'pos_start', 'pos_end'],
                 regions=['1:12000-13000', '1:17000-18000'])
    assert len(df) == 0