示例#1
0
def test_sample_partitioned_read():
    """Check reads of one region when samples are split into partitions.

    With two sample partitions, partition 0 is expected to return 11 records
    (all ``HG00280``) and partition 1 is expected to return 3 records (all
    ``HG01762``). Asking for more partitions than the read can satisfy, or a
    partition index that is not smaller than the partition count, must raise
    ``RuntimeError``.
    """
    dataset_uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')

    def open_partition(index, count):
        # Open the dataset restricted to sample partition `index` of `count`.
        partition_cfg = tiledbvcf.ReadConfig(sample_partition=(index, count))
        return tiledbvcf.TileDBVCFDataset(dataset_uri, mode='r',
                                          cfg=partition_cfg)

    def read_region(dataset):
        # Every case in this test queries the same attributes and region.
        return dataset.read(attrs=['sample_name', 'pos_start', 'pos_end'],
                            regions=['1:12000-18000'])

    first_ds = open_partition(0, 2)
    first_df = read_region(first_ds)
    assert len(first_df) == 11
    assert (first_df.sample_name == 'HG00280').all()

    second_ds = open_partition(1, 2)
    second_df = read_region(second_ds)
    assert len(second_df) == 3
    assert (second_df.sample_name == 'HG01762').all()

    # Error: too many partitions. Opening succeeds; the failure is only
    # expected once the read is attempted.
    oversplit_ds = open_partition(1, 3)
    with pytest.raises(RuntimeError):
        read_region(oversplit_ds)

    # Error: index >= num partitions. Here the failure is expected already
    # when the dataset is opened with the bad configuration.
    out_of_range_cfg = tiledbvcf.ReadConfig(sample_partition=(2, 2))
    with pytest.raises(RuntimeError):
        tiledbvcf.TileDBVCFDataset(dataset_uri, mode='r',
                                   cfg=out_of_range_cfg)
示例#2
0
def test_read_null_attrs(tmp_path):
    """Ingest two BCF files and check that missing INFO values read as null.

    The dataset is created under ``tmp_path``, reopened for reading, and two
    regions are queried. Of the 12 expected records only two carry
    ``info_BaseQRankSum`` / ``info_DP`` values; the remaining cells are
    expected to be ``None``.
    """
    dataset_uri = os.path.join(tmp_path, 'dataset')
    bcf_paths = [os.path.join(TESTS_INPUT_DIR, file_name)
                 for file_name in ('small3.bcf', 'small.bcf')]
    writer = tiledbvcf.TileDBVCFDataset(dataset_uri, mode='w')
    writer.ingest_samples(bcf_paths)

    reader = tiledbvcf.TileDBVCFDataset(dataset_uri, mode='r')
    actual_df = reader.read(
        attrs=['sample_name', 'pos_start', 'pos_end',
               'info_BaseQRankSum', 'info_DP', 'fmt_DP'],
        regions=['1:12700-13400', '1:69500-69800'])

    def sparse_column(row7_value, row9_value, dtype):
        # 12 rows, null everywhere except rows 7 and 9, which each hold a
        # one-element array of the given dtype.
        cells = [None] * 12
        cells[7] = np.array([row7_value], dtype=dtype)
        cells[9] = np.array([row9_value], dtype=dtype)
        return pd.Series(cells)

    sample_names = pd.Series(
        ['HG00280', 'HG01762', 'HG00280', 'HG01762'] + ['HG00280'] * 8)
    starts = pd.Series(
        [12546, 12546, 13354, 13354, 13375, 13396,
         69371, 69511, 69512, 69761, 69762, 69771],
        dtype=np.int32)
    ends = pd.Series(
        [12771, 12771, 13374, 13389, 13395, 13413,
         69510, 69511, 69760, 69761, 69770, 69834],
        dtype=np.int32)
    depths = pd.Series(
        [np.array(depth, dtype=np.int32)
         for depth in ([0], [0], [15], [64], [6], [2],
                       [180], [88], [97], [24], [23], [21])])

    # Column order mirrors the order of the requested attributes.
    expected_df = pd.DataFrame({
        'sample_name': sample_names,
        'pos_start': starts,
        'pos_end': ends,
        'info_BaseQRankSum': sparse_column(-0.787, 1.97, np.float32),
        'info_DP': sparse_column(89, 24, np.int32),
        'fmt_DP': depths,
    })
    _check_dfs(expected_df, actual_df)
示例#3
0
def test_region_partitioned_read():
    """Check reads of two regions when the regions are split into partitions.

    Each valid ``(index, count)`` region partition is opened and read, and
    the number of returned records is compared with the expected count. A
    partition index that is not smaller than the partition count must raise
    ``RuntimeError`` when the dataset is opened.
    """
    dataset_uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')

    # (region partition, expected record count). The final entry asks for
    # more partitions than there are regions, which still produces results.
    valid_cases = [
        ((0, 2), 4),
        ((1, 2), 2),
        ((1, 3), 2),
    ]
    for partition, expected_rows in valid_cases:
        cfg = tiledbvcf.ReadConfig(region_partition=partition)
        ds = tiledbvcf.TileDBVCFDataset(dataset_uri, mode='r', cfg=cfg)
        df = ds.read(attrs=['sample_name', 'pos_start', 'pos_end'],
                     regions=['1:12000-13000', '1:17000-18000'])
        assert len(df) == expected_rows

    # Error: index >= num partitions
    out_of_range_cfg = tiledbvcf.ReadConfig(region_partition=(2, 2))
    with pytest.raises(RuntimeError):
        tiledbvcf.TileDBVCFDataset(dataset_uri, mode='r',
                                   cfg=out_of_range_cfg)
示例#4
0
def test_tbb_threads_config():
    """Check that a second, different TBB thread count is rejected.

    Opening the dataset with ``sm.num_tbb_threads=3`` must succeed; opening
    it again with ``sm.num_tbb_threads=4`` must raise ``RuntimeError``.
    NOTE(review): presumably the thread count cannot be changed once it has
    been set within the process -- confirm against the TileDB documentation.
    """
    dataset_uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')

    three_threads = tiledbvcf.ReadConfig(
        tiledb_config=['sm.num_tbb_threads=3'])
    # Keep the first dataset bound while the second open is attempted.
    first_ds = tiledbvcf.TileDBVCFDataset(dataset_uri, mode='r',
                                          cfg=three_threads)

    four_threads = tiledbvcf.ReadConfig(
        tiledb_config=['sm.num_tbb_threads=4'])
    with pytest.raises(RuntimeError):
        tiledbvcf.TileDBVCFDataset(dataset_uri, mode='r', cfg=four_threads)
示例#5
0
def test_incremental_ingest(tmp_path):
    """Ingest two samples in separate calls and verify record counts.

    The dataset is created under ``tmp_path``; each BCF file is ingested with
    its own ``ingest_samples`` call, then the dataset is reopened for reading
    and counted overall, by region, and by sample plus region.
    """
    dataset_uri = os.path.join(tmp_path, 'dataset')
    writer = tiledbvcf.TileDBVCFDataset(dataset_uri, mode='w')
    for bcf_name in ('small.bcf', 'small2.bcf'):
        # One ingest call per file is the point of this test.
        writer.ingest_samples([os.path.join(TESTS_INPUT_DIR, bcf_name)])

    # Open it back in read mode and check some queries
    reader = tiledbvcf.TileDBVCFDataset(dataset_uri, mode='r')
    total = reader.count()
    assert total == 14
    in_region = reader.count(regions=['1:12700-13400'])
    assert in_region == 6
    one_sample_in_region = reader.count(samples=['HG00280'],
                                        regions=['1:12700-13400'])
    assert one_sample_in_region == 4
示例#6
0
def test_read_write_mode_exceptions():
    """Check that operations not matching the open mode raise.

    A dataset opened without an explicit mode must reject ``ingest_samples``
    (NOTE(review): presumably the default mode is read -- confirm), and a
    dataset opened with ``mode='w'`` must reject ``count``.
    """
    dataset_uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')

    default_mode_ds = tiledbvcf.TileDBVCFDataset(dataset_uri)
    bcf_paths = [os.path.join(TESTS_INPUT_DIR, file_name)
                 for file_name in ('small.bcf', 'small2.bcf')]
    with pytest.raises(Exception):
        default_mode_ds.ingest_samples(bcf_paths)

    write_mode_ds = tiledbvcf.TileDBVCFDataset(dataset_uri, mode='w')
    with pytest.raises(Exception):
        write_mode_ds.count()
示例#7
0
def test_basic_ingest(tmp_path):
    """Ingest two samples in a single call and verify record counts.

    The dataset is created under ``tmp_path``, both BCF files are ingested
    together, and the dataset is then reopened for reading and counted
    overall, by region, and by sample plus region.
    """
    # Create the dataset
    dataset_uri = os.path.join(tmp_path, 'dataset')
    writer = tiledbvcf.TileDBVCFDataset(dataset_uri, mode='w')
    bcf_paths = [os.path.join(TESTS_INPUT_DIR, file_name)
                 for file_name in ('small.bcf', 'small2.bcf')]
    writer.ingest_samples(bcf_paths)

    # Open it back in read mode and check some queries
    reader = tiledbvcf.TileDBVCFDataset(dataset_uri, mode='r')
    total = reader.count()
    assert total == 14
    in_region = reader.count(regions=['1:12700-13400'])
    assert in_region == 6
    one_sample_in_region = reader.count(samples=['HG00280'],
                                        regions=['1:12700-13400'])
    assert one_sample_in_region == 4
示例#8
0
def test_read_config():
    """Check that ``ReadConfig`` accepts known options and rejects unknown ones.

    The dataset is opened once with a default configuration and once with a
    memory budget, a region partition and extra TileDB settings. Passing an
    unknown keyword to ``ReadConfig`` must raise ``TypeError``.
    """
    dataset_uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')

    default_cfg = tiledbvcf.ReadConfig()
    default_ds = tiledbvcf.TileDBVCFDataset(dataset_uri, mode='r',
                                            cfg=default_cfg)

    tiledb_settings = ['sm.tile_cache_size=0', 'sm.num_reader_threads=1']
    tuned_cfg = tiledbvcf.ReadConfig(memory_budget_mb=512,
                                     region_partition=(0, 3),
                                     tiledb_config=tiledb_settings)
    tuned_ds = tiledbvcf.TileDBVCFDataset(dataset_uri, mode='r',
                                          cfg=tuned_cfg)

    # `abc` is not a recognised configuration option.
    with pytest.raises(TypeError):
        tiledbvcf.ReadConfig(abc=123)
示例#9
0
def test_read_limit():
    """Check that ``limit=3`` in the read configuration yields three records."""
    dataset_uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    limited_cfg = tiledbvcf.ReadConfig(limit=3)
    limited_ds = tiledbvcf.TileDBVCFDataset(dataset_uri, mode='r',
                                            cfg=limited_cfg)

    requested_attrs = ['sample_name', 'pos_start', 'pos_end',
                       'fmt_DP', 'fmt_PL']
    requested_regions = ['1:12100-13360', '1:13500-17350']
    limited_df = limited_ds.read(attrs=requested_attrs,
                                 regions=requested_regions)
    assert len(limited_df) == 3
示例#10
0
def test_read_multiple_alleles(tmp_path):
    """Ingest two BCF files and check records that carry several alleles.

    The dataset is created under ``tmp_path``, reopened for reading, and one
    region is queried. The ``alleles`` and ``filters`` columns are expected
    to hold one object-dtype array of strings per record.
    """
    uri = os.path.join(tmp_path, 'dataset')
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='w')
    samples = [os.path.join(TESTS_INPUT_DIR, s) for s in
               ['small3.bcf', 'small.bcf']]
    ds.ingest_samples(samples)

    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r')
    df = ds.read(attrs=['sample_name', 'pos_start', 'alleles', 'id', 'filters'],
                 regions=['1:70100-1300000'])

    # Use the builtin `object` as dtype: the `np.object` alias was deprecated
    # in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
    expected_alleles = [
        np.array(alleles, dtype=object) for alleles in
        [['T', 'CCCCTCCCT', 'C', 'CCCCTCCCTCCCT', 'CCCCT'], ['CTG', 'C']]]
    expected_filters = [
        np.array(filters, dtype=object) for filters in
        [['LowQual'], ['LowQual']]]

    expected_df = pd.DataFrame(
        {'sample_name': pd.Series(['HG00280', 'HG00280']),
         'pos_start': pd.Series(
             [866511, 1289367], dtype=np.int32),
         'alleles': pd.Series(expected_alleles),
         'id': pd.Series(['.', 'rs1497816']),
         'filters': pd.Series(expected_filters)})
    _check_dfs(expected_df, df)
示例#11
0
def test_map_incomplete():
    """Check ``map_dask`` with a filter function when reads are incomplete.

    The filter keeps rows whose doubled ``pos_start`` is below 25000; two
    records (one per sample) are expected to remain after computing the
    Dask dataframe built from 10 region partitions.
    """
    # Using undocumented "0 MB" budget to test incomplete reads.
    dataset_uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    zero_budget_cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.TileDBVCFDataset(dataset_uri, mode='r',
                                         cfg=zero_budget_cfg)

    def keep_early_positions(df):
        # Applied to each partial dataframe produced by the read.
        return df[df.pos_start * 2 < 25000]

    expected_df = pd.DataFrame({
        'sample_name': pd.Series(['HG00280', 'HG01762']),
        'pos_start': pd.Series([12141, 12141], dtype=np.int32),
        'pos_end': pd.Series([12277, 12277], dtype=np.int32),
    })

    # Region partitions
    lazy_df = test_ds.map_dask(keep_early_positions,
                               attrs=['sample_name', 'pos_start', 'pos_end'],
                               region_partitions=10)
    actual_df = lazy_df.compute()
    _check_dfs(expected_df, actual_df)
示例#12
0
def test_incomplete_read_generator():
    """Check that ``read_iter`` yields every batch of an incomplete read.

    The dataframes yielded for one region are concatenated and compared with
    the six expected ``pos_end`` values.
    """
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)

    batches = list(test_ds.read_iter(attrs=['pos_end'],
                                     regions=['1:12700-13400']))
    # Fail with a clear message instead of an opaque error further down if
    # the generator produced nothing at all.
    assert batches, 'read_iter yielded no dataframes'

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to stack the batches and renumber the index.
    overall_df = pd.concat(batches, ignore_index=True)

    assert len(overall_df) == 6
    _check_dfs(pd.DataFrame.from_dict(
        {'pos_end': np.array([12771, 12771, 13374, 13389, 13395, 13413],
                             dtype=np.int32)}), overall_df)
示例#13
0
def test_incomplete_reads():
    """Check ``read_dask`` results under several partitioning schemes.

    The whole dataset (14 records) is read with region partitions, sample
    partitions, both, and no partitioning. Whenever sample partitions are
    used, both the result and the expectation are sorted by sample name
    before comparison.
    """
    # Using undocumented "0 MB" budget to test incomplete reads.
    dataset_uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    zero_budget_cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.TileDBVCFDataset(dataset_uri, mode='r',
                                         cfg=zero_budget_cfg)

    expected_df = pd.DataFrame({
        'sample_name': pd.Series(
            ['HG00280', 'HG01762'] * 3 + ['HG00280'] * 8),
        'pos_start': pd.Series(
            [12141, 12141, 12546, 12546, 13354, 13354, 13375,
             13396, 13414, 13452, 13520, 13545, 17319, 17480],
            dtype=np.int32),
        'pos_end': pd.Series(
            [12277, 12277, 12771, 12771, 13374, 13389, 13395,
             13413, 13451, 13519, 13544, 13689, 17479, 17486],
            dtype=np.int32),
    })

    def read_all(**partitioning):
        # Build the Dask dataframe with the given partitioning and compute it.
        lazy_df = test_ds.read_dask(
            attrs=['sample_name', 'pos_start', 'pos_end'], **partitioning)
        return lazy_df.compute()

    def by_sample(frame):
        # Sort by sample name and renumber the rows for comparison.
        return frame.sort_values('sample_name').reset_index(drop=True)

    expected_by_sample = by_sample(expected_df)

    # Region partitions
    _check_dfs(expected_df, read_all(region_partitions=10))

    # Sample partitions (we have to sort to check the result)
    _check_dfs(expected_by_sample, by_sample(read_all(sample_partitions=2)))

    # Both
    _check_dfs(expected_by_sample,
               by_sample(read_all(region_partitions=10,
                                  sample_partitions=2)))

    # No partitioning
    _check_dfs(expected_df, read_all())
示例#14
0
def test_incomplete_reads():
    """Step through an incomplete read with ``read`` and ``continue_read``.

    Three batches of two records each are expected; the read is expected to
    report completion only after the third batch.

    NOTE(review): another function with this same name appears earlier in
    this listing. If both live in one module the later definition replaces
    the earlier one -- confirm they belong to separate files.
    """
    # Using undocumented "0 MB" budget to test incomplete reads.
    dataset_uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    zero_budget_cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.TileDBVCFDataset(dataset_uri, mode='r',
                                         cfg=zero_budget_cfg)

    def check_batch(batch, expected_pos_end):
        # Each batch must hold exactly the two expected `pos_end` values.
        assert len(batch) == 2
        expected = pd.DataFrame.from_dict(
            {'pos_end': np.array(expected_pos_end, dtype=np.int32)})
        _check_dfs(expected, batch)

    first_batch = test_ds.read(attrs=['pos_end'], regions=['1:12700-13400'])
    assert not test_ds.read_completed()
    check_batch(first_batch, [12771, 12771])

    second_batch = test_ds.continue_read()
    assert not test_ds.read_completed()
    check_batch(second_batch, [13374, 13389])

    last_batch = test_ds.continue_read()
    assert test_ds.read_completed()
    check_batch(last_batch, [13395, 13413])
示例#15
0
def test_ds():
    """Return the pre-ingested two-sample dataset opened with default options.

    NOTE(review): this returns a value instead of asserting, so it looks like
    a pytest fixture whose decorator is not visible here -- confirm.
    """
    dataset_uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    return tiledbvcf.TileDBVCFDataset(dataset_uri)
示例#16
-1
def test_sample_and_region_partitioned_read():
    """Check reads when both regions and samples are split into partitions.

    Every combination of two region partitions and two sample partitions is
    read; the record count is checked for each, and where records are
    expected they must all belong to a single sample.
    """
    dataset_uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')

    # (region partition, sample partition, expected rows, expected sample).
    # The last combination is expected to be empty, so no sample is checked.
    cases = [
        ((0, 2), (0, 2), 2, 'HG00280'),
        ((0, 2), (1, 2), 2, 'HG01762'),
        ((1, 2), (0, 2), 2, 'HG00280'),
        ((1, 2), (1, 2), 0, None),
    ]
    for region_part, sample_part, expected_rows, expected_sample in cases:
        cfg = tiledbvcf.ReadConfig(region_partition=region_part,
                                   sample_partition=sample_part)
        ds = tiledbvcf.TileDBVCFDataset(dataset_uri, mode='r', cfg=cfg)
        df = ds.read(attrs=['sample_name', 'pos_start', 'pos_end'],
                     regions=['1:12000-13000', '1:17000-18000'])
        assert len(df) == expected_rows
        if expected_sample is not None:
            assert (df.sample_name == expected_sample).all()