def test_sample_partitioned_read():
    """Reading with sample partitioning returns each sample's rows exactly once."""
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    attrs = ['sample_name', 'pos_start', 'pos_end']
    regions = ['1:12000-18000']

    def read_partition(partition):
        # Open the dataset with the given (index, count) sample partition.
        cfg = tiledbvcf.ReadConfig(sample_partition=partition)
        ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
        return ds.read(attrs=attrs, regions=regions)

    # Partition 0 of 2 contains only HG00280 records.
    df = read_partition((0, 2))
    assert len(df) == 11
    assert (df.sample_name == 'HG00280').all()

    # Partition 1 of 2 contains only HG01762 records.
    df = read_partition((1, 2))
    assert len(df) == 3
    assert (df.sample_name == 'HG01762').all()

    # Error: too many partitions
    cfg = tiledbvcf.ReadConfig(sample_partition=(1, 3))
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
    with pytest.raises(RuntimeError):
        ds.read(attrs=attrs, regions=regions)

    # Error: index >= num partitions
    cfg = tiledbvcf.ReadConfig(sample_partition=(2, 2))
    with pytest.raises(RuntimeError):
        tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
def test_read_null_attrs(tmp_path):
    # Ingest two samples, then read INFO/FMT attributes where some records
    # carry no value. Records without a value come back as None; records with
    # a value come back as small typed numpy arrays.
    uri = os.path.join(tmp_path, 'dataset')
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='w')
    samples = [os.path.join(TESTS_INPUT_DIR, s)
               for s in ['small3.bcf', 'small.bcf']]
    ds.ingest_samples(samples)

    # Re-open in read mode and query two regions spanning both samples.
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r')
    df = ds.read(attrs=['sample_name', 'pos_start', 'pos_end',
                        'info_BaseQRankSum', 'info_DP', 'fmt_DP'],
                 regions=['1:12700-13400', '1:69500-69800'])

    # Expected result: 12 records total; info_BaseQRankSum and info_DP are
    # None for most records (null/absent in the source VCFs), while fmt_DP is
    # present for every record.
    expected_df = pd.DataFrame(
        {'sample_name': pd.Series(
            ['HG00280', 'HG01762', 'HG00280', 'HG01762', 'HG00280',
             'HG00280', 'HG00280', 'HG00280', 'HG00280', 'HG00280',
             'HG00280', 'HG00280']),
         'pos_start': pd.Series(
             [12546, 12546, 13354, 13354, 13375, 13396, 69371, 69511, 69512,
              69761, 69762, 69771], dtype=np.int32),
         'pos_end': pd.Series(
             [12771, 12771, 13374, 13389, 13395, 13413, 69510, 69511, 69760,
              69761, 69770, 69834], dtype=np.int32),
         'info_BaseQRankSum': pd.Series(
             [None, None, None, None, None, None, None,
              np.array([-0.787], dtype=np.float32), None,
              np.array([1.97], dtype=np.float32), None, None]),
         'info_DP': pd.Series(
             [None, None, None, None, None, None, None,
              np.array([89], dtype=np.int32), None,
              np.array([24], dtype=np.int32), None, None]),
         'fmt_DP': pd.Series(
             map(lambda lst: np.array(lst, dtype=np.int32),
                 [[0], [0], [15], [64], [6], [2], [180], [88], [97], [24],
                  [23], [21]]))})
    _check_dfs(expected_df, df)
def test_region_partitioned_read():
    """Reading with region partitioning splits records across partitions."""
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    attrs = ['sample_name', 'pos_start', 'pos_end']
    regions = ['1:12000-13000', '1:17000-18000']

    def read_partition(partition):
        # Open the dataset with the given (index, count) region partition.
        cfg = tiledbvcf.ReadConfig(region_partition=partition)
        ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
        return ds.read(attrs=attrs, regions=regions)

    assert len(read_partition((0, 2))) == 4
    assert len(read_partition((1, 2))) == 2

    # Too many partitions still produces results
    assert len(read_partition((1, 3))) == 2

    # Error: index >= num partitions
    cfg = tiledbvcf.ReadConfig(region_partition=(2, 2))
    with pytest.raises(RuntimeError):
        tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
def test_tbb_threads_config():
    """Re-opening with a different sm.num_tbb_threads value raises an error."""
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')

    # First open with an explicit TBB thread count succeeds.
    ds = tiledbvcf.TileDBVCFDataset(
        uri, mode='r',
        cfg=tiledbvcf.ReadConfig(tiledb_config=['sm.num_tbb_threads=3']))

    # A later open with a different count is rejected.
    with pytest.raises(RuntimeError):
        tiledbvcf.TileDBVCFDataset(
            uri, mode='r',
            cfg=tiledbvcf.ReadConfig(tiledb_config=['sm.num_tbb_threads=4']))
def test_incremental_ingest(tmp_path):
    """Samples ingested in separate calls are all visible to later reads."""
    uri = os.path.join(tmp_path, 'dataset')
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='w')
    # Ingest one sample per call to exercise incremental ingestion.
    for name in ('small.bcf', 'small2.bcf'):
        ds.ingest_samples([os.path.join(TESTS_INPUT_DIR, name)])

    # Open it back in read mode and check some queries
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r')
    assert ds.count() == 14
    assert ds.count(regions=['1:12700-13400']) == 6
    assert ds.count(samples=['HG00280'], regions=['1:12700-13400']) == 4
def test_read_write_mode_exceptions():
    """Operations invalid for the dataset's open mode raise exceptions."""
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    sample_paths = [os.path.join(TESTS_INPUT_DIR, name)
                    for name in ('small.bcf', 'small2.bcf')]

    # Ingesting into a dataset opened without write mode fails.
    ds = tiledbvcf.TileDBVCFDataset(uri)
    with pytest.raises(Exception):
        ds.ingest_samples(sample_paths)

    # Counting on a dataset opened in write mode fails.
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='w')
    with pytest.raises(Exception):
        ds.count()
def test_basic_ingest(tmp_path):
    """Ingest two samples into a fresh dataset and verify basic counts."""
    # Create the dataset
    uri = os.path.join(tmp_path, 'dataset')
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='w')
    ds.ingest_samples(
        [os.path.join(TESTS_INPUT_DIR, name)
         for name in ('small.bcf', 'small2.bcf')])

    # Open it back in read mode and check some queries
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r')
    assert ds.count() == 14
    assert ds.count(regions=['1:12700-13400']) == 6
    assert ds.count(samples=['HG00280'], regions=['1:12700-13400']) == 4
def test_read_config():
    """ReadConfig accepts known settings and rejects unknown keywords."""
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')

    # A default config opens fine.
    tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=tiledbvcf.ReadConfig())

    # A config with several settings populated also opens fine.
    cfg = tiledbvcf.ReadConfig(
        memory_budget_mb=512,
        region_partition=(0, 3),
        tiledb_config=['sm.tile_cache_size=0', 'sm.num_reader_threads=1'])
    tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)

    # Unknown keyword arguments are rejected at construction.
    with pytest.raises(TypeError):
        tiledbvcf.ReadConfig(abc=123)
def test_read_limit():
    """The limit config setting caps the number of records read() returns."""
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    ds = tiledbvcf.TileDBVCFDataset(
        uri, mode='r', cfg=tiledbvcf.ReadConfig(limit=3))
    df = ds.read(
        attrs=['sample_name', 'pos_start', 'pos_end', 'fmt_DP', 'fmt_PL'],
        regions=['1:12100-13360', '1:13500-17350'])
    assert len(df) == 3
def test_read_multiple_alleles(tmp_path):
    """Variable-length attrs (alleles, filters) read back as object arrays.

    Fix: the expected arrays used ``np.object``, an alias deprecated in
    NumPy 1.20 and removed in 1.24 — this test would raise AttributeError
    under modern NumPy. The builtin ``object`` is the documented replacement
    and is equivalent as a dtype.
    """
    uri = os.path.join(tmp_path, 'dataset')
    ds = tiledbvcf.TileDBVCFDataset(uri, mode='w')
    samples = [os.path.join(TESTS_INPUT_DIR, s)
               for s in ['small3.bcf', 'small.bcf']]
    ds.ingest_samples(samples)

    ds = tiledbvcf.TileDBVCFDataset(uri, mode='r')
    df = ds.read(attrs=['sample_name', 'pos_start', 'alleles', 'id',
                        'filters'],
                 regions=['1:70100-1300000'])

    expected_df = pd.DataFrame(
        {'sample_name': pd.Series(['HG00280', 'HG00280']),
         'pos_start': pd.Series([866511, 1289367], dtype=np.int32),
         # dtype=object (not np.object, removed in NumPy 1.24) for the
         # ragged per-record string lists.
         'alleles': pd.Series(
             map(lambda lst: np.array(lst, dtype=object),
                 [['T', 'CCCCTCCCT', 'C', 'CCCCTCCCTCCCT', 'CCCCT'],
                  ['CTG', 'C']])),
         'id': pd.Series(['.', 'rs1497816']),
         'filters': pd.Series(
             map(lambda lst: np.array(lst, dtype=object),
                 [['LowQual'], ['LowQual']]))})
    _check_dfs(expected_df, df)
def test_map_incomplete():
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)

    # Only the two records at pos_start 12141 satisfy pos_start * 2 < 25000.
    expected_df = pd.DataFrame({
        'sample_name': pd.Series(['HG00280', 'HG01762']),
        'pos_start': pd.Series([12141, 12141], dtype=np.int32),
        'pos_end': pd.Series([12277, 12277], dtype=np.int32)})

    # Region partitions
    dask_df = test_ds.map_dask(
        lambda df: df[df.pos_start * 2 < 25000],
        attrs=['sample_name', 'pos_start', 'pos_end'],
        region_partitions=10)
    _check_dfs(expected_df, dask_df.compute())
def test_incomplete_read_generator():
    """read_iter() yields batches that concatenate to the full result.

    Fix: batches were combined with ``DataFrame.append``, which was
    deprecated in pandas 1.4 and removed in 2.0; use ``pd.concat`` instead.
    Behavior is preserved: if no batches are yielded, overall_df stays None
    (the original also left it None in that case).
    """
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)

    batches = list(test_ds.read_iter(attrs=['pos_end'],
                                     regions=['1:12700-13400']))
    overall_df = pd.concat(batches, ignore_index=True) if batches else None

    assert len(overall_df) == 6
    _check_dfs(
        pd.DataFrame.from_dict(
            {'pos_end': np.array([12771, 12771, 13374, 13389, 13395, 13413],
                                 dtype=np.int32)}),
        overall_df)
def test_dask_reads():
    """Dask reads across region/sample partitionings return the full dataset.

    Fix: renamed from ``test_incomplete_reads`` — another function later in
    this file defines the same name, so this one was shadowed at module level
    and pytest never collected or ran these dask tests.
    """
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)

    # All 14 records of the 2-sample dataset, in coordinate order.
    expected_df = pd.DataFrame(
        {'sample_name': pd.Series(
            ['HG00280', 'HG01762', 'HG00280', 'HG01762', 'HG00280',
             'HG01762', 'HG00280', 'HG00280', 'HG00280', 'HG00280',
             'HG00280', 'HG00280', 'HG00280', 'HG00280']),
         'pos_start': pd.Series(
             [12141, 12141, 12546, 12546, 13354, 13354, 13375, 13396, 13414,
              13452, 13520, 13545, 17319, 17480], dtype=np.int32),
         'pos_end': pd.Series(
             [12277, 12277, 12771, 12771, 13374, 13389, 13395, 13413, 13451,
              13519, 13544, 13689, 17479, 17486], dtype=np.int32)})

    # Region partitions
    dask_df = test_ds.read_dask(attrs=['sample_name', 'pos_start', 'pos_end'],
                                region_partitions=10)
    df = dask_df.compute()
    _check_dfs(expected_df, df)

    # Sample partitions (we have to sort to check the result)
    dask_df = test_ds.read_dask(attrs=['sample_name', 'pos_start', 'pos_end'],
                                sample_partitions=2)
    df = dask_df.compute().sort_values('sample_name').reset_index(drop=True)
    _check_dfs(expected_df.sort_values('sample_name').reset_index(drop=True),
               df)

    # Both
    dask_df = test_ds.read_dask(attrs=['sample_name', 'pos_start', 'pos_end'],
                                region_partitions=10, sample_partitions=2)
    df = dask_df.compute().sort_values('sample_name').reset_index(drop=True)
    _check_dfs(expected_df.sort_values('sample_name').reset_index(drop=True),
               df)

    # No partitioning
    dask_df = test_ds.read_dask(attrs=['sample_name', 'pos_start', 'pos_end'])
    df = dask_df.compute()
    _check_dfs(expected_df, df)
def test_incomplete_reads():
    # Using undocumented "0 MB" budget to test incomplete reads.
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    cfg = tiledbvcf.ReadConfig(memory_budget_mb=0)
    test_ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)

    # The query yields three 2-record batches; read_completed() is False
    # until the final batch has been returned.
    expected_batches = [
        np.array([12771, 12771], dtype=np.int32),
        np.array([13374, 13389], dtype=np.int32),
        np.array([13395, 13413], dtype=np.int32),
    ]

    df = test_ds.read(attrs=['pos_end'], regions=['1:12700-13400'])
    last = len(expected_batches) - 1
    for i, expected in enumerate(expected_batches):
        assert test_ds.read_completed() == (i == last)
        assert len(df) == 2
        _check_dfs(pd.DataFrame.from_dict({'pos_end': expected}), df)
        if i != last:
            df = test_ds.continue_read()
def test_ds():
    # NOTE(review): this function returns a dataset instead of asserting
    # anything — it looks like it was intended as a pytest fixture (missing
    # @pytest.fixture?). As written, pytest collects it as a test and ignores
    # the return value. Confirm intent before changing.
    return tiledbvcf.TileDBVCFDataset( os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples'))
def test_sample_and_region_partitioned_read():
    """Combined region + sample partitioning yields the expected slices."""
    uri = os.path.join(TESTS_INPUT_DIR, 'arrays/ingested_2samples')
    attrs = ['sample_name', 'pos_start', 'pos_end']
    regions = ['1:12000-13000', '1:17000-18000']

    def read_partitions(region_part, sample_part):
        # Open with both (index, count) partitions applied.
        cfg = tiledbvcf.ReadConfig(region_partition=region_part,
                                   sample_partition=sample_part)
        ds = tiledbvcf.TileDBVCFDataset(uri, mode='r', cfg=cfg)
        return ds.read(attrs=attrs, regions=regions)

    df = read_partitions((0, 2), (0, 2))
    assert len(df) == 2
    assert (df.sample_name == 'HG00280').all()

    df = read_partitions((0, 2), (1, 2))
    assert len(df) == 2
    assert (df.sample_name == 'HG01762').all()

    df = read_partitions((1, 2), (0, 2))
    assert len(df) == 2
    assert (df.sample_name == 'HG00280').all()

    # The second region partition has no records for the second sample.
    df = read_partitions((1, 2), (1, 2))
    assert len(df) == 0