def test_bgen_reader_with_wrong_metadata_file():
    """Passing the ``wrong.metadata`` example file as metafile must raise ``RuntimeError``."""
    bgen_path = example_filepath("example.32bits.bgen")
    bgen_path.touch()

    bad_metafile = example_filepath("wrong.metadata")
    # Touch the metafile after the bgen file so that it has the later
    # timestamp (otherwise, it might be re-created).
    bad_metafile.touch()

    with pytest.raises(RuntimeError):
        read_bgen(bgen_path, verbose=False, metafile_filepath=bad_metafile)
def test_metafile_provided_not_supported_anymore():
    """``read_bgen`` must raise ``RuntimeError`` when given ``haplotypes.bgen.metadata.valid``."""
    with pytest.raises(RuntimeError):
        # Paths are resolved inside the block, exactly where the call needs them.
        bgen_path = example_filepath("haplotypes.bgen")
        old_metafile = example_filepath("haplotypes.bgen.metadata.valid")
        read_bgen(bgen_path, metafile_filepath=old_metafile, verbose=False)
def test_allele_expectation():
    """Exercise ``open_bgen.allele_expectation`` with and without ``assume_constant_ploidy``.

    Covers a single (sample, variant) cell, an empty variant selection, and
    the ``ValueError`` expected for the ``haplotypes.bgen`` example.
    """
    expected_cell = [[[1.01086423, 0.98913577]]]

    # Single cell, default ploidy handling.
    path = example_filepath("example.32bits.bgen")
    with open_bgen(path, verbose=False) as reader:
        cell = np.s_[reader.samples == "sample_005", reader.rsids == "RSID_6"]
        expectation = reader.allele_expectation(cell)
        assert np.allclose(expectation, expected_cell)

    # The haplotypes example must be rejected.
    with pytest.raises(ValueError):
        path = example_filepath("haplotypes.bgen")
        with open_bgen(path, verbose=False) as reader:
            reader.allele_expectation()

    # Empty variant selection, default ploidy handling.
    path = example_filepath("example.32bits.bgen")
    with open_bgen(path, verbose=False) as reader:
        expectation = reader.allele_expectation(np.s_[:, []])
        assert expectation.shape == (500, 0, 2)

    # Single cell again, without assuming constant ploidy.
    path = example_filepath("example.32bits.bgen")
    with open_bgen(path, verbose=False) as reader:
        cell = np.s_[reader.samples == "sample_005", reader.rsids == "RSID_6"]
        expectation = reader.allele_expectation(cell, assume_constant_ploidy=False)
        assert np.allclose(expectation, expected_cell)

    # Empty variant selection, without assuming constant ploidy.
    path = example_filepath("example.32bits.bgen")
    with open_bgen(path, verbose=False) as reader:
        expectation = reader.allele_expectation(
            np.s_[:, []], assume_constant_ploidy=False
        )
        assert expectation.shape == (500, 0, 2)
def test_bgen_samples_outside_bgen_unreadable(tmp_path):
    """``read_bgen`` must raise ``PermissionError`` while the samples file is under ``noread_permission``.

    A copy of ``complex.sample`` is placed in ``tmp_path`` so the permission
    change is applied to the copy rather than to the shared example file.
    """
    bgen_path = example_filepath("complex.23bits.bgen")
    sample_copy = tmp_path / "complex.sample"
    copyfile(example_filepath("complex.sample"), sample_copy)

    with noread_permission(sample_copy), pytest.raises(PermissionError):
        read_bgen(bgen_path, samples_filepath=sample_copy, verbose=False)
def test_bgen_samples_specify_samples_file():
    """``read_bgen`` with an explicit samples file yields the four expected sample ids."""
    bgen = read_bgen(
        example_filepath("complex.23bits.bgen"),
        samples_filepath=example_filepath("complex.sample"),
        verbose=False,
    )
    expected = Series(
        ["sample_0", "sample_1", "sample_2", "sample_3"], dtype=str, name="id"
    )
    assert all(bgen["samples"] == expected)
def test_allele_expectation_interface():
    """Check the module-level ``allele_expectation``: one rejected variant and one known result."""
    haplotypes = read_bgen(example_filepath("haplotypes.bgen"), verbose=False)
    with pytest.raises(ValueError):
        allele_expectation(haplotypes, 1)

    complex_bgen = read_bgen(example_filepath("complex.23bits.bgen"), verbose=False)
    expected = [
        [1.0, 0.0, 0.0],
        [2.0, 0.0, 0.0],
        [1.0, 1.0, 0.0],
        [0.0, 2.0, 0.0],
    ]
    result = allele_expectation(complex_bgen, 3)
    assert_allclose(result, expected)
def test_zero_width():
    """Empty sample and/or variant selections must yield correctly shaped results.

    Runs every case with ``assume_constant_ploidy`` both off and on.
    """
    path = example_filepath("complex.bgen")
    with open_bgen(path, allow_complex=True, verbose=False) as bgen:
        for constant_ploidy in [False, True]:
            # Case 1: no variants selected.
            expectation = bgen.allele_expectation(
                [], assume_constant_ploidy=constant_ploidy
            )
            frequency = bgen.allele_frequency(expectation)
            assert expectation.shape == (bgen.nsamples, 0, bgen.nalleles[0])
            assert frequency.shape == (0, bgen.nalleles[0])

            # Case 2: no samples selected, unphased two-allele variants only.
            good_variants = logical_not(bgen.phased) * (bgen.nalleles == 2)
            expectation = bgen.allele_expectation(
                ([], good_variants), assume_constant_ploidy=constant_ploidy
            )
            frequency = bgen.allele_frequency(expectation)
            assert expectation.shape == (0, sum(good_variants), bgen.nalleles[0])
            # We define the freq of something with no samples as 0
            all_zero = zeros((sum(good_variants), bgen.nalleles[0]))
            assert_equal(frequency, all_zero)

            # Case 3: neither samples nor variants selected.
            expectation = bgen.allele_expectation(
                ([], []), assume_constant_ploidy=constant_ploidy
            )
            frequency = bgen.allele_frequency(expectation)
            assert expectation.shape == (0, 0, bgen.nalleles[0])
            assert frequency.shape == (0, bgen.nalleles[0])
def test_dosage_example_32bits():
    """Check allele expectations for variants 5 and 0 of the 32-bit example via ``open_bgen``."""
    with open_bgen(example_filepath("example.32bits.bgen"), verbose=False) as bgen:
        expectation = bgen.allele_expectation([5, 0])

        # Column 0 holds variant 5; column 1 holds variant 0.
        assert_allclose(
            expectation[7, 0, :], [1.9556273911044997, 0.044372608895500334]
        )
        assert all(isnan(expectation[0, 1, :]))
        assert_equal(expectation.shape, (500, 2, 2))
def test_coverage(self):
    """Exercise assorted ``Bgen`` reader/writer code paths.

    Steps:
      1. Open the example with a custom ``iid_function`` and check the iid.
      2. Move the metadata file aside and reopen with defaults, checking
         that the default iid is used.
      3. Write the same output file twice from inside a ``temp`` directory.
      4. Write a generated dataset with non-default writer options.
    """
    from pysnptools.distreader import DistGen

    with example_filepath("example.32bits.bgen") as filepath:
        bgen = Bgen(filepath, fresh_properties=False,
                    iid_function=lambda sam: ("X", sam))
        assert bgen.iid[0, 0] == "X"

        # Move the metadata written for the custom iid_function out of the way
        # before reopening the file with default settings.
        metadata_filepath = bgen._open_bgen._metadata2_path
        metadata2_temp = metadata_filepath.parent / (
            metadata_filepath.name + ".temp")
        del bgen
        if metadata2_temp.exists():
            metadata2_temp.unlink()
        os.rename(metadata_filepath, metadata2_temp)

        bgen = Bgen(filepath)
        assert bgen.iid[0, 0] == "0"
        bgen[0, 0].read(order='A')

        # Write to a bare file name from inside "temp". The working directory
        # is restored in a ``finally`` so that a failing write cannot leave the
        # rest of the test session running from the wrong directory.
        os.makedirs("temp", exist_ok=True)
        original_cwd = os.getcwd()
        os.chdir("temp")
        try:
            file1x = "coverage.bgen"
            # Written twice on purpose: the second call targets an existing file.
            Bgen.write(file1x, bgen[:100, :100])
            Bgen.write(file1x, bgen[:100, :100])
        finally:
            os.chdir(original_cwd)

    distgen0data = DistGen(seed=332, iid_count=10010, sid_count=5).read()
    file1 = "temp/roundtrip1-big.bgen"
    bed3 = Bgen.write(file1, distgen0data, bits=8, compression="zlib",
                      cleanup_temp_files=False,
                      sample_function=lambda fam, ind: f'{fam},{ind}')
    bed3.iid[0, 0] = '0'
def test_bgen_reader_phased_genotype():
    """Check variant metadata, sample ids and genotype probabilities of ``haplotypes.bgen``."""
    bgen = read_bgen(example_filepath("haplotypes.bgen"), verbose=False)
    variants = bgen["variants"]
    samples = bgen["samples"]

    # (row label, expected id, expected position, expected rsid)
    expected_rows = [(0, "SNP1", 1, "RS1"), (2, "SNP3", 3, "RS3")]
    for row, snp_id, position, rsid in expected_rows:
        record = variants.loc[row].compute()
        assert_equal(record["chrom"].values[0], "1")
        assert_equal(record["id"].values[0], snp_id)
        assert_equal(record["nalleles"].values[0], 2)
        assert_equal(record["allele_ids"].values[0], "A,G")
        assert_equal(record["pos"].values[0], position)
        assert_equal(record["rsid"].values[0], rsid)

    assert_equal(samples.loc[0], "sample_0")
    assert_equal(samples.loc[2], "sample_2")
    assert_equal(samples.loc[samples.shape[0] - 1], "sample_3")

    # First sample of the first variant.
    first_genotype = bgen["genotype"][0].compute()
    assert_allclose(first_genotype["probs"][0], [1.0, 0.0, 1.0, 0.0])

    # Last sample of the last variant.
    nvariants = len(variants)
    nsamples = len(samples)
    last_genotype = bgen["genotype"][nvariants - 1].compute()
    assert_allclose(last_genotype["probs"][nsamples - 1], [1.0, 0.0, 0.0, 1.0])
def test_bgen_reader_with_nonexistent_metadata_file():
    """A metafile path that does not exist must raise ``FileNotFoundError`` from ``read_bgen``."""
    bgen_path = example_filepath("example.32bits.bgen")
    missing_metafile = os.path.join(
        os.path.dirname(bgen_path), "nonexistent.metadata"
    )

    # Outer context expects the error; inner context is the warning check.
    with pytest.raises(FileNotFoundError), pytest.warns(UserWarning):
        read_bgen(bgen_path, verbose=False, metafile_filepath=missing_metafile)
def test_bgen_samples_specify_samples_file():
    """``open_bgen`` with an explicit samples file exposes the four expected sample ids.

    The reader is used as a context manager so that it is closed even when
    the assertion fails.
    """
    expected = ["sample_0", "sample_1", "sample_2", "sample_3"]
    with open_bgen(
        example_filepath2("complex.23bits.bgen"),
        samples_filepath=example_filepath("complex.sample"),
        verbose=False,
    ) as data:
        assert all(data.samples == expected)
def test_metafile_not_provided_no_permission_to_create(tmp_path):
    """``read_bgen`` must emit a ``UserWarning`` while the bgen's folder is under ``nowrite_permission``."""
    original = example_filepath("haplotypes.bgen")
    bgen_copy = tmp_path / "haplotypes.bgen"
    copyfile(original, bgen_copy)

    folder = os.path.dirname(bgen_copy)
    with nowrite_permission(folder), pytest.warns(UserWarning):
        read_bgen(bgen_copy, verbose=False)
def test_read_bgem_interface():
    """``read_bgen`` returns a dict whose entries have the expected container types."""
    bgen = read_bgen(example_filepath("haplotypes.bgen"), verbose=False)
    assert isinstance(bgen, dict)

    expected_types = (
        ("variants", dd.DataFrame),
        ("samples", Series),
        ("genotype", list),
    )
    for key, expected_type in expected_types:
        assert isinstance(bgen[key], expected_type)

    # Each genotype entry is a lazy object, not a computed result.
    assert isinstance(bgen["genotype"][0], Delayed)
def example_filepath2(filename):
    """Return the example file's path after removing its existing metadata files.

    For both ``allow_complex`` settings, the path given by
    ``open_bgen._metadata_path_from_filename`` (with no samples file) is
    deleted if it exists.

    Parameters
    ----------
    filename
        Name of the example file, forwarded to ``example_filepath``.

    Returns
    -------
    The value returned by ``example_filepath(filename)``.
    """
    bgen_path = example_filepath(filename)
    for complex_flag in (False, True):
        metadata_path = open_bgen._metadata_path_from_filename(
            bgen_path, samples_filepath=None, allow_complex=complex_flag
        )
        if metadata_path.exists():
            metadata_path.unlink()
    return bgen_path
def test_freq():
    """Check the allele frequencies computed for variant ``RSID_6``."""
    with open_bgen(example_filepath("example.32bits.bgen"), verbose=False) as bgen:
        is_rsid_6 = bgen.rsids == "RSID_6"
        expectation = bgen.allele_expectation(is_rsid_6)
        frequency = bgen.allele_frequency(expectation)

        assert_allclose(frequency[0, 0], 229.23103218810434)
        assert_allclose(frequency[0, 1], 270.7689678118956)
def test_threads():
    """``open_bgen.read`` must return correctly shaped output for 1 and 2 threads.

    Checked for both a full selection and an empty variant selection.
    """
    filepath = example_filepath("example.32bits.bgen")
    with open_bgen(filepath, verbose=False) as bgen2:
        for num_threads in [1, 2]:
            # ``index`` rather than ``slice`` so the builtin is not shadowed.
            for index in [np.s_[:, :], np.s_[:, []]]:
                val = bgen2.read(index=index, num_threads=num_threads)
                row_count = len(bgen2.samples[index[0]])
                col_count = len(bgen2.ids[index[1]])
                assert val.shape == (row_count, col_count, 3)
def test_bgen_samples_specify_samples_file():
    """``open_bgen`` (complex allowed) with an explicit samples file exposes the expected ids."""
    bgen_path = example_filepath2("complex.23bits.bgen")
    sample_path = example_filepath("complex.sample")
    with open_bgen(
        bgen_path, samples_filepath=sample_path, allow_complex=True, verbose=False
    ) as reader:
        expected = ["sample_0", "sample_1", "sample_2", "sample_3"]
        assert all(reader.samples == expected)
def test_bgen_reader_without_metadata():
    """``read_bgen`` with no metafile argument still returns variants, samples and genotype."""
    bgen = read_bgen(example_filepath("example.32bits.bgen"), verbose=False)
    variant_table = bgen["variants"].compute()
    sample_ids = bgen["samples"]

    assert "genotype" in bgen
    assert_equal(variant_table.loc[7, "allele_ids"], "A,G")

    last = sample_ids.shape[0] - 1
    assert_equal(sample_ids.loc[last], "sample_500")
def test_bgen_samples_inside_bgen(self):
    """The first four iids of the 32-bit example are ``("0", "sample_00N")`` pairs."""
    expected = [("0", f"sample_00{number}") for number in range(1, 5)]
    with example_filepath("example.32bits.bgen") as filepath:
        reader = Bgen(filepath)
        assert (reader.iid[:4] == expected).all()
def test_create_metadata_file(tmp_path):
    """``create_metafile`` must produce a file at the requested location."""
    bgen_path = example_filepath("example.32bits.bgen")
    target = tmp_path / (bgen_path.name + ".metadata")
    try:
        create_metafile(bgen_path, target, verbose=False)
        assert os.path.exists(target)
    finally:
        # Remove the metafile whether or not the assertion held.
        if os.path.exists(target):
            os.remove(target)
def test_bgen_reader_complex_sample_file():
    """Check ``complex.23bits.bgen`` read together with its external samples file.

    Verifies the first, eighth and last variant rows, two sample ids, and
    the ploidy / missing / phased genotype fields.
    """
    bgen = read_bgen(
        example_filepath("complex.23bits.bgen"),
        samples_filepath=example_filepath("complex.sample"),
        verbose=False,
    )
    variants = bgen["variants"].compute()
    samples = bgen["samples"]
    assert "genotype" in bgen

    last_row = variants.shape[0] - 1
    # (row label, nalleles, allele_ids, pos, rsid); chrom and id are constant.
    expected_rows = [
        (0, 2, "A,G", 1, "V1"),
        (7, 7, "A,G,GT,GTT,GTTT,GTTTT,GTTTTT", 8, "M8"),
        (last_row, 2, "A,G", 10, "M10"),
    ]
    for row, nalleles, allele_ids, pos, rsid in expected_rows:
        assert_equal(variants.loc[row, "chrom"], "01")
        assert_equal(variants.loc[row, "id"], "")
        assert_equal(variants.loc[row, "nalleles"], nalleles)
        assert_equal(variants.loc[row, "allele_ids"], allele_ids)
        assert_equal(variants.loc[row, "pos"], pos)
        assert_equal(variants.loc[row, "rsid"], rsid)

    assert_equal(samples.loc[0], "sample_0")
    assert_equal(samples.loc[3], "sample_3")

    genotype = bgen["genotype"]
    ploidy = genotype[2].compute()["ploidy"]
    missing = genotype[2].compute()["missing"]
    phased = [genotype[i].compute()["phased"] for i in range(len(variants))]

    assert_allclose(ploidy, [1, 2, 2, 2])
    assert_allclose(missing, [0, 0, 0, 0])
    assert_allclose(phased, [0, 1, 1, 0, 1, 1, 1, 1, 0, 0])
def test_dosage_example_32bits():
    """Check the module-level ``allele_expectation`` for variants 5 and 0 of the 32-bit example."""
    bgen = read_bgen(example_filepath("example.32bits.bgen"), verbose=False)

    variant_5 = allele_expectation(bgen, 5)
    assert_allclose(variant_5[7], [1.9556273911044997, 0.044372608895500334])

    variant_0 = allele_expectation(bgen, 0)
    assert all(isnan(variant_0[0]))

    variant_0_again = allele_expectation(bgen, 0)
    assert_equal(variant_0_again.shape, (500, 2))
def test_bgen_reader_complex():
    """Check variants, samples, probabilities, ploidy and phasing of ``complex.23bits.bgen``."""
    bgen = read_bgen(example_filepath("complex.23bits.bgen"), verbose=False)
    variants = bgen["variants"].compute()
    samples = bgen["samples"]
    assert "genotype" in bgen

    last_row = variants.shape[0] - 1
    # (row label, nalleles, allele_ids, pos, rsid); chrom and id are constant.
    expected_rows = [
        (0, 2, "A,G", 1, "V1"),
        (7, 7, "A,G,GT,GTT,GTTT,GTTTT,GTTTTT", 8, "M8"),
        (last_row, 2, "A,G", 10, "M10"),
    ]
    for row, nalleles, allele_ids, pos, rsid in expected_rows:
        assert_equal(variants.loc[row, "chrom"], "01")
        assert_equal(variants.loc[row, "id"], "")
        assert_equal(variants.loc[row, "nalleles"], nalleles)
        assert_equal(variants.loc[row, "allele_ids"], allele_ids)
        assert_equal(variants.loc[row, "pos"], pos)
        assert_equal(variants.loc[row, "rsid"], rsid)

    assert_equal(samples.loc[0], "sample_0")
    assert_equal(samples.loc[3], "sample_3")

    genotype = bgen["genotype"]

    # Probabilities: first variant (samples 0 and 1), then last variant, last sample.
    probs = genotype[0].compute()["probs"][0]
    assert_allclose(probs[:2], [1, 0])
    assert isnan(probs[2])

    probs = genotype[0].compute()["probs"][1]
    assert_allclose(probs[:3], [1, 0, 0])

    probs = genotype[-1].compute()["probs"][-1]
    assert_allclose(probs[:5], [0, 0, 0, 1, 0])

    # Ploidy of the first and last variants.
    assert_allclose(genotype[0].compute()["ploidy"], [1, 2, 2, 2])
    assert_allclose(genotype[-1].compute()["ploidy"], [4, 4, 4, 4])

    # Phasing flag of every variant.
    phased = array(
        [genotype[i].compute()["phased"] for i in range(len(variants))]
    )
    assert_equal(phased.dtype.name, "bool")
    ideal = array([False, True, True, False, True, True, True, True, False, False])
    assert array_equal(phased, ideal)
def test_dosage1():
    """Check the dosage of the allele at position 1 for variant 3, first two samples."""
    with open_bgen(example_filepath("example.32bits.bgen"), verbose=False) as bgen:
        expectation = bgen.allele_expectation(3)

        # Dosage when the allele in position 1 is taken as the
        # reference/alternative one: the last axis indexes alleles.
        alt_position = 1
        dosage = expectation[..., alt_position]

        assert_allclose(
            dosage[:2, 0], [1.9618530841455453, 0.009826655967586362]
        )
def test_bgen_reader_variants_info():
    """Check variant rows, sample ids and genotype probabilities of the 32-bit example."""
    bgen = read_bgen(example_filepath("example.32bits.bgen"), verbose=False)
    lazy_variants = bgen["variants"]
    samples = bgen["samples"]
    assert "genotype" in bgen
    variants = lazy_variants.compute()

    last_variant = variants.shape[0] - 1
    # (row label, id, pos, rsid); chrom, nalleles and allele_ids are constant.
    expected_rows = [
        (0, "SNPID_2", 2000, "RSID_2"),
        (7, "SNPID_9", 9000, "RSID_9"),
        (last_variant, "SNPID_200", 100001, "RSID_200"),
    ]
    for row, snp_id, pos, rsid in expected_rows:
        assert_equal(variants.loc[row, "chrom"], "01")
        assert_equal(variants.loc[row, "id"], snp_id)
        assert_equal(variants.loc[row, "nalleles"], 2)
        assert_equal(variants.loc[row, "allele_ids"], "A,G")
        assert_equal(variants.loc[row, "pos"], pos)
        assert_equal(variants.loc[row, "rsid"], rsid)

    assert_equal(samples.loc[0], "sample_001")
    assert_equal(samples.loc[7], "sample_008")
    last_sample = samples.shape[0] - 1
    assert_equal(samples.loc[last_sample], "sample_500")

    # First variant: sample 0 is all-NaN, sample 1 has known probabilities.
    probs = bgen["genotype"][0].compute()["probs"]
    assert all(isnan(probs[0, :]))

    probs = bgen["genotype"][0].compute()["probs"]
    expected_sample_1 = [
        0.027802362811705648,
        0.00863673794284387,
        0.9635608992454505,
    ]
    assert_allclose(probs[1, :], expected_sample_1)

    # Second variant: sample 2 has known probabilities.
    expected_sample_2 = [
        0.97970582847010945215516,
        0.01947019668749305418287,
        0.00082397484239749366197,
    ]
    probs = bgen["genotype"][1].compute()["probs"]
    assert_allclose(probs[2, :], expected_sample_2)
def test_coverage3():
    """``open_bgen`` must raise ``ValueError`` for two unsupported invocations.

    First a samples file whose size does not match the bgen file, then
    ``complex.bgen`` opened without ``allow_complex``.
    """
    with pytest.raises(ValueError):
        bgen_path = example_filepath2("example.bgen")
        mismatched_samples = example_filepath("complex.sample")  # Wrong size sample file
        with open_bgen(bgen_path, samples_filepath=mismatched_samples, verbose=False):
            pass

    with pytest.raises(ValueError):
        bgen_path = example_filepath2("complex.bgen")
        with open_bgen(bgen_path, verbose=False):
            pass
def test_allele_frequency_interface():
    """Check ``allele_frequency`` on computed and literal expectations, plus its error paths."""
    filepath = example_filepath("complex.23bits.bgen")
    bgen = read_bgen(filepath, verbose=False)

    # Only the expectation call sits inside ``pytest.raises``: a ValueError
    # raised while reading the file must fail the test, not satisfy it.
    with pytest.raises(ValueError):
        allele_expectation(bgen, 1)

    expec = allele_expectation(bgen, 3)
    freq = allele_frequency(expec)
    assert_allclose(freq, [1.33333333333, 1.0, 0.0])

    # The same expectation supplied as a plain nested list.
    freq = allele_frequency(
        [[1.0, 0.0, 0.0], [2.0, 0.0, 0.0], [1.0, 1.0, 0.0], [0.0, 2.0, 0.0]]
    )
    assert_allclose(freq, [1.33333333333, 1.0, 0.0])

    # A one-dimensional input is rejected.
    with pytest.raises(ValueError):
        allele_frequency([2, 3, 1])
def test_error():
    """``allele_expectation`` on ``complex.bgen`` rejects unsupported selections.

    Once the selection is unphased, two-allele, and constant ploidy is not
    assumed, the call succeeds and the known values are checked.
    """
    with open_bgen(
        example_filepath("complex.bgen"), allow_complex=True, verbose=False
    ) as bgen:
        # some phased
        with pytest.raises(ValueError):
            bgen.allele_expectation()

        # different #'s of alleles
        with pytest.raises(ValueError):
            unphased = logical_not(bgen.phased)
            bgen.allele_expectation(unphased)

        # nonconstant ploidy
        with pytest.raises(ValueError):
            unphased_two_allele = logical_not(bgen.phased) * (bgen.nalleles == 2)
            bgen.allele_expectation(unphased_two_allele)

        unphased_two_allele = logical_not(bgen.phased) * (bgen.nalleles == 2)
        expectation = bgen.allele_expectation(
            unphased_two_allele, assume_constant_ploidy=False
        )
        frequency = bgen.allele_frequency(expectation)
        assert_allclose(expectation[-1, -1, :], [1.0, 3.0])
        assert_allclose(frequency[-1, :], [5.0, 3.0])
def test_dosage2():
    """Build probability, expectation and dosage tables for variant 3 and check their last cells."""
    import numpy as np
    import pandas as pd

    with open_bgen(example_filepath("example.32bits.bgen"), verbose=False) as bgen:
        variant_index = [3]
        assert bgen.ids[variant_index] == "SNPID_5"
        assert bgen.rsids[variant_index] == "RSID_5"

        probs, missing, ploidy = bgen.read(
            variant_index, return_missings=True, return_ploidies=True
        )
        assert not np.any(missing)
        assert np.all(ploidy == 2)

        # One column per genotype probability, labelled "0", "1", "2".
        prob_columns = {"sample": bgen.samples}
        for k in range(3):
            prob_columns[str(k)] = probs[:, 0, k]
        prob_table = pd.DataFrame(prob_columns)
        assert_allclose(prob_table.iloc[-1, -1], 0.015471935508649781)

        alleles_per_variant = [
            ids.split(",") for ids in bgen.allele_ids[variant_index]
        ]
        alleles = alleles_per_variant[0]
        expectation = bgen.allele_expectation(variant_index)
        frequency = bgen.allele_frequency(expectation)

        # One column per allele, labelled with the allele id.
        expectation_table = pd.DataFrame(
            {
                "sample": bgen.samples,
                alleles[0]: expectation[:, 0, 0],
                alleles[1]: expectation[:, 0, 1],
            }
        )
        assert_allclose(expectation_table.iloc[-1, -1], 1.0152583189809832)

        # The alternative allele is the one with the lowest frequency.
        alt_index = frequency[0, :].argmin()
        alt = alleles[alt_index]
        dosage = expectation[:, 0, alt_index]
        dosage_table = pd.DataFrame({"sample": bgen.samples, f"alt={alt}": dosage})
        assert_allclose(dosage_table.iloc[-1, -1], 1.0152583189809832)