def test_raise_on_both_path_types():
    """Check that `read_plink` rejects `path` combined with a per-file path.

    The call supplies both `path` and `bed_path`; a `ValueError` whose
    message matches the pattern below is expected.
    """
    # pytest applies `match` as a regular-expression search on the message.
    expected_message = (
        "Either `path` or all 3 of `{bed,bim,fam}_path` must be specified but not both"
    )
    with pytest.raises(ValueError, match=expected_message):
        read_plink(path="x", bed_path="x")
def test_same_as_the_reference_implementation() -> None:
    """
    This test validates that our implementation gets exactly
    the same results as the reference R implementation.

    Every fixture (the PLINK fileset, ``pcs.csv`` and ``kinbtwe.csv``) is
    resolved relative to this test module's directory, so the outcome does
    not depend on the directory pytest is launched from.
    """
    d = Path(__file__).parent

    # Anchor the PLINK prefix on `d`, exactly like the CSV fixtures below; a
    # bare relative prefix only resolves when the working directory happens
    # to be this folder.
    # NOTE(review): assumes the fileset sits next to the CSV files -- confirm.
    ds = read_plink(path=d.joinpath("hapmap_JPT_CHB_r23a_filtered").as_posix())

    # Attach the sample PCs (columns 1 and 2 of pcs.csv) as a dask array.
    pcs = da.from_array(
        pd.read_csv(d.joinpath("pcs.csv").as_posix(), usecols=[1, 2]).to_numpy()
    )
    ds[sample_pca_projection] = (("samples", "components"), pcs)
    phi = pc_relate(ds).pc_relate_phi.compute()

    n_samples = 90
    assert isinstance(phi, xr.DataArray)
    assert phi.shape == (n_samples, n_samples)

    # Get genesis/reference results:
    genesis_phi = pd.read_csv(d.joinpath("kinbtwe.csv"))
    genesis_phi = genesis_phi[["kin"]].to_numpy()

    # Compare only the strict upper triangle (diagonal offset 1) of `phi`
    # against the reference "kin" column.
    phi_s = phi.data[np.triu_indices_from(phi.data, 1)]  # type: ignore[no-untyped-call]
    assert phi_s.size == genesis_phi.size
    assert np.allclose(phi_s, genesis_phi.T)
def test_read_multi_path(shared_datadir, ds1):
    """Read the example dataset from explicit bed/bim/fam paths and compare to `ds1`.

    The three file paths are derived from the shared dataset prefix by
    swapping in the corresponding suffix.
    """
    prefix = shared_datadir / example_dataset_1
    bed_file = prefix.with_suffix(".bed")
    bim_file = prefix.with_suffix(".bim")
    fam_file = prefix.with_suffix(".fam")

    ds_from_parts = read_plink(
        bed_path=bed_file,
        bim_path=bim_file,
        fam_path=fam_file,
        bim_sep="\t",
        fam_sep="\t",
    )

    xr.testing.assert_equal(ds1, ds_from_parts)
def load_plink(paths: PLINKPaths, contig: Contig) -> Dataset:
    """Load the PLINK fileset for one contig and normalise it.

    Args:
        paths: Holder of the ``bed_path``, ``bim_path`` and ``fam_path``
            passed to ``read_plink``.
        contig: Contig being loaded; used in the log message and handed to
            ``transform_contig``.

    Returns:
        The dataset with ``sample_id`` cast to ``int32``, the per-sample
        family/parent/phenotype variables dropped, and ``transform_contig``
        applied.
    """
    logger.info(
        f"Loading PLINK dataset for contig {contig} from {paths.bed_path}")

    # The read itself runs with dask's "threads" scheduler selected.
    with dask.config.set(scheduler="threads"):
        dataset = read_plink(
            bed_path=paths.bed_path,
            bim_path=paths.bim_path,
            fam_path=paths.fam_path,
            bim_int_contig=False,
            count_a1=False,
        )

    dataset["sample_id"] = dataset["sample_id"].astype("int32")

    # All useful sample metadata will come from the
    # main UKB dataset instead
    unused_sample_vars = [
        "sample_family_id",
        "sample_paternal_id",
        "sample_maternal_id",
        "sample_phenotype",
    ]
    dataset = dataset.drop_vars(unused_sample_vars)

    # Update contig index/names
    return transform_contig(dataset, contig)
# Example script: download a small simulated PLINK fileset into the current
# directory, read it with sgkit and print the resulting dataset.
import urllib.request

from sgkit.io.plink import read_plink

if __name__ == "__main__":
    # A PLINK fileset is made of three sibling files sharing one prefix.
    extensions = (".bed", ".bim", ".fam")
    for ext in extensions:
        source_url = f"https://github.com/pystatgen/sgkit/raw/main/sgkit/tests/io/plink/data/plink_sim_10s_100v_10pmiss{ext}"
        local_name = f"plink_sim_10s_100v_10pmiss{ext}"
        urllib.request.urlretrieve(source_url, local_name)

    ds = read_plink(path="plink_sim_10s_100v_10pmiss")
    print(ds)
def ds1(shared_datadir, request):
    """Read the example dataset from its shared prefix with tab-separated bim/fam.

    Any extra keyword arguments carried by ``request.param`` are forwarded
    to ``read_plink`` unchanged.
    """
    return read_plink(
        path=shared_datadir / example_dataset_1,
        bim_sep="\t",
        fam_sep="\t",
        **request.param,
    )