import itertools
import random

import numpy as np
import scipy.stats
import h5py


def simulate_relatedness(genotypes, relatedness=.5, n_iter=1000, copy=True):
    """Simulate relatedness by randomly copying genotypes between
    individuals.

    Parameters
    ----------
    genotypes : array_like
        An array of shape (n_variants, n_samples, ploidy) where each element
        of the array is an integer corresponding to an allele index (-1 =
        missing, 0 = reference allele, 1 = first alternate allele, 2 =
        second alternate allele, etc.).
    relatedness : float, optional
        Fraction of variants to copy genotypes for.
    n_iter : int, optional
        Number of times to randomly copy genotypes between individuals.
    copy : bool, optional
        If False, modify `genotypes` in place.

    Returns
    -------
    genotypes : ndarray, shape (n_variants, n_samples, ploidy)
        The input genotype array but with relatedness simulated.

    """

    # check genotypes array
    genotypes = np.asarray(genotypes)
    assert genotypes.ndim >= 2
    n_variants = genotypes.shape[0]
    n_samples = genotypes.shape[1]

    # copy input array unless modification in place is requested
    if copy:
        genotypes = genotypes.copy()

    # determine the number of variants to copy genotypes for
    n_copy = int(relatedness * n_variants)

    # iteratively introduce relatedness
    for _ in range(n_iter):

        # randomly choose donor and recipient
        donor_index = random.randint(0, n_samples - 1)
        donor = genotypes[:, donor_index]
        recip_index = random.randint(0, n_samples - 1)
        recip = genotypes[:, recip_index]

        # randomly pick a set of variants to copy
        variant_indices = random.sample(range(n_variants), n_copy)

        # copy across genotypes
        recip[variant_indices] = donor[variant_indices]

    return genotypes
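
# Usage sketch (hypothetical, not part of the original module): start from
# random unrelated diploid genotypes, then copy genotypes between individuals
# so that some pairs of samples come to share a large fraction of their calls.
def _demo_simulate_relatedness():
    g = np.random.randint(low=0, high=2, size=(1000, 20, 2)).astype('i1')
    g_rel = simulate_relatedness(g, relatedness=.5, n_iter=100)
    assert g_rel.shape == g.shape
    return g_rel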
def simulate_biallelic_genotypes(n_variants, n_samples, af_dist,
                                 p_missing=.1, ploidy=2):
    """Simulate genotypes at biallelic variants for a population in
    Hardy-Weinberg equilibrium.

    Parameters
    ----------
    n_variants : int
        The number of variants.
    n_samples : int
        The number of samples.
    af_dist : frozen continuous random variable
        The distribution of allele frequencies.
    p_missing : float, optional
        The fraction of missing genotype calls.
    ploidy : int, optional
        The sample ploidy.

    Returns
    -------
    genotypes : ndarray, int8
        An array of shape (n_variants, n_samples, ploidy) where each element
        of the array is an integer corresponding to an allele index (-1 =
        missing, 0 = reference allele, 1 = alternate allele).

    """

    # initialise output array
    genotypes = np.empty((n_variants, n_samples, ploidy), dtype='i1')

    # generate allele frequencies under the given distribution
    af = af_dist.rvs(n_variants)

    # freeze binomial distribution to model missingness
    miss_dist = scipy.stats.binom(p=p_missing, n=n_samples)

    # iterate over variants
    for i, p in enumerate(af):

        # clip p to ensure it is a valid probability
        p = min(max(p, 0), 1)

        # randomly generate alleles under the given allele frequency
        alleles = scipy.stats.bernoulli.rvs(p, size=n_samples * ploidy)

        # reshape alleles as genotypes under the given ploidy
        genotypes[i] = alleles.reshape(n_samples, ploidy)

        # simulate some missingness
        n_missing = miss_dist.rvs()
        missing_indices = random.sample(range(n_samples), n_missing)
        genotypes[i, missing_indices] = (-1,) * ploidy

    return genotypes
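
# Usage sketch (hypothetical): draw allele frequencies from a beta
# distribution; the shape parameters here are arbitrary choices for
# illustration.
def _demo_simulate_biallelic_genotypes():
    af_dist = scipy.stats.beta(a=.4, b=.6)
    g = simulate_biallelic_genotypes(1000, 100, af_dist, p_missing=.1)
    assert g.shape == (1000, 100, 2)
    assert g.dtype == np.dtype('i1')
    return g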
def simulate_genotypes_with_ld(n_variants, n_samples, correlation=0.2):
    """Simulate a set of genotypes where variants are in some degree of
    linkage disequilibrium with their neighbours.

    Parameters
    ----------
    n_variants : int
        The number of variants to simulate data for.
    n_samples : int
        The number of individuals to simulate data for.
    correlation : float, optional
        The fraction of samples to copy genotypes between neighbouring
        variants.

    Returns
    -------
    gn : ndarray, int8
        A 2-dimensional array of shape (n_variants, n_samples) where each
        element is a genotype call coded as a single integer counting the
        number of non-reference alleles.

    """

    # initialise an array of random genotypes
    gn = np.random.randint(size=(n_variants, n_samples), low=0, high=3)
    gn = gn.astype('i1')

    # determine the number of samples to copy genotypes for
    n_copy = int(correlation * n_samples)

    # introduce linkage disequilibrium by copying genotypes from one variant
    # to the next
    for i in range(1, n_variants):

        # randomly pick the samples to copy genotypes for
        sample_indices = random.sample(range(n_samples), n_copy)

        # view genotypes from the previous variant for the selected samples
        c = gn[i - 1, sample_indices]

        # randomly choose whether to invert the correlation
        inv = random.randint(0, 1)
        if inv:
            c = 2 - c

        # copy across genotypes
        gn[i, sample_indices] = c

    return gn
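
# Usage sketch (hypothetical): with correlation=.5 the genotypes at adjacent
# variants should be strongly (positively or negatively) correlated.
def _demo_simulate_genotypes_with_ld():
    gn = simulate_genotypes_with_ld(100, 1000, correlation=.5)
    r = np.corrcoef(gn[0], gn[1])[0, 1]
    print('correlation between first two variants: %.3f' % r)
    return gn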
def block_apply(f, dataset, block_size=None, out=None):
    """Apply function `f` to `dataset`, split along the first axis into
    contiguous slices of `block_size`. The result should be equivalent to
    calling ``f(dataset)`` directly, but may require less total memory,
    especially if `dataset` is an HDF5 dataset.

    Parameters
    ----------
    f : function
        The function to apply.
    dataset : array_like or HDF5 dataset
        The input dataset.
    block_size : int, optional
        The size (in number of items along the first axis) of the blocks
        passed to `f`.
    out : array_like or HDF5 dataset, optional
        If given, used to store the output.

    Returns
    -------
    out : ndarray
        The result of applying `f` to `dataset` blockwise.

    """

    # determine block size
    if block_size is None:
        if hasattr(dataset, 'chunks') and dataset.chunks is not None:
            # use dataset chunk size along first axis
            block_size = dataset.chunks[0]
        else:
            # use arbitrary number
            block_size = 1000

    # determine total size along first axis
    dim_size = dataset.shape[0]

    # iterate over blocks
    for block_start in range(0, dim_size, block_size):
        block_stop = min(block_start + block_size, dim_size)

        # load input block
        x = dataset[block_start:block_stop, ...]

        # compute output block
        y = f(x)

        if out is None:
            # initialise output array, using the first output block to
            # determine the shape and dtype of the remaining dimensions
            out_shape = list(y.shape)
            out_shape[0] = dim_size
            out = np.empty(out_shape, y.dtype)

        # store output block
        out[block_start:block_stop, ...] = y

    return out
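
# Usage sketch (hypothetical): count non-reference alleles per genotype call,
# one block of variants at a time. The file name and dataset path are made-up
# names for illustration; missing calls coded as -1 are clipped to 0 here.
def _demo_block_apply():
    with h5py.File('callset.h5', mode='r') as h5f:
        genotypes = h5f['calldata/genotype']
        gn = block_apply(lambda g: np.clip(g, 0, None).sum(axis=2),
                         genotypes)
    return gn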
def block_take2d(dataset, row_indices, col_indices=None, block_size=None):
    """Select rows and optionally columns from a numpy array or HDF5 dataset
    with 2 or more dimensions.

    Parameters
    ----------
    dataset : array_like or HDF5 dataset
        The input dataset.
    row_indices : sequence of ints
        The indices of the selected rows. N.B., will be sorted in ascending
        order.
    col_indices : sequence of ints, optional
        The indices of the selected columns. If not provided, all columns
        will be returned.
    block_size : int, optional
        The size (in number of rows) of the block of data to process at a
        time.

    Returns
    -------
    out : ndarray
        An array containing the selected rows and columns.

    See Also
    --------
    anhima.util.block_compress2d, anhima.h5.take2d_pointsel

    Notes
    -----
    This function is mainly a work-around for the fact that fancy indexing
    via h5py is currently slow, and fancy indexing along more than one axis
    is not supported. The function works by reading the dataset in blocks of
    `block_size` rows, and processing each block in memory using numpy.

    """

    # N.B., make sure row_indices are sorted, without modifying the input
    row_indices = np.sort(np.asarray(row_indices))

    # how many rows are we selecting?
    n_rows_in = dataset.shape[0]
    n_rows_out = len(row_indices)

    # how many columns are we selecting?
    n_cols_in = dataset.shape[1]
    if col_indices is not None:
        n_cols_out = len(col_indices)
    else:
        n_cols_out = n_cols_in

    # set up output array
    out_shape = (n_rows_out, n_cols_out) + dataset.shape[2:]
    out = np.empty(out_shape, dtype=dataset.dtype)

    # determine block size
    if block_size is None:
        if hasattr(dataset, 'chunks') and dataset.chunks is not None:
            # use dataset chunk height
            block_size = dataset.chunks[0]
        else:
            # use arbitrary number
            block_size = 1000

    # iterate block-wise
    offset = 0
    for block_start in range(0, n_rows_in, block_size):
        block_stop = min(block_start + block_size, n_rows_in)

        # how many indices to process in this block?
        i = np.searchsorted(row_indices, block_start)
        j = np.searchsorted(row_indices, block_stop)
        n = j - i
        ridx = row_indices[i:j]

        # only do anything if there are indices for this block
        if n:

            # load data for this block
            a = dataset[block_start:block_stop]

            # take rows
            b = np.take(a, ridx - block_start, axis=0)

            # take columns
            if col_indices is not None:
                b = np.take(b, col_indices, axis=1)

            # store output
            out[offset:offset+n, ...] = b

            # keep track of offset
            offset += n

    return out
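
# Usage sketch (hypothetical): select a random subset of rows (variants) and
# columns (samples) from a large on-disk dataset. The file name and dataset
# path are made-up names for illustration.
def _demo_block_take2d():
    with h5py.File('callset.h5', mode='r') as h5f:
        genotypes = h5f['calldata/genotype']
        n_variants, n_samples = genotypes.shape[:2]
        row_indices = random.sample(range(n_variants), 100)
        col_indices = sorted(random.sample(range(n_samples), 10))
        out = block_take2d(genotypes, row_indices, col_indices)
    assert out.shape[:2] == (100, 10)
    return out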
def take2d_pointsel(dataset, row_indices=None, col_indices=None,
                    block_size=1000):
    """Load selected rows and optionally columns from a 2-dimensional HDF5
    dataset, using HDF5 point selections.

    Parameters
    ----------
    dataset : HDF5 dataset
        The dataset to load data from.
    row_indices : sequence of ints, optional
        The indices of the selected rows. If not provided, all rows will be
        returned.
    col_indices : sequence of ints, optional
        The indices of the selected columns. If not provided, all columns
        will be returned.
    block_size : int, optional
        The size (in number of points) of the block of data to load and
        process at a time.

    Returns
    -------
    out : ndarray
        An array containing the selected rows and columns.

    See Also
    --------
    anhima.util.block_take2d

    Notes
    -----
    This function is similar to :func:`anhima.util.block_take2d` but uses an
    HDF5 point selection under the hood. Performance characteristics will be
    different and may be much better or much worse, depending on the size,
    shape and configuration of the dataset, and depending on the number of
    points to be selected.

    """

    # determine selected rows
    n_rows_in = dataset.shape[0]
    if row_indices is not None:
        row_indices = sorted(row_indices)
        n_rows_out = len(row_indices)
    else:
        # select all rows
        row_indices = range(n_rows_in)
        n_rows_out = n_rows_in

    # determine selected columns
    n_cols_in = dataset.shape[1]
    if col_indices is not None:
        col_indices = sorted(col_indices)
        n_cols_out = len(col_indices)
    else:
        # select all columns
        col_indices = range(n_cols_in)
        n_cols_out = n_cols_in

    n_items_out = n_rows_out * n_cols_out

    # initialise output array
    out = np.empty((n_items_out,), dtype=dataset.dtype)

    # convert indices into coordinates
    coords = itertools.product(row_indices, col_indices)

    # set up the point selection
    # N.B., this relies on private h5py API and may break in newer h5py
    # versions
    sel = h5py._hl.selections.PointSelection(dataset.shape)
    typ = h5py.h5t.py_create(dataset.dtype)

    # process a block of points at a time
    for block_start in range(0, n_items_out, block_size):

        # materialise a block of coordinates
        selection = np.asarray(list(itertools.islice(coords, block_size)))

        # set selection
        sel.set(selection)

        # read data directly into the output array
        block_stop = block_start + len(selection)
        space = h5py.h5s.create_simple(sel.mshape)
        dataset.id.read(space, sel._id, out[block_start:block_stop], typ)

    # reshape output array
    out = out.reshape(n_rows_out, n_cols_out)

    return out
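
# Usage sketch (hypothetical): load a handful of scattered values via point
# selection; worth benchmarking against block_take2d on a real callset. The
# file name and dataset path are made-up names for illustration.
def _demo_take2d_pointsel():
    with h5py.File('callset.h5', mode='r') as h5f:
        gq = h5f['calldata/GQ']  # a 2-dimensional dataset
        out = take2d_pointsel(gq, row_indices=[1, 5, 100],
                              col_indices=[0, 3])
    assert out.shape == (3, 2)
    return out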