Example #1
File: builders.py  Project: bcajes/hail
    def _finish(self, default):
        assert len(self._cases) > 0

        from hail.expr.functions import cond

        expr = default
        for conditional, then in self._cases[::-1]:
            expr = cond(conditional, then, expr, missing_false=self._missing_false)
        return expr
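The loop above folds the recorded cases from last to first, so the first registered case ends up as the outermost conditional. Below is a minimal standalone sketch of the same fold, with a hypothetical if_else helper and Case class standing in for Hail's cond expression and case builder (not Hail's API):

# Minimal sketch (hypothetical names, not Hail's API): each (condition, then)
# pair wraps the expression built so far, so the first registered case is
# the outermost test.
def if_else(condition, then, otherwise):
    return then if condition else otherwise

class Case:
    def __init__(self):
        self._cases = []

    def when(self, condition, then):
        self._cases.append((condition, then))
        return self

    def finish(self, default):
        expr = default
        for condition, then in self._cases[::-1]:
            expr = if_else(condition, then, expr)
        return expr

# Equivalent to if_else(False, 'a', if_else(True, 'b', 'z')) -> 'b':
# the first case fails, the second matches, 'z' would be the fallback.
print(Case().when(False, 'a').when(True, 'b').finish('z'))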
Example #2
def hwe_normalized_pca(dataset, k=10, compute_loadings=False, as_array=False):
    """Run principal component analysis (PCA) on the Hardy-Weinberg-normalized call matrix.

    Examples
    --------

    >>> eigenvalues, scores, loadings = methods.hwe_normalized_pca(dataset, k=5)

    Notes
    -----
    Variants that are all homozygous reference or all homozygous variant are removed before evaluation.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    k : :obj:`int`
        Number of principal components.
    compute_loadings : :obj:`bool`
        If ``True``, compute row loadings.
    as_array : :obj:`bool`
        If ``True``, return scores and loadings as an array field. If ``False``, return
        one field per element (`PC1`, `PC2`, ... `PCk`).

    Returns
    -------
    (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`)
        List of eigenvalues, table with column scores, table with row loadings.
    """

    dataset = dataset.annotate_rows(AC=agg.sum(dataset.GT.num_alt_alleles()),
                                    n_called=agg.count_where(
                                        functions.is_defined(dataset.GT)))
    dataset = dataset.filter_rows(
        (dataset.AC > 0) & (dataset.AC < 2 * dataset.n_called)).persist()

    n_variants = dataset.count_rows()
    if n_variants == 0:
        raise FatalError(
            "Cannot run PCA: found 0 variants after filtering out monomorphic sites."
        )
    info("Running PCA using {} variants.".format(n_variants))

    entry_expr = functions.bind(
        dataset.AC / dataset.n_called,
        lambda mean_gt: functions.cond(
            functions.is_defined(dataset.GT),
            (dataset.GT.num_alt_alleles() - mean_gt)
            / functions.sqrt(mean_gt * (2 - mean_gt) * n_variants / 2),
            0))
    result = pca(entry_expr, k, compute_loadings, as_array)
    dataset.unpersist()
    return result
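The entry_expr above standardizes each call by subtracting the variant's mean genotype and dividing by sqrt(mean_gt * (2 - mean_gt) * n_variants / 2), with missing calls set to 0. A rough NumPy sketch of that normalization and of PCA via an eigendecomposition follows (illustration only; hwe_normalize is a hypothetical name and this is not how Hail evaluates the expression):

import numpy as np

# Hypothetical dense sketch of the normalization in entry_expr (not Hail's implementation).
# C: n_samples x n_variants matrix of alt-allele counts (0, 1, 2); np.nan marks missing calls.
def hwe_normalize(C):
    n_variants = C.shape[1]
    mean_gt = np.nanmean(C, axis=0)                     # AC / n_called per variant, i.e. 2 * p_j
    denom = np.sqrt(mean_gt * (2 - mean_gt) * n_variants / 2)
    M = (C - mean_gt) / denom                           # standardize each entry
    return np.nan_to_num(M)                             # missing genotypes -> 0 (mean imputation)

C = np.array([[0., 1., 2.],
              [1., np.nan, 2.],
              [2., 0., 1.]])
M = hwe_normalize(C)
# Principal components: eigendecompose the sample-by-sample matrix M @ M.T.
eigenvalues, eigenvectors = np.linalg.eigh(M @ M.T)     # eigh returns eigenvalues in ascending order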
Example #3
File: builders.py  Project: bcajes/hail
def f(base):
    # build the cond chain bottom-up; default, self._cases, and cond
    # come from the enclosing builder scope (see Example #1)
    expr = default
    for condition, then in self._cases[::-1]:
        expr = cond(condition, then, expr)
    return expr
Example #4
def grm(dataset):
    """Compute the Genetic Relatedness Matrix (GRM).

    .. include:: ../_templates/req_tvariant.rst
    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    >>> km = methods.grm(dataset)

    Notes
    -----

    The genetic relationship matrix (GRM) :math:`G` encodes genetic correlation
    between each pair of samples. It is defined by :math:`G = MM^T` where
    :math:`M` is a standardized version of the genotype matrix, computed as
    follows. Let :math:`C` be the :math:`n \\times m` matrix of raw genotypes
    in the variant dataset, with rows indexed by :math:`n` samples and columns
    indexed by :math:`m` biallelic autosomal variants; :math:`C_{ij}` is the
    number of alternate alleles of variant :math:`j` carried by sample
    :math:`i`, which can be 0, 1, 2, or missing. For each variant :math:`j`,
    the sample alternate allele frequency :math:`p_j` is computed as half the
    mean of the non-missing entries of column :math:`j`. Entries of :math:`M`
    are then mean-centered and variance-normalized as

    .. math::

        M_{ij} = \\frac{C_{ij}-2p_j}{\\sqrt{2p_j(1-p_j)m}},

    with :math:`M_{ij} = 0` for :math:`C_{ij}` missing (i.e. mean genotype
    imputation). This scaling normalizes genotype variances to a common value
    :math:`1/m` for variants in Hardy-Weinberg equilibrium and is further
    motivated in the paper `Patterson, Price and Reich, 2006
    <http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.0020190>`__.
    (The resulting amplification of signal from the low end of the allele
    frequency spectrum will also introduce noise for rare variants; common
    practice is to filter out variants with minor allele frequency below some
    cutoff.) The factor :math:`1/m` gives each sample row approximately unit
    total variance (assuming linkage equilibrium) so that the diagonal entries
    of the GRM are approximately 1. Equivalently,

    .. math::

        G_{ik} = \\frac{1}{m} \\sum_{j=1}^m \\frac{(C_{ij}-2p_j)(C_{kj}-2p_j)}{2 p_j (1-p_j)}

    Warning
    -------
    Since Hardy-Weinberg normalization cannot be applied to variants that
    contain only reference alleles or only alternate alleles, all such variants
    are removed prior to calculating the GRM.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset to compute the GRM from.

    Returns
    -------
    :class:`genetics.KinshipMatrix`
        Genetic Relatedness Matrix for all samples.
    """

    dataset = dataset.annotate_rows(AC=agg.sum(dataset.GT.num_alt_alleles()),
                                    n_called=agg.count_where(
                                        functions.is_defined(dataset.GT)))
    dataset = dataset.filter_rows(
        (dataset.AC > 0) & (dataset.AC < 2 * dataset.n_called)).persist()

    n_variants = dataset.count_rows()
    if n_variants == 0:
        raise FatalError(
            "Cannot run GRM: found 0 variants after filtering out monomorphic sites."
        )
    info("Computing GRM using {} variants.".format(n_variants))

    normalized_genotype_expr = functions.bind(
        dataset.AC / dataset.n_called,
        lambda mean_gt: functions.cond(
            functions.is_defined(dataset.GT),
            (dataset.GT.num_alt_alleles() - mean_gt)
            / functions.sqrt(mean_gt * (2 - mean_gt) * n_variants / 2),
            0))

    bm = BlockMatrix.from_matrix_table(normalized_genotype_expr)
    dataset.unpersist()
    grm = bm.T.dot(bm)

    return KinshipMatrix._from_block_matrix(
        dataset.colkey_schema,
        grm,
        [row.s for row in dataset.cols_table().select('s').collect()],
        n_variants)
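For intuition, the G = M M^T definition from the notes can be reproduced on a small dense matrix with NumPy (an illustrative sketch with a hypothetical grm_dense helper; Hail computes this with a distributed BlockMatrix). As the warning above notes, monomorphic variants (p_j of 0 or 1) must be filtered out first so the denominator is nonzero.

import numpy as np

# Dense sketch of the GRM formula from the notes (not Hail's BlockMatrix implementation).
# C: n_samples x m_variants matrix of alt-allele counts; np.nan marks missing calls.
def grm_dense(C):
    m = C.shape[1]
    p = np.nanmean(C, axis=0) / 2                       # sample alt-allele frequency p_j
    M = (C - 2 * p) / np.sqrt(2 * p * (1 - p) * m)      # standardized genotype matrix
    M = np.nan_to_num(M)                                # mean-impute missing entries as 0
    return M @ M.T                                      # G = M M^T, n_samples x n_samples

C = np.array([[0., 1., 2.],
              [1., 2., 0.],
              [2., 1., 1.]])
G = grm_dense(C)                                        # pairwise genetic relatedness of the 3 samples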