Пример #1
0
    def dataframe_to_keytable(self, df, keys=[]):
        """Convert Spark SQL DataFrame to KeyTable.

        Spark SQL data types are converted to Hail types in the obvious way as follows:

        .. code-block:: text

          BooleanType => Boolean
          IntegerType => Int
          LongType => Long
          FloatType => Float
          DoubleType => Double
          StringType => String
          BinaryType => Binary
          ArrayType => Array
          StructType => Struct

        Unlisted Spark SQL data types are currently unsupported.

        :param keys: List of key column names.
        :type keys: list of string

        :return: The DataFrame as a KeyTable.
        :rtype: :class:`.KeyTable`
        """

        jkeys = jarray(self._jvm.java.lang.String, keys)
        return KeyTable(self,
                        self._hail.keytable.KeyTable.fromDF(df._jdf, jkeys))
Пример #2
0
    def _run_command(self, vds, pargs):
        jargs = jarray(self._jvm.java.lang.String, pargs)
        t = self._hail.driver.ToplevelCommands.lookup(jargs)
        cmd = t._1()
        cmd_args = t._2()
        jstate = self._jstate(vds._jvds if vds != None else None)

        try:
            result = cmd.run(jstate, cmd_args)
        except Py4JJavaError as e:
            raise_py4j_exception(e)

        return VariantDataset(self, result.vds())
Пример #3
0
    def import_keytable(self, path, key_names, npartitions=None, config=None):
        """Import delimited text file (text table) as KeyTable.

        :param path: files to import.
        :type path: str or list of str

        :param key_names: The name(s) of fields to be considered keys
        :type key_names: str or list of str

        :param npartitions: Number of partitions.
        :type npartitions: int or None

        :param config: Configuration options for importing text files
        :type config: :class:`.TextTableConfig` or None

        :rtype: :class:`.KeyTable`
        """

        path_args = []
        if isinstance(path, str):
            path_args.append(path)
        else:
            for p in path:
                path_args.append(p)

        if not isinstance(key_names, str):
            key_names = ','.join(key_names)

        if not npartitions:
            npartitions = self.sc.defaultMinPartitions

        if not config:
            config = TextTableConfig()

        return KeyTable(
            self,
            self._hail.keytable.KeyTable.importTextTable(
                self._jsc, jarray(self._jvm.java.lang.String, path_args),
                key_names, npartitions, config._to_java()))
Пример #4
0
    def balding_nichols_model(self,
                              populations,
                              samples,
                              variants,
                              partitions=None,
                              pop_dist=None,
                              fst=None,
                              af_dist=UniformDist(0.1, 0.9),
                              seed=0):
        """Generate a VariantDataset using the Balding-Nichols model.

        **Examples**

        To generate a VDS with 3 populations, 100 samples in total, and 1000 variants:

        >>> vds = hc.balding_nichols_model(3, 100, 1000)

        To generate a VDS with 4 populations, 2000 samples, 5000 variants, 10 partitions, population distribution [0.1, 0.2, 0.3, 0.4], :math:`F_st` values [.02, .06, .04, .12], ancestral allele frequencies drawn from a truncated beta distribution with a = .01 and b = .05 over the interval [0.05, 1], and random seed 1:

        >>> vds = hc.balding_nichols_model(4, 40, 150, 10,
        >>>     pop_dist=[0.1, 0.2, 0.3, 0.4],
        >>>     fst=[.02, .06, .04, .12],
        >>>     af_dist=TruncatedBetaDist(a=0.01, b=2.0, minVal=0.05, maxVal=1.0),
        >>>     seed=1)

        **Notes**

        Hail is able to randomly generate a VDS using the Balding-Nichols model.

        - :math:`K` populations are labeled by integers 0, 1, ..., K - 1
        - :math:`N` samples are named by strings 0, 1, ..., N - 1
        - :math:`M` variants are defined as ``1:1:A:C``, ``1:2:A:C``, ..., ``1:M:A:C``
        - The default ancestral frequency distribution :math:`P_0` is uniform on [0.1, 0.9]. Options are UniformDist(minVal, maxVal), BetaDist(a, b), and TruncatedBetaDist(a, b, minVal, maxVal)
        - The population distribution :math:`\pi` defaults to uniform
        - The :math:`F_{st}` values default to 0.1
        - The number of partitions defaults to one partition per million genotypes (i.e., samples * variants / 10^6) or 8, whichever is larger

        The Balding-Nichols model models genotypes of individuals from a structured population comprising :math:`K` homogeneous subpopulations
        that have each diverged from a single ancestral population (a `star phylogeny`). We take :math:`N` samples and :math:`M` bi-allelic variants in perfect
        linkage equilibrium. The relative sizes of the subpopulations are given by a probability vector :math:`\pi`; the ancestral allele frequencies are
        drawn independently from a frequency spectrum :math:`P_0`; the subpopulations have diverged with possibly different :math:`F_{ST}` parameters :math:`F_k`
        (here and below, lowercase indices run over a range bounded by the corresponding uppercase parameter, e.g. :math:`k = 1, \ldots, K`).
        For each variant, the subpopulation allele frequencies are drawn a `beta distribution <https://en.wikipedia.org/wiki/Beta_distribution>`_, a useful continuous approximation of
        the effect of genetic drift. We denote the individual subpopulation memberships by :math:`k_n`, the ancestral allele frequences by :math:`p_{0, m}`,
        the subpopulation allele frequencies by :math:`p_{k, m}`, and the genotypes by :math:`g_{n, m}`. The generative model in then given by:

        .. math::
            k_n \,&\sim\, \pi

            p_{0,m}\,&\sim\, P_0

            p_{k,m}\mid p_{0,m}\,&\sim\, \mathrm{Beta}(\mu = p_{0,m},\, \sigma^2 = F_k p_{0,m}(1 - p_{0,m}))

            g_{n,m}\mid k_n, p_{k, m} \,&\sim\, \mathrm{Binomial}(2, p_{k_n, m})

        We have parametrized the beta distribution by its mean and variance; the usual parameters are :math:`a = (1 - p)(1 - F)/F,\; b = p(1-F)/F` with :math:`F = F_k,\; p = p_{0,m}`.

        **Annotations**

        :py:meth:`~hail.HailContext.balding_nichols_model` adds the following global, sample, and variant annotations:

         - **global.nPops** (*Int*) -- Number of populations
         - **global.nSamples** (*Int*) -- Number of samples
         - **global.nVariants** (*Int*) -- Number of variants
         - **global.popDist** (*Array[Double]*) -- Normalized population distribution indexed by population
         - **global.Fst** (*Array[Double]*) -- F_st values indexed by population
         - **global.seed** (*Int*) -- Random seed
         - **global.ancestralAFDist** (*Struct*) -- Information about ancestral allele frequency distribution
         - **sa.pop** (*Int*) -- Population of sample
         - **va.ancestralAF** (*Double*) -- Ancestral allele frequency
         - **va.AF** (*Array[Double]*) -- Allele frequency indexed by population

        :param int populations: Number of populations.

        :param int samples: Number of samples.

        :param int variants: Number of variants.

        :param int partitions: Number of partitions.

        :param pop_dist: Unnormalized population distribution
        :type pop_dist: array of float or None

        :param fst: F_st values
        :type fst: array of float or None

        :param af_dist: Ancestral allele frequency distribution
        :type af_dist: :class:`.UniformDist` or :class:`.BetaDist` or :class:`.TruncatedBetaDist`

        :param int seed: Random seed.

        :rtype: :class:`.VariantDataset`
        :return: A VariantDataset generated by the Balding-Nichols model.
        """

        if pop_dist is None:
            jvm_pop_dist_opt = joption(pop_dist)
        else:
            jvm_pop_dist_opt = joption(jarray(self._jvm.double, pop_dist))

        if fst is None:
            jvm_fst_opt = joption(fst)
        else:
            jvm_fst_opt = joption(jarray(self._jvm.double, fst))

        return VariantDataset(
            self,
            self._hail.stats.BaldingNicholsModel.apply(self._jsc, populations,
                                                       samples, variants,
                                                       jvm_pop_dist_opt,
                                                       jvm_fst_opt, seed,
                                                       joption(partitions),
                                                       af_dist._jrep()))