Example #1
    def import_annotations_table(self,
                                 path,
                                 variant_expr,
                                 code=None,
                                 npartitions=None,
                                 config=TextTableConfig()):
        """Import variants and variant annotations from a delimited text file
        (text table) as a sites-only VariantDataset.

        :param path: The files to import.
        :type path: str or list of str

        :param str variant_expr: Expression to construct a variant
            from a row of the text table.  Must have type Variant.

        :param code: Expression to build the variant annotations.
        :type code: str or None

        :param npartitions: Number of partitions.
        :type npartitions: int or None

        :param config: Configuration options for importing text files
        :type config: :class:`.TextTableConfig`

        :rtype: :class:`.VariantDataset`
        """

        jvds = self._jhc.importAnnotationsTables(jindexed_seq_args(path),
                                                 variant_expr, joption(code),
                                                 joption(npartitions),
                                                 config._to_java())
        return VariantDataset(self, jvds)
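
A hedged usage sketch (not from the source): the path, the column names, and the ``Variant(...)`` constructor expression below are illustrative assumptions about a tab-delimited annotation table.

>>> vds = hc.import_annotations_table(
...     'data/annotations.tsv',
...     variant_expr='Variant(Chromosome, Position.toInt(), Ref, Alt)',
...     npartitions=8)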
Example #2
File: context.py (project: Fedja/hail)
    def read(self, path, sites_only=False, samples_only=False):
        """Read .vds files as variant dataset.

        When loading multiple .vds files, they must have the same
        sample IDs, genotype schema, split status and variant metadata.

        :param path: .vds files to read.
        :type path: str or list of str

        :param bool sites_only: If True, create sites-only variant
          dataset.  Don't load sample ids, sample annotations
          or genotypes.

        :param bool samples_only: If True, create samples-only variant
          dataset (no variants or genotypes).

        :return: Variant dataset read from disk.
        :rtype: :class:`.VariantDataset`
        """

        metadata = self._jhc.readAllMetadata(jindexed_seq_args(path))
        is_generic_genotype = metadata[0]._1().isGenericGenotype()

        if is_generic_genotype:
            jvds = self._jhc.readAllGDS(jindexed_seq_args(path), sites_only,
                                        samples_only, joption(metadata))
        else:
            jvds = self._jhc.readAll(jindexed_seq_args(path), sites_only,
                                     samples_only, joption(metadata))

        return VariantDataset(self, jvds)
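
A hedged usage sketch; the .vds paths are illustrative.

>>> vds = hc.read(['data/chr1.vds', 'data/chr2.vds'], sites_only=True)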
Example #3
File: context.py (project: ttasa/hail)
    def read(self, path, drop_samples=False, drop_variants=False):
        """Read .vds files as variant dataset.

        When loading multiple VDS files, they must have the same
        sample IDs, genotype schema, split status and variant metadata.

        :param path: VDS files to read.
        :type path: str or list of str

        :param bool drop_samples: If True, create sites-only variant
          dataset.  Don't load sample ids, sample annotations
          or genotypes.

        :param bool drop_variants: If True, create samples-only variant
          dataset (no variants or genotypes).

        :return: Variant dataset read from disk.
        :rtype: :class:`.VariantDataset`

        """

        return VariantDataset(
            self,
            self._jhc.readAll(jindexed_seq_args(path), drop_samples,
                              drop_variants))
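
The same read, sketched against this version's renamed flags; the paths are again illustrative.

>>> vds = hc.read(['data/chr1.vds', 'data/chr2.vds'], drop_samples=True)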
Example #4
    def import_matrix(self, path, min_partitions=None, drop_samples=False, cell_type=None, missing="NA", has_row_id_name=False):
        """
        :param path: File(s) to read. Currently, takes 1 header line of column ids and subsequent lines of rowID, data... in TSV form where data can be parsed as an integer.
        :type path: str or list of str

        :param min_partitions: Number of partitions.
        :type min_partitions: int or None

        :param bool drop_samples: If True, load only the row IDs; no cell data is loaded. Default: False

        :param cell_type: Type with which to parse cell data. One of TInt32, TInt64, TFloat32, TFloat64, or TString. Default: TInt64
        :type cell_type: Hail type or None

        :param str missing: Notation for a cell with a missing value. Default: "NA"

        :param bool has_row_id_name: Whether the table header has an entry for the row ID column. Default: False

        :return: Variant dataset imported from file(s)
        :rtype: :py:class:`.VariantDataset`
        """

        if not cell_type:
            cell_type = TInt64()
        return VariantDataset(self,
                              self._jhc.importMatrices(jindexed_seq_args(path),
                                                       joption(min_partitions),
                                                       drop_samples,
                                                       cell_type._jtype,
                                                       missing,
                                                       has_row_id_name))
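
A hedged usage sketch; the path and the missing-value notation are illustrative, and cells are parsed with the default TInt64 type.

>>> vds = hc.import_matrix('data/matrix.tsv', missing='.')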
Example #5
    def import_gen(self, path, sample_file=None, tolerance=0.2, min_partitions=None, chromosome=None):
        """Import .gen file(s) as variant dataset.

        **Examples**

        Read a .gen file and a .sample file and write to a .vds file:

        >>> (hc.import_gen('data/example.gen', sample_file='data/example.sample')
        ...    .write('output/gen_example1.vds'))

        Load multiple files at the same time with :ref:`Hadoop glob patterns <sec-hadoop-glob>`:

        >>> (hc.import_gen('data/example.chr*.gen', sample_file='data/example.sample')
        ...    .write('output/gen_example2.vds'))

        **Notes**

        For more information on the .gen file format, see `here <http://www.stats.ox.ac.uk/%7Emarchini/software/gwas/file_format.html#mozTocId40300>`__.

        To ensure that the .gen file(s) and .sample file are correctly prepared for import:

        - If there are only 5 columns before the start of the genotype probability data (chromosome field is missing), you must specify the chromosome using the ``chromosome`` parameter

        - No duplicate sample IDs are allowed

        The first column in the .sample file is used as the sample ID ``s``.

        Also, see the section in :py:meth:`~hail.HailContext.import_bgen` linked :ref:`here <gpfilters>` for information about Hail's genotype probability representation.

        **Annotations**

        :py:meth:`~hail.HailContext.import_gen` adds the following variant annotations:

         - **va.varid** (*String*) -- 2nd column of .gen file if chromosome present, otherwise 1st column.

         - **va.rsid** (*String*) -- 3rd column of .gen file if chromosome present, otherwise 2nd column.

        :param path: .gen files to import.
        :type path: str or list of str

        :param str sample_file: The sample file.

        :param float tolerance: If the sum of the genotype probabilities for a genotype differs from 1.0 by more than the tolerance, set the genotype to missing.

        :param min_partitions: Number of partitions.
        :type min_partitions: int or None

        :param chromosome: Chromosome if not listed in the .gen file.
        :type chromosome: str or None

        :return: Variant dataset imported from .gen and .sample files.
        :rtype: :class:`.VariantDataset`
        """

        jvds = self._jhc.importGens(jindexed_seq_args(path), sample_file, joption(chromosome), joption(min_partitions),
                                    tolerance)
        return VariantDataset(self, jvds)
Example #6
    def _run_command(self, vds, pargs):
        # Look up the legacy driver command named by the string arguments.
        jargs = jarray(self._jvm.java.lang.String, pargs)
        t = self._hail.driver.ToplevelCommands.lookup(jargs)
        cmd = t._1()
        cmd_args = t._2()
        # Build driver state, passing the underlying Java VDS if one was given.
        jstate = self._jstate(vds._jvds if vds is not None else None)

        try:
            result = cmd.run(jstate, cmd_args)
        except Py4JJavaError as e:
            # Surface the Java exception as a Python-friendly error.
            raise_py4j_exception(e)

        return VariantDataset(self, result.vds())
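
A hedged usage sketch for this private dispatcher; ``count`` is an assumed legacy command name, not confirmed by the source.

>>> vds = hc._run_command(vds, ['count'])  # hypothetical command invocation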
Example #7
    def read(self, path, drop_samples=False, drop_variants=False):
        """Read .vds file as a variant dataset.

        :param str path: VDS file to read.

        :param bool drop_samples: If True, create sites-only variant
          dataset.  Don't load sample ids, sample annotations
          or genotypes.

        :param bool drop_variants: If True, create samples-only variant
          dataset (no variants or genotypes).

        :return: Variant dataset read from disk.
        :rtype: :class:`.VariantDataset`

        """

        return VariantDataset(
            self,
            self._jhc.read(path, drop_samples, drop_variants))
Example #8
    def read(self, path, sites_only=False, samples_only=False):
        """Read .vds files as VariantDataset

        When loading multiple .vds files, they must have the same
        sample IDs, split status and variant metadata.

        :param path: .vds files to read.
        :type path: str or list of str

        :param bool sites_only: If True, create sites-only
          dataset.  Don't load sample ids, sample annotations
          or genotypes.

        :param bool samples_only: If True, create samples-only
          dataset (no variants or genotypes).

        :return: A dataset read from disk
        :rtype: :class:`.VariantDataset`
        """

        jvds = self._jhc.readAll(jindexed_seq_args(path), sites_only,
                                 samples_only)
        return VariantDataset(self, jvds)
Example #9
    def balding_nichols_model(self, populations, samples, variants, num_partitions=None,
                              pop_dist=None, fst=None, af_dist=UniformDist(0.1, 0.9),
                              seed=0):
        """Simulate a variant dataset using the Balding-Nichols model.

        **Examples**

        To generate a VDS with 3 populations, 100 samples in total, and 1000 variants:

        >>> vds = hc.balding_nichols_model(3, 100, 1000)

        To generate a VDS with 4 populations, 40 samples, 150 variants, 10 partitions, population distribution [0.1, 0.2, 0.3, 0.4], :math:`F_{ST}` values [.02, .06, .04, .12], ancestral allele frequencies drawn from a truncated beta distribution with a = 0.01 and b = 2.0 over the interval [0.05, 1], and random seed 1:

        >>> from hail.stats import TruncatedBetaDist
        >>> vds = hc.balding_nichols_model(4, 40, 150, 10,
        ...                                pop_dist=[0.1, 0.2, 0.3, 0.4],
        ...                                fst=[.02, .06, .04, .12],
        ...                                af_dist=TruncatedBetaDist(a=0.01, b=2.0, minVal=0.05, maxVal=1.0),
        ...                                seed=1)

        **Notes**

        Hail is able to randomly generate a VDS using the Balding-Nichols model.

        - :math:`K` populations are labeled by integers 0, 1, ..., K - 1
        - :math:`N` samples are named by strings 0, 1, ..., N - 1
        - :math:`M` variants are defined as ``1:1:A:C``, ``1:2:A:C``, ..., ``1:M:A:C``
        - The default ancestral frequency distribution :math:`P_0` is uniform on [0.1, 0.9]. Options are UniformDist(minVal, maxVal), BetaDist(a, b), and TruncatedBetaDist(a, b, minVal, maxVal). All three classes are located in hail.stats.
        - The population distribution :math:`\pi` defaults to uniform
        - The :math:`F_{ST}` values default to 0.1
        - The number of partitions defaults to one partition per million genotypes (i.e., samples * variants / 10^6) or 8, whichever is larger

        The Balding-Nichols model models genotypes of individuals from a structured population comprising :math:`K` homogeneous subpopulations
        that have each diverged from a single ancestral population (a `star phylogeny`). We take :math:`N` samples and :math:`M` bi-allelic variants in perfect
        linkage equilibrium. The relative sizes of the subpopulations are given by a probability vector :math:`\pi`; the ancestral allele frequencies are
        drawn independently from a frequency spectrum :math:`P_0`; the subpopulations have diverged with possibly different :math:`F_{ST}` parameters :math:`F_k`
        (here and below, lowercase indices run over a range bounded by the corresponding uppercase parameter, e.g. :math:`k = 1, \ldots, K`).
        For each variant, the subpopulation allele frequencies are drawn from a `beta distribution <https://en.wikipedia.org/wiki/Beta_distribution>`__, a useful continuous approximation of
        the effect of genetic drift. We denote the individual subpopulation memberships by :math:`k_n`, the ancestral allele frequencies by :math:`p_{0, m}`,
        the subpopulation allele frequencies by :math:`p_{k, m}`, and the genotypes by :math:`g_{n, m}`. The generative model is then given by:

        .. math::
            k_n \,&\sim\, \pi

            p_{0,m}\,&\sim\, P_0

            p_{k,m}\mid p_{0,m}\,&\sim\, \mathrm{Beta}(\mu = p_{0,m},\, \sigma^2 = F_k p_{0,m}(1 - p_{0,m}))

            g_{n,m}\mid k_n, p_{k, m} \,&\sim\, \mathrm{Binomial}(2, p_{k_n, m})

        We have parametrized the beta distribution by its mean and variance; the usual shape parameters are :math:`a = p(1 - F)/F,\; b = (1 - p)(1-F)/F` with :math:`F = F_k,\; p = p_{0,m}`, so that the mean :math:`a/(a+b)` equals :math:`p`.

        **Annotations**

        :py:meth:`~hail.HailContext.balding_nichols_model` adds the following global, sample, and variant annotations:

         - **global.nPops** (*Int*) -- Number of populations
         - **global.nSamples** (*Int*) -- Number of samples
         - **global.nVariants** (*Int*) -- Number of variants
         - **global.popDist** (*Array[Double]*) -- Normalized population distribution indexed by population
         - **global.Fst** (*Array[Double]*) -- :math:`F_{ST}` values indexed by population
         - **global.seed** (*Int*) -- Random seed
         - **global.ancestralAFDist** (*Struct*) -- Description of the ancestral allele frequency distribution
         - **sa.pop** (*Int*) -- Population of sample
         - **va.ancestralAF** (*Double*) -- Ancestral allele frequency
         - **va.AF** (*Array[Double]*) -- Allele frequency indexed by population

        :param int populations: Number of populations.

        :param int samples: Number of samples.

        :param int variants: Number of variants.

        :param int num_partitions: Number of partitions.

        :param pop_dist: Unnormalized population distribution
        :type pop_dist: array of float or None

        :param fst: :math:`F_{ST}` values
        :type fst: array of float or None

        :param af_dist: Ancestral allele frequency distribution
        :type af_dist: :class:`.UniformDist` or :class:`.BetaDist` or :class:`.TruncatedBetaDist`

        :param int seed: Random seed.

        :return: Variant dataset simulated using the Balding-Nichols model.
        :rtype: :class:`.VariantDataset`
        """

        if pop_dist is None:
            jvm_pop_dist_opt = joption(pop_dist)
        else:
            jvm_pop_dist_opt = joption(jarray(self._jvm.double, pop_dist))

        if fst is None:
            jvm_fst_opt = joption(fst)
        else:
            jvm_fst_opt = joption(jarray(self._jvm.double, fst))

        jvds = self._jhc.baldingNicholsModel(populations, samples, variants,
                                             joption(num_partitions),
                                             jvm_pop_dist_opt,
                                             jvm_fst_opt,
                                             af_dist._jrep(),
                                             seed)
        return VariantDataset(self, jvds)
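
A minimal sketch (an illustrative helper, not part of Hail) of the mean/variance parametrization used in the docstring above: moment matching recovers the usual Beta shape parameters, and with mean p and variance F*p*(1-p) it gives a = p(1-F)/F and b = (1-p)(1-F)/F.

>>> def beta_shape_params(mean, variance):
...     # Moment matching: with s = a + b, mean*(1 - mean)/variance = s + 1.
...     s = mean * (1 - mean) / variance - 1
...     return mean * s, (1 - mean) * s
>>> a, b = beta_shape_params(0.5, 0.1 * 0.5 * 0.5)  # p = 0.5, F = 0.1
>>> # a and b are (approximately) p*(1-F)/F = 4.5 and (1-p)*(1-F)/F = 4.5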
Example #10
    def import_vcf(self, path, force=False, force_bgz=False, header_file=None, min_partitions=None,
                   drop_samples=False, generic=False, call_fields=[]):
        """Import VCF file(s) as variant dataset.

        **Examples**

        >>> vds = hc.import_vcf('data/example2.vcf.bgz')

        **Notes**

        Hail is designed to be maximally compatible with files in the `VCF v4.2 spec <https://samtools.github.io/hts-specs/VCFv4.2.pdf>`__.

        :py:meth:`~hail.HailContext.import_vcf` takes a list of VCF files to load. All files must have the same header and the same set of samples in the same order
        (e.g., a variant dataset split by chromosome). Files can be specified as :ref:`Hadoop glob patterns <sec-hadoop-glob>`.

        Ensure that the VCF file is correctly prepared for import: VCFs should either be uncompressed (*.vcf*) or block compressed
        (*.vcf.bgz*).  If you have a large compressed VCF that ends in *.vcf.gz*, it is likely that the file is actually block-compressed,
        and you should rename the file to ".vcf.bgz" accordingly. If you actually have a standard gzipped file, it is possible to import
        it to Hail using the ``force`` optional parameter. However, this is not recommended -- all parsing will have to take place on one node because
        gzip decompression is not parallelizable. In this case, import could take significantly longer.

        If ``generic`` equals False (default), Hail makes certain assumptions about the genotype fields, see :class:`Representation <hail.representation.Genotype>`. On import, Hail filters
        (sets to no-call) any genotype that violates these assumptions. Hail interprets the format fields: GT, AD, OD, DP, GQ, PL; all others are
        silently dropped.

        If ``generic`` equals True, the genotype schema is a :py:class:`~hail.type.TStruct` with field names equal to the IDs of the FORMAT fields.
        The ``GT`` field is automatically read in as a :py:class:`~hail.type.TCall` type. To specify additional fields to import as a
        :py:class:`~hail.type.TCall` type, use the ``call_fields`` parameter. All other fields are imported as the type specified in the FORMAT header field.

        An example genotype schema after importing a VCF with ``generic=True`` is

        .. code-block:: text

            Struct {
                GT: Call,
                AD: Array[Int],
                DP: Int,
                GQ: Int,
                PL: Array[Int]
            }

        .. warning::

            - The variant dataset generated with ``generic=True`` will have significantly slower performance.

            - Not all :py:class:`.VariantDataset` methods will work with a generic genotype schema.

        :py:meth:`~hail.HailContext.import_vcf` does not perform deduplication - if the provided VCF(s) contain multiple records with the same chrom, pos, ref, alt, all
        these records will be imported and will not be collapsed into a single variant.

        Since Hail's genotype representation does not yet support ploidy other than 2,
        this method imports haploid genotypes as diploid. If ``generic=False``, Hail fills in missing indices
        in PL / PP arrays with 1000 to support the standard VCF / VDS genotype schema.

        Below are two example haploid genotypes and diploid equivalents that Hail sees.

        .. code-block:: text

            Haploid:     1:0,6:7:70:70,0
            Imported as: 1/1:0,6:7:70:70,1000,0

            Haploid:     2:0,0,9:9:24:24,40,0
            Imported as: 2/2:0,0,9:9:24:24,1000,40,1000:1000:0


        .. note::
            
            Using the **FILTER** field:
            
            The information in the FILTER field of a VCF is contained in the ``va.filters`` annotation.
            This annotation is a ``Set`` and can be queried for filter membership with expressions 
            like ``va.filters.contains("VQSRTranche99.5...")``. Variants that are flagged as "PASS" 
            will have no filters applied; for these variants, ``va.filters.isEmpty()`` is true. Thus, 
            filtering to PASS variants can be done with :py:meth:`.VariantDataset.filter_variants_expr`
            as follows:
            
            >>> pass_vds = vds.filter_variants_expr('va.filters.isEmpty()', keep=True)

        **Annotations**

        - **va.filters** (*Set[String]*) -- Set containing all filters applied to a variant. 
        - **va.rsid** (*String*) -- rsID of the variant.
        - **va.qual** (*Double*) -- Floating-point number in the QUAL field.
        - **va.info** (*Struct*) -- All INFO fields defined in the VCF header
          can be found in the struct ``va.info``. Data types match the type
          specified in the VCF header, and if the declared ``Number`` is not
          1, the result will be stored as an array.

        :param path: VCF file(s) to read.
        :type path: str or list of str

        :param bool force: If True, load .gz files serially. This means that no downstream operations
            can be parallelized, so using this mode is strongly discouraged for VCFs larger than a few MB.

        :param bool force_bgz: If True, load .gz files as blocked gzip files (BGZF)

        :param header_file: File to load VCF header from.  If not specified, the first file in path is used.
        :type header_file: str or None

        :param min_partitions: Number of partitions.
        :type min_partitions: int or None

        :param bool drop_samples: If True, create sites-only variant
            dataset.  Don't load sample ids, sample annotations or
            genotypes.

        :param call_fields: FORMAT fields in VCF to treat as a :py:class:`~hail.type.TCall`. Only applies if ``generic=True``.
        :type call_fields: str or list of str

        :param bool generic: If True, read the genotype with a generic schema.

        :return: Variant dataset imported from VCF file(s)
        :rtype: :py:class:`.VariantDataset`

        """

        if generic:
            jvds = self._jhc.importVCFsGeneric(jindexed_seq_args(path), force, force_bgz, joption(header_file),
                                               joption(min_partitions), drop_samples, jset_args(call_fields))
        else:
            jvds = self._jhc.importVCFs(jindexed_seq_args(path), force, force_bgz, joption(header_file),
                                        joption(min_partitions), drop_samples)

        return VariantDataset(self, jvds)
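
A hedged usage sketch of ``generic`` mode described above; the file path and the ``PGT`` call field are illustrative assumptions.

>>> vds = hc.import_vcf('data/example2.vcf.bgz', generic=True, call_fields=['PGT'])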
Example #11
    def import_plink(self, bed, bim, fam, min_partitions=None, delimiter='\\\\s+', missing='NA', quantpheno=False):
        """Import PLINK binary file (BED, BIM, FAM) as variant dataset.

        **Examples**

        Import data from a PLINK binary file:

        >>> vds = hc.import_plink(bed="data/test.bed",
        ...                       bim="data/test.bim",
        ...                       fam="data/test.fam")

        **Notes**

        Only binary SNP-major mode files can be read into Hail. To convert your file from individual-major mode to SNP-major mode, use PLINK to read in your fileset and use the ``--make-bed`` option.

        The centiMorgan position is not currently used in Hail (Column 3 in BIM file).

        The ID (``s``) used by Hail is the individual ID (column 2 in FAM file).

        .. warning::

            No duplicate individual IDs are allowed.

        Chromosome names (Column 1) are automatically converted in the following cases:

          - 23 => "X"
          - 24 => "Y"
          - 25 => "X"
          - 26 => "MT"

        **Annotations**

        :py:meth:`~hail.HailContext.import_plink` adds the following annotations:

         - **va.rsid** (*String*) -- Column 2 in the BIM file.
         - **sa.famID** (*String*) -- Column 1 in the FAM file. Set to missing if ID equals "0".
         - **sa.patID** (*String*) -- Column 3 in the FAM file. Set to missing if ID equals "0".
         - **sa.matID** (*String*) -- Column 4 in the FAM file. Set to missing if ID equals "0".
         - **sa.isFemale** (*Boolean*) -- Column 5 in the FAM file. Set to missing if value equals "-9", "0", or "N/A".
           Set to true if value equals "2". Set to false if value equals "1".
         - **sa.isCase** (*Boolean*) -- Column 6 in the FAM file. Only present if ``quantpheno`` equals False.
           Set to missing if value equals "-9", "0", "N/A", or the value specified by ``missing``.
           Set to true if value equals "2". Set to false if value equals "1".
         - **sa.qPheno** (*Double*) -- Column 6 in the FAM file. Only present if ``quantpheno`` equals True.
           Set to missing if value equals ``missing``.

        :param str bed: PLINK BED file.

        :param str bim: PLINK BIM file.

        :param str fam: PLINK FAM file.

        :param min_partitions: Number of partitions.
        :type min_partitions: int or None

        :param str missing: The string used to denote missing values **only** for the phenotype field. This is in addition to "-9", "0", and "N/A" for case-control phenotypes.

        :param str delimiter: FAM file field delimiter regex.

        :param bool quantpheno: If True, FAM phenotype is interpreted as quantitative.

        :return: Variant dataset imported from PLINK binary file.
        :rtype: :class:`.VariantDataset`
        """

        jvds = self._jhc.importPlink(bed, bim, fam, joption(min_partitions), delimiter, missing, quantpheno)

        return VariantDataset(self, jvds)
Example #12
    def import_bgen(self, path, tolerance=0.2, sample_file=None, min_partitions=None):
        """Import .bgen file(s) as variant dataset.

        **Examples**

        Importing a BGEN file as a VDS (assuming it has already been indexed).

        >>> vds = hc.import_bgen("data/example3.bgen", sample_file="data/example3.sample")

        **Notes**

        Hail supports importing data in the BGEN file format. For more information on the BGEN file format,
        see `here <http://www.well.ox.ac.uk/~gav/bgen_format/bgen_format.html>`__. Note that only v1.1 and v1.2 BGEN files
        are supported at this time. For v1.2 BGEN files, only **unphased** and **diploid** genotype probabilities are allowed and the
        genotype probability blocks must be either compressed with zlib or uncompressed.

        Before importing, ensure that:

          - The sample file has the same number of samples as the BGEN file.
          - No duplicate sample IDs are present.

        To load multiple files at the same time, use :ref:`Hadoop Glob Patterns <sec-hadoop-glob>`.

        .. _gpfilters:

        **Genotype probability (``gp``) representation**:

        The following modifications are made to genotype probabilities in BGEN v1.1 files:

          - Since genotype probabilities are understood to define a probability distribution, :py:meth:`~hail.HailContext.import_bgen` automatically sets to missing those genotypes for which the sum of the probabilities is a distance greater than the ``tolerance`` parameter from 1.0.  The default tolerance is 0.2, so a genotype with sum .79 or 1.21 is filtered out, whereas a genotype with sum .8 or 1.2 remains.

          - :py:meth:`~hail.HailContext.import_bgen` normalizes all probabilities to sum to 1.0. Therefore, an input distribution of (0.98, 0.0, 0.0) will be stored as (1.0, 0.0, 0.0) in Hail.

        **Annotations**

        :py:meth:`~hail.HailContext.import_bgen` adds the following variant annotations:

         - **va.varid** (*String*) -- 2nd column of .gen file if chromosome present, otherwise 1st column.

         - **va.rsid** (*String*) -- 3rd column of .gen file if chromosome present, otherwise 2nd column.

        :param path: .bgen files to import.
        :type path: str or list of str

        :param float tolerance: If the sum of the probabilities for a
            genotype differs from 1.0 by more than the tolerance, set
            the genotype to missing. Only applicable if the BGEN files are v1.1.

        :param sample_file: Sample file.
        :type sample_file: str or None

        :param min_partitions: Number of partitions.
        :type min_partitions: int or None

        :return: Variant dataset imported from .bgen file.
        :rtype: :class:`.VariantDataset`
        """

        jvds = self._jhc.importBgens(jindexed_seq_args(path), joption(sample_file),
                                     tolerance, joption(min_partitions))
        return VariantDataset(self, jvds)
Example #13
File: context.py (project: Fedja/hail)
    def import_bgen(self,
                    path,
                    tolerance=0.2,
                    sample_file=None,
                    npartitions=None):
        """Import .bgen file(s) as variant dataset.

        **Examples**

        Importing a BGEN file as a VDS (assuming it has already been indexed).

        >>> vds = hc.import_bgen("data/example3.bgen", sample_file="data/example3.sample")

        **Notes**

        Hail supports importing data in the BGEN file format. For more information on the BGEN file format,
        see `here <http://www.well.ox.ac.uk/~gav/bgen_format/bgen_format_v1.1.html>`_. Note that only v1.1 BGEN files
        are supported at this time.

        Before importing, ensure that:

        - Files reside in the Hadoop file system.
        - The sample file has the same number of samples as the BGEN file.
        - No duplicate sample IDs are present.

        To load multiple files at the same time, use :ref:`Hadoop Glob Patterns <sec-hadoop-glob>`.

        .. _dosagefilters:

        **Dosage representation**:

        Since dosages are understood as genotype probabilities, :py:meth:`~hail.HailContext.import_bgen` automatically sets to missing those genotypes for which the sum of the dosages is a distance greater than the ``tolerance`` parameter from 1.0.  The default tolerance is 0.2, so a genotype with sum .79 or 1.21 is filtered out, whereas a genotype with sum .8 or 1.2 remains.

        :py:meth:`~hail.HailContext.import_bgen` normalizes all dosages to sum to 1.0. Therefore, an input dosage of (0.98, 0.0, 0.0) will be stored as (1.0, 0.0, 0.0) in Hail.

        Even when the dosages sum to 1.0, Hail may store slightly different values than the original GEN file (maximum observed difference is 3E-4).

        **Annotations**

        :py:meth:`~hail.HailContext.import_bgen` adds the following variant annotations:

         - **va.varid** (*String*) -- 2nd column of .gen file if chromosome present, otherwise 1st column.

         - **va.rsid** (*String*) -- 3rd column of .gen file if chromosome present, otherwise 2nd column.

        :param path: .bgen files to import.
        :type path: str or list of str

        :param float tolerance: If the sum of the dosages for a
            genotype differs from 1.0 by more than the tolerance, set
            the genotype to missing.

        :param sample_file: Sample file.
        :type sample_file: str or None

        :param npartitions: Number of partitions.
        :type npartitions: int or None

        :return: Variant dataset imported from .bgen file.
        :rtype: :class:`.VariantDataset`
        """

        jvds = self._jhc.importBgens(jindexed_seq_args(path),
                                     joption(sample_file), tolerance,
                                     joption(npartitions))
        return VariantDataset(self, jvds)
Example #14
    def import_vcf(self,
                   path,
                   force=False,
                   force_bgz=False,
                   header_file=None,
                   npartitions=None,
                   sites_only=False,
                   store_gq=False,
                   pp_as_pl=False,
                   skip_bad_ad=False):
        """Import VCF file(s) as :py:class:`.VariantDataset`

        **Examples**

        >>> vds = hc.import_vcf('data/example2.vcf.bgz')

        **Notes**

        Hail is designed to be maximally compatible with files in the `VCF v4.2 spec <https://samtools.github.io/hts-specs/VCFv4.2.pdf>`_.

        :py:meth:`~hail.HailContext.import_vcf` takes a list of VCF files to load. All files must have the same header and the same set of samples in the same order
        (e.g., a dataset split by chromosome). Files can be specified as :ref:`Hadoop glob patterns <sec-hadoop-glob>`.

        Ensure that the VCF file is correctly prepared for import: VCFs should either be uncompressed (*.vcf*) or block compressed
        (*.vcf.bgz*).  If you have a large compressed VCF that ends in *.vcf.gz*, it is likely that the file is actually block-compressed,
        and you should rename the file to ".vcf.bgz" accordingly. If you actually have a standard gzipped file, it is possible to import
        it to Hail using the ``force`` optional parameter. However, this is not recommended -- all parsing will have to take place on one node because
        gzip decompression is not parallelizable. In this case, import could take significantly longer.

        Hail makes certain assumptions about the genotype fields, see :class:`Representation <hail.representation.Genotype>`. On import, Hail filters
        (sets to no-call) any genotype that violates these assumptions. Hail interprets the format fields: GT, AD, OD, DP, GQ, PL; all others are
        silently dropped.

        :py:meth:`~hail.HailContext.import_vcf` does not perform deduplication - if the provided VCF(s) contain multiple records with the same chrom, pos, ref, alt, all
        these records will be imported and will not be collapsed into a single variant.

        Since Hail's genotype representation does not yet support ploidy other than 2,
        this method imports haploid genotypes as diploid. Hail fills in missing indices
        in PL / PP arrays with 1000 to support the standard VCF / VDS genotype schema.

        Below are two example haploid genotypes and diploid equivalents that Hail sees.

        .. code-block:: text

            Haploid:     1:0,6:7:70:70,0
            Imported as: 1/1:0,6:7:70:70,1000,0

            Haploid:     2:0,0,9:9:24:24,40,0
            Imported as: 2/2:0,0,9:9:24:24,1000,40,1000:1000:0


        **Annotations**

        - **va.pass** (*Boolean*) -- true if the variant contains `PASS` in the filter field (false if `.` or other)
        - **va.filters** (*Set[String]*) -- set containing the list of filters applied to a variant. Accessible using `va.filters.contains("VQSRTranche99.5...")`, for example
        - **va.rsid** (*String*) -- rsid of the variant, if it has one ("." otherwise)
        - **va.qual** (*Double*) -- the number in the qual field
        - **va.info** (*T*) -- matches (with proper capitalization) any defined info field. Data types match the type specified in the vcf header, and if the `Number` is "A", "R", or "G", the result will be stored in an array (accessed with array\[index\]).

        :param path: VCF file(s) to read.
        :type path: str or list of str

        :param bool force: If True, load .gz files serially.

        :param bool force_bgz: If True, load .gz files as blocked gzip files (BGZF)

        :param header_file: File to load VCF header from.  If not specified, the first file in path is used.
        :type header_file: str or None

        :param npartitions: Number of partitions.
        :type npartitions: int or None

        :param bool sites_only: If True, create sites-only
            VariantDataset.  Don't load sample ids, sample annotations
            or genotypes.

        :param bool store_gq: If True, store GQ FORMAT field instead of computing from PL.

        :param bool pp_as_pl: If True, store PP FORMAT field as PL.  EXPERIMENTAL.

        :param bool skip_bad_ad: If True, set AD FORMAT field with
            wrong number of elements to missing, rather than setting
            the entire genotype to missing.

        :return: A dataset imported from the VCF file
        :rtype: :py:class:`.VariantDataset`
        """

        jvds = self._jhc.importVCFs(jindexed_seq_args(path), force, force_bgz,
                                    joption(header_file), joption(npartitions),
                                    sites_only, store_gq, pp_as_pl,
                                    skip_bad_ad)

        return VariantDataset(self, jvds)