示例#1
0
    def from_fasta_file(cls, name, fasta_file, index_file,
                        x_contigs=[], y_contigs=[], mt_contigs=[], par=[]):
        """Build a reference genome from a FASTA file and its index.

        The backend creates the reference from the files; the resulting
        configuration is then read back and wrapped in a
        :class:`.ReferenceGenome` that is marked as having a sequence.

        Parameters
        ----------
        name : :obj:`str`
            Name for new reference genome.
        fasta_file : :obj:`str`
            Path to FASTA file. Can be compressed (GZIP) or uncompressed.
        index_file : :obj:`str`
            Path to FASTA index file. Must be uncompressed.
        x_contigs : :obj:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as X chromosomes.
        y_contigs : :obj:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as Y chromosomes.
        mt_contigs : :obj:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as mitochondrial DNA.
        par : :obj:`list` of :obj:`tuple` of (str, int, int)
            List of tuples with (contig, start, end). Each tuple is handed
            to the backend as the string ``"contig:start-end"``.

        Returns
        -------
        :class:`.ReferenceGenome`
        """
        # The backend takes PAR regions as "contig:start-end" strings.
        par_strings = [f"{contig}:{start}-{end}" for contig, start, end in par]
        Env.backend().from_fasta_file(
            name, fasta_file, index_file,
            x_contigs, y_contigs, mt_contigs, par_strings)

        # Read the configuration back from the backend; `_builtin=True` is
        # passed so the wrapper is built without adding the reference again.
        config = Env.backend().get_reference(name)
        genome = ReferenceGenome._from_config(config, _builtin=True)
        genome._has_sequence = True
        return genome
示例#2
0
def write_block_matrices(bms: List[BlockMatrix],
                         path_prefix: str,
                         overwrite: bool = False,
                         force_row_major: bool = False,
                         stage_locally: bool = False):
    """Write a sequence of block matrices to disk in one backend execution.

    The on-disk format is the same as the one used by ``BlockMatrix.write``.

    :param bms: :obj:`list` of :class:`BlockMatrix`
        Block matrices to write to disk.
    :param path_prefix: :obj:`str`
        Prefix of path to write the block matrices to.
    :param overwrite: :obj:`bool`
        If true, overwrite any files with the same name as the block matrices being generated.
    :param force_row_major: :obj:`bool`
        If ``True``, transform blocks in column-major format
        to row-major format before writing.
        If ``False``, write blocks in their current format.
    :param stage_locally: :obj:`bool`
        If ``True``, major output will be written to temporary local storage
        before being copied to the final output location.
    """
    multi_writer = BlockMatrixNativeMultiWriter(
        path_prefix, overwrite, force_row_major, stage_locally)
    backend = Env.backend()
    # One IR node per input matrix, all written by the same multi-writer.
    matrix_irs = [bm._bmir for bm in bms]
    backend.execute(BlockMatrixMultiWrite(matrix_irs, multi_writer))
示例#3
0
 def __del__(self):
     """Best-effort cleanup: ask the backend to remove the IR vector identified by ``self.jid``.

     Any ``Exception`` raised by the removal call is deliberately swallowed.
     """
     try:
         Env.backend()._jhc.pyRemoveIrVector(self.jid)
     # there is only so much we can do if the attempt to remove the unused IR fails,
     # especially since this will often get called during interpreter shutdown.
     except Exception:
         pass
示例#4
0
def eval_timed(expression):
    """Evaluate a Hail expression and report how long each evaluation stage took.

    Expressions that refer to a source (a table or matrix table) are
    evaluated by selecting them as a temporary global field on that source
    and indexing the globals; expressions without a source are executed
    directly.

    Parameters
    ----------
    expression : :class:`.Expression`
        Any expression, or a Python value that can be implicitly interpreted as an expression.

    Returns
    -------
    (Any, dict)
        Result of evaluating `expression` and a dictionary of the timings

    Raises
    ------
    ExpressionException
        If a source-free expression's declared type differs from the type
        of its IR.
    """
    from hail.utils.java import Env

    analyze('eval_timed', expression, Indices(expression._indices.source))

    source = expression._indices.source
    if source is not None:
        # Route the expression through the source's globals under a unique name.
        uid = Env.get_uid()
        with_global = source.select_globals(**{uid: expression})
        ir = with_global.index_globals()[uid]._ir
        return Env.backend().execute(ir, True)

    # Source-free expression: sanity-check the types, then execute the IR directly.
    ir_type = expression._ir.typ
    expression_type = expression.dtype
    if ir_type != expression_type:
        raise ExpressionException(
            f'Expression type and IR type differed: \n{ir_type}\n vs \n{expression_type}'
        )
    return Env.backend().execute(expression._ir, True)
示例#5
0
    def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[], mt_contigs=[], par=[], _builtin=False):
        """Create a reference genome and register it under `name`.

        The instance is stored in ``ReferenceGenome._references`` and, unless
        `_builtin` is true, its configuration is added to the backend.
        Reference-genome IR functions are registered for `name` afterwards.

        Parameters
        ----------
        name : name of the reference genome.
        contigs : contig name or list of contig names (wrapped to a list).
        lengths : mapping from contig name to length; the ``'contigs'``
            entry of the configuration is built from this mapping.
        x_contigs, y_contigs, mt_contigs : contig name or list of names
            (wrapped to a list) stored under ``'xContigs'``, ``'yContigs'``
            and ``'mtContigs'``.
        par : iterable of ``(contig, start, end)`` tuples.
        _builtin : if true, the configuration is not added to the backend.
        """
        super(ReferenceGenome, self).__init__()

        contigs, x_contigs, y_contigs, mt_contigs = (
            wrap_to_list(contigs),
            wrap_to_list(x_contigs),
            wrap_to_list(y_contigs),
            wrap_to_list(mt_contigs),
        )

        contig_entries = [{'name': contig, 'length': length}
                          for contig, length in lengths.items()]
        par_entries = [
            {'start': {'contig': contig, 'position': start},
             'end': {'contig': contig, 'position': end}}
            for contig, start, end in par
        ]
        self._config = {
            'name': name,
            'contigs': contig_entries,
            'xContigs': x_contigs,
            'yContigs': y_contigs,
            'mtContigs': mt_contigs,
            'par': par_entries,
        }

        self._contigs = contigs
        self._lengths = lengths
        self._par_tuple = par
        # Built after the contig attributes are set because each locus is
        # constructed against this (partially initialised) reference.
        self._par = [
            hl.Interval(hl.Locus(contig, start, self), hl.Locus(contig, end, self))
            for contig, start, end in par
        ]

        ReferenceGenome._references[name] = self

        if not _builtin:
            Env.backend().add_reference(self._config)

        hl.ir.register_reference_genome_functions(name)

        self._has_sequence = False
        self._liftovers = set()
示例#6
0
def eval_timed(expression):
    """Evaluate a Hail expression, returning the result together with stage timings.

    Parameters
    ----------
    expression : :class:`.Expression`
        Any expression, or a Python value that can be implicitly interpreted as an expression.

    Returns
    -------
    (Any, dict)
        Result of evaluating `expression` and a dictionary of the timings

    Raises
    ------
    ExpressionException
        If the expression has no source and its type differs from the type
        of its IR.
    """
    from hail.utils.java import Env

    analyze('eval_timed', expression, Indices(expression._indices.source))

    source = expression._indices.source
    if source is None:
        # No table/matrix table involved: the expression's own IR is executed.
        ir_type = expression._ir.typ
        expression_type = expression.dtype
        if ir_type != expression_type:
            raise ExpressionException(f'Expression type and IR type differed: \n{ir_type}\n vs \n{expression_type}')
        ir = expression._ir
    else:
        # Evaluate through the source's globals under a freshly generated field name.
        uid = Env.get_uid()
        globals_struct = source.select_globals(**{uid: expression}).index_globals()
        ir = globals_struct[uid]._ir

    return Env.backend().execute(ir, True)
示例#7
0
    def __init__(self,
                 name,
                 contigs,
                 lengths,
                 x_contigs=[],
                 y_contigs=[],
                 mt_contigs=[],
                 par=[],
                 _builtin=False):
        """Create a reference genome and register it under `name`.

        The instance is stored in ``ReferenceGenome._references`` and, unless
        `_builtin` is true, its configuration is added to the backend.
        Reference-genome IR functions are registered for `name` afterwards.
        The new reference starts with no sequence files and no liftovers.

        Parameters
        ----------
        name : name of the reference genome.
        contigs : contig name or list of contig names (wrapped to a list).
        lengths : mapping from contig name to length; the ``'contigs'``
            entry of the configuration is built from this mapping.
        x_contigs, y_contigs, mt_contigs : contig name or list of names
            (wrapped to a list) stored under ``'xContigs'``, ``'yContigs'``
            and ``'mtContigs'``.
        par : iterable of ``(contig, start, end)`` tuples.
        _builtin : if true, the configuration is not added to the backend.
        """
        super(ReferenceGenome, self).__init__()

        contigs = wrap_to_list(contigs)
        x_contigs = wrap_to_list(x_contigs)
        y_contigs = wrap_to_list(y_contigs)
        mt_contigs = wrap_to_list(mt_contigs)

        contig_entries = [{'name': contig, 'length': length}
                          for contig, length in lengths.items()]
        par_entries = [
            {
                'start': {'contig': contig, 'position': start},
                'end': {'contig': contig, 'position': end},
            }
            for contig, start, end in par
        ]
        self._config = {
            'name': name,
            'contigs': contig_entries,
            'xContigs': x_contigs,
            'yContigs': y_contigs,
            'mtContigs': mt_contigs,
            'par': par_entries,
        }

        self._contigs = contigs
        self._lengths = lengths
        self._par_tuple = par
        # Built after the contig attributes are set because each locus is
        # constructed against this (partially initialised) reference.
        self._par = [
            hl.Interval(hl.Locus(contig, start, self), hl.Locus(contig, end, self))
            for contig, start, end in par
        ]
        self._global_positions = None

        ReferenceGenome._references[name] = self

        if not _builtin:
            Env.backend().add_reference(self._config)

        hl.ir.register_reference_genome_functions(name)

        self._sequence_files = None
        self._liftovers = {}
示例#8
0
    def remove_sequence(self):
        """Remove the reference sequence.

        Asks the backend to drop the sequence registered for this reference
        and then clears the local "has sequence" flag.

        Returns
        -------
        None
        """
        # Call the backend first so that the local flag is only cleared once
        # the removal has actually succeeded (mirrors `add_sequence`, which
        # sets the flag after its backend call).
        Env.backend().remove_sequence(self.name)
        self._has_sequence = False
示例#9
0
def export_block_matrices(bms: List[BlockMatrix],
                          prefix: str,
                          overwrite: bool = False,
                          delimiter: str = '\t',
                          header: Optional[str] = None,
                          add_index: bool = False):
    """Export a sequence of block matrices as text in one backend execution.

    A ``BlockMatrixTextMultiWriter`` is built from the options below and a
    ``BlockMatrixMultiWrite`` over the IR of every matrix in `bms` is executed.

    :param bms: block matrices to export.
    :param prefix: forwarded to the writer (presumably the output path
        prefix -- confirm against ``BlockMatrixTextMultiWriter``).
    :param overwrite: forwarded to the writer.
    :param delimiter: forwarded to the writer; defaults to a tab character.
    :param header: forwarded to the writer; ``None`` by default.
    :param add_index: forwarded to the writer.
    """
    text_writer = BlockMatrixTextMultiWriter(
        prefix, overwrite, delimiter, header, add_index)
    backend = Env.backend()
    matrix_irs = [bm._bmir for bm in bms]
    backend.execute(BlockMatrixMultiWrite(matrix_irs, text_writer))
示例#10
0
    def remove_liftover(self, dest_reference_genome):
        """Remove liftover to `dest_reference_genome`.

        Does nothing when no liftover to that reference is recorded locally;
        otherwise the local record is dropped and the backend is asked to
        remove the liftover as well.

        Parameters
        ----------
        dest_reference_genome : :obj:`str` or :class:`.ReferenceGenome`
        """
        dest_name = dest_reference_genome.name
        if dest_name not in self._liftovers:
            return

        self._liftovers.remove(dest_name)
        Env.backend().remove_liftover(self.name, dest_name)
示例#11
0
    def add_sequence(self, fasta_file, index_file=None):
        """Load the reference sequence from a FASTA file.

        Examples
        --------
        Access the GRCh37 reference genome using :func:`.get_reference`:

        >>> rg = hl.get_reference('GRCh37') # doctest: +SKIP

        Add a sequence file:

        >>> rg.add_sequence('gs://hail-common/references/human_g1k_v37.fasta.gz',
        ...                 'gs://hail-common/references/human_g1k_v37.fasta.fai') # doctest: +SKIP

        Add a sequence file with the default index location:

        >>> rg.add_sequence('gs://hail-common/references/human_g1k_v37.fasta.gz') # doctest: +SKIP


        Notes
        -----
        This method can only be run once per reference genome. Use
        :meth:`~has_sequence` to test whether a sequence is loaded.

        FASTA and index files are hosted on google cloud for some of Hail's built-in
        references:

        **GRCh37**

        - FASTA file: ``gs://hail-common/references/human_g1k_v37.fasta.gz``
        - Index file: ``gs://hail-common/references/human_g1k_v37.fasta.fai``

        **GRCh38**

        - FASTA file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.gz``
        - Index file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.fai``

        Public download links are available
        `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__.

        Parameters
        ----------
        fasta_file : :obj:`str`
            Path to FASTA file. Can be compressed (GZIP) or uncompressed.
        index_file : :obj:`None` or :obj:`str`
            Path to FASTA index file. Must be uncompressed. If `None`, the
            final extension of `fasta_file` is replaced with ``fai`` (for
            example ``x.fasta.gz`` becomes ``x.fasta.fai``).
        """
        if index_file is None:
            # Raw string so that `\.` reaches the regex engine unchanged;
            # in a plain literal it is an invalid escape sequence.
            # Only the last ".ext" component is swapped for ".fai".
            index_file = re.sub(r'\.[^.]*$', '.fai', fasta_file)
        Env.backend().add_sequence(self.name, fasta_file, index_file)
        # Flag is set only after the backend call has returned.
        self._has_sequence = True
示例#12
0
    def add_liftover(self, chain_file, dest_reference_genome):
        """Register a chain file for liftover.

        Examples
        --------
        Access GRCh37 and GRCh38 using :func:`.get_reference`:

        >>> rg37 = hl.get_reference('GRCh37') # doctest: +SKIP
        >>> rg38 = hl.get_reference('GRCh38') # doctest: +SKIP

        Add a chain file from 37 to 38:

        >>> rg37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38) # doctest: +SKIP

        Notes
        -----
        A liftover can be registered only once per destination reference
        genome. Use :meth:`~has_liftover` to test whether a chain file has
        been registered.

        The chain file format is described
        `here <https://genome.ucsc.edu/goldenpath/help/chain.html>`__.

        Chain files are hosted on google cloud for some of Hail's built-in
        references:

        **GRCh37 to GRCh38**
        gs://hail-common/references/grch37_to_grch38.over.chain.gz

        **GRCh38 to GRCh37**
        gs://hail-common/references/grch38_to_grch37.over.chain.gz

        Public download links are available
        `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__.

        Parameters
        ----------
        chain_file : :obj:`str`
            Path to chain file. Can be compressed (GZIP) or uncompressed.
        dest_reference_genome : :obj:`str` or :class:`.ReferenceGenome`
            Reference genome to convert to.

        Raises
        ------
        KeyError
            If a liftover to `dest_reference_genome` is already registered
            on this reference. Nothing is sent to the backend in that case.
        """
        dest_name = dest_reference_genome.name

        # Check the local bookkeeping first so that a duplicate request is
        # rejected before anything is registered with the backend.
        if dest_name in self._liftovers:
            raise KeyError(
                f"Liftover already exists from {self.name} to {dest_name}."
            )

        Env.backend().add_liftover(self.name, chain_file, dest_name)
        # Record the chain file only after the backend accepted it.
        self._liftovers[dest_name] = chain_file
        hl.ir.register_liftover_functions(self.name, dest_name)
示例#13
0
文件: table_ir.py 项目: jigold/hail
 def _compute_type(self):
     name = self.config['name']
     if name == 'TableFilterPartitions' or name == 'TableFilterIntervals':
         self._type = self.child.typ
     else:
         assert name in ('VEP', 'Nirvana'), name
         self._type = Env.backend().table_type(self)
示例#14
0
def eval_typed(expression):
    """Evaluate a Hail expression, returning both the value and its Hail type.

    This method is extremely useful for learning about Hail expressions and understanding
    how to compose them.

    The expression must have no indices, but can refer to the globals
    of a :class:`.hail.Table` or :class:`.hail.MatrixTable`.

    Examples
    --------
    Evaluate a conditional:

    >>> x = 6
    >>> hl.eval_typed(hl.cond(x % 2 == 0, 'Even', 'Odd'))
    ('Even', dtype('str'))

    Parameters
    ----------
    expression : :class:`.Expression`
        Any expression, or a Python value that can be implicitly interpreted as an expression.

    Returns
    -------
    (any, :class:`.HailType`)
        Result of evaluating `expression`, and its type.

    """
    analyze('eval_typed', expression, Indices(expression._indices.source))

    if expression._indices.source is not None:
        # The expression refers to a source's globals: collect and take the single value.
        value = expression.collect()[0]
    else:
        # Source-free expression: execute its IR directly on the backend.
        value = Env.backend().execute(expression._ir)
    return value, expression.dtype
示例#15
0
def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite):
    """Compute LD scores from an adjusted r2 block matrix and write the annotated table.

    `r2_adj` is restricted to the locus windows of `ht` (built from
    ``ht.locus`` and `radius`), its row and column sums are combined into a
    per-variant score, and the scores are joined back onto `ht` through its
    ``new_idx`` field. Rows with a defined ``ld_score`` are written to
    `out_name`; `overwrite` is forwarded to the table write.
    """
    windows = hl.linalg.utils.locus_windows(ht.locus, radius, _localize=False)

    # Lifted directly from https://github.com/hail-is/hail/blob/555e02d6c792263db2c3ed97db8002b489e2dacb/hail/python/hail/methods/statgen.py#L2595
    # for the time being, until efficient BlockMatrix filtering gets an easier interface
    # This is required, as the squaring/multiplication densifies, so this re-sparsifies.
    windows_jir = Env.backend()._to_java_ir(windows._ir)
    r2_adj = BlockMatrix._from_java(
        r2_adj._jbm.filterRowIntervalsIR(windows_jir, False))

    # Score = sum over the column + sum over the row + 1.
    column_sums = r2_adj.sum(axis=0).T
    row_sums = r2_adj.sum(axis=1)
    ld_scores_bm = column_sums + row_sums + 1

    # Round-trip through temporary files: block matrix -> TSV -> table.
    bm_tmp_path = new_temp_file()
    tsv_tmp_path = new_temp_file()
    ld_scores_bm.write(bm_tmp_path, force_row_major=True)
    BlockMatrix.export(bm_tmp_path, tsv_tmp_path)

    scores_ht = (hl.import_table(tsv_tmp_path, no_header=True, impute=True)
                 .add_index()
                 .rename({'f0': 'ld_score'})
                 .key_by('idx'))

    annotated = ht.annotate(**scores_ht[ht.new_idx]).select_globals()
    annotated.filter(hl.is_defined(annotated.ld_score)).write(out_name, overwrite)
示例#16
0
文件: context.py 项目: tpoterba/hail
def spark_context():
    """Returns the active Spark context.

    The value is read from the ``sc`` attribute of the current backend
    (``Env.backend()``).
    NOTE(review): presumably only Spark-based backends expose ``sc`` -- confirm.

    Returns
    -------
    :class:`pyspark.SparkContext`
    """
    return Env.backend().sc
示例#17
0
    def add_liftover(self, chain_file, dest_reference_genome):
        """Register a chain file for liftover.

        The chain file is registered with the backend, the destination's name
        is recorded in this reference's set of liftovers, and the liftover IR
        functions for the (source, destination) pair are registered.

        Examples
        --------
        Access GRCh37 and GRCh38 using :func:`.get_reference`:

        >>> rg37 = hl.get_reference('GRCh37') # doctest: +SKIP
        >>> rg38 = hl.get_reference('GRCh38') # doctest: +SKIP

        Add a chain file from 37 to 38:

        >>> rg37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38) # doctest: +SKIP

        Notes
        -----
        This method can only be run once per reference genome. Use
        :meth:`~has_liftover` to test whether a chain file has been registered.

        The chain file format is described
        `here <https://genome.ucsc.edu/goldenpath/help/chain.html>`__.

        Chain files are hosted on google cloud for some of Hail's built-in
        references:

        **GRCh37 to GRCh38**
        gs://hail-common/references/grch37_to_grch38.over.chain.gz

        **GRCh38 to GRCh37**
        gs://hail-common/references/grch38_to_grch37.over.chain.gz

        Public download links are available
        `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__.

        Parameters
        ----------
        chain_file : :obj:`str`
            Path to chain file. Can be compressed (GZIP) or uncompressed.
        dest_reference_genome : :obj:`str` or :class:`.ReferenceGenome`
            Reference genome to convert to.
            NOTE(review): the body reads ``dest_reference_genome.name``, so a
            plain string presumably has to be converted before this point
            (e.g. by a decorator not shown here) -- confirm.
        """

        Env.backend().add_liftover(self.name, chain_file, dest_reference_genome.name)
        self._liftovers.add(dest_reference_genome.name)
        hl.ir.register_liftover_functions(self.name, dest_reference_genome.name)
示例#18
0
def define_function(f, *param_types, _name=None, type_args=()):
    """Register `f` as a backend IR function and return a callable wrapper for it.

    `f` is called once with one reference expression per entry of
    `param_types` to build the function body; the body's dtype becomes the
    return type. The function is registered with the backend under `_name`,
    or under a freshly generated unique name when `_name` is ``None``.

    Returns a ``Function`` wrapping a type-checked callable that builds an
    ``Apply`` node for the registered function.
    """
    mname = Env.get_uid() if _name is None else _name
    param_names = [Env.get_uid(mname) for _ in param_types]

    # Build the body by applying the user function to symbolic parameter references.
    param_refs = [construct_expr(Ref(param_name), param_type)
                  for param_name, param_type in zip(param_names, param_types)]
    body = f(*param_refs)
    ret_type = body.dtype

    Env.backend().register_ir_function(
        mname, type_args, param_names, param_types, ret_type, body)

    # The wrapper intentionally reuses the name `f`, as the original did.
    @typecheck(args=expr_any)
    def f(*args):
        indices, aggregations = unify_all(*args)
        arg_irs = [arg._ir for arg in args]
        call_ir = Apply(mname, ret_type, *arg_irs, type_args=type_args)
        return construct_expr(call_ir, ret_type, indices, aggregations)

    return Function(f, param_types, ret_type, mname, type_args)
示例#19
0
def generate_ld_scores_from_ld_matrix(pop_data,
                                      data_type,
                                      min_frequency=0.01,
                                      call_rate_cutoff=0.8,
                                      adj: bool = False,
                                      radius: int = 1000000,
                                      overwrite=False):
    """Compute and write LD scores for each population found in `pop_data`.

    `pop_data` is converted with ``dict(...)``; each value maps a population
    name to a count ``n``. The populations ``nfe``, ``fin`` and ``asj`` are
    skipped. For every other population the LD index table is filtered by
    allele frequency and call rate, the stored LD matrix is squared and
    adjusted, restricted to locus windows of size `radius`, reduced to
    per-variant scores and written to ``ld_scores_path(data_type, pop, adj)``.
    """
    # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR
    # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 mins to compute LD scores per population (~$150 total)
    skipped_pops = ('nfe', 'fin', 'asj')
    for pops in dict(pop_data).values():
        for pop, n in pops.items():
            if pop in skipped_pops:
                continue

            ht = hl.read_table(ld_index_path(data_type, pop, adj=adj))
            frequency_ok = ((ht.pop_freq.AF >= min_frequency)
                            & (ht.pop_freq.AF <= 1 - min_frequency))
            call_rate_ok = ht.pop_freq.AN / n >= 2 * call_rate_cutoff
            ht = ht.filter(frequency_ok & call_rate_ok).add_index(name='new_idx')

            indices = ht.idx.collect()

            matrix_path = ld_matrix_path(data_type,
                                         pop,
                                         min_frequency >= COMMON_FREQ,
                                         adj=adj)
            r2 = BlockMatrix.read(matrix_path)
            r2 = r2.filter(indices, indices)**2
            r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

            windows = hl.linalg.utils.locus_windows(ht.locus,
                                                    radius,
                                                    _localize=False)

            # Lifted directly from https://github.com/hail-is/hail/blob/555e02d6c792263db2c3ed97db8002b489e2dacb/hail/python/hail/methods/statgen.py#L2595
            # for the time being, until efficient BlockMatrix filtering gets an easier interface
            windows_jir = Env.backend()._to_java_ir(windows._ir)
            r2_adj = BlockMatrix._from_java(
                r2_adj._jbm.filterRowIntervalsIR(windows_jir, False))

            # Score = sum over the column + sum over the row + 1.
            column_sums = r2_adj.sum(axis=0).T
            row_sums = r2_adj.sum(axis=1)
            ld_scores_bm = column_sums + row_sums + 1

            # Round-trip through temporary files: block matrix -> TSV -> table.
            bm_tmp_path = new_temp_file()
            tsv_tmp_path = new_temp_file()
            ld_scores_bm.write(bm_tmp_path, force_row_major=True)
            BlockMatrix.export(bm_tmp_path, tsv_tmp_path)

            scores_ht = (hl.import_table(tsv_tmp_path, no_header=True, impute=True)
                         .add_index()
                         .rename({'f0': 'ld_score'})
                         .key_by('idx'))

            ht = ht.annotate(**scores_ht[ht.new_idx]).select_globals()
            defined_scores = ht.filter(hl.is_defined(ht.ld_score))
            defined_scores.write(ld_scores_path(data_type, pop, adj), overwrite)
示例#20
0
def write_variant_datasets(vdss,
                           paths,
                           *,
                           overwrite=False,
                           stage_locally=False,
                           codec_spec=None):
    """Write each dataset in `vdss` to its corresponding path in `paths`.

    Two multi-writes are executed: first the ``reference_data`` component of
    every dataset goes to ``<path>/reference_data``, then the
    ``variant_data`` component goes to ``<path>/variant_data``.
    `overwrite`, `stage_locally` and `codec_spec` are forwarded to both writers.
    """
    reference_paths = [f"{p}/reference_data" for p in paths]
    ref_writer = ir.MatrixNativeMultiWriter(
        reference_paths, overwrite, stage_locally, codec_spec)
    variant_paths = [f"{p}/variant_data" for p in paths]
    var_writer = ir.MatrixNativeMultiWriter(
        variant_paths, overwrite, stage_locally, codec_spec)

    backend = Env.backend()
    reference_irs = [vds.reference_data._mir for vds in vdss]
    backend.execute(ir.MatrixMultiWrite(reference_irs, ref_writer))

    backend = Env.backend()
    variant_irs = [vds.variant_data._mir for vds in vdss]
    backend.execute(ir.MatrixMultiWrite(variant_irs, var_writer))
示例#21
0
文件: context.py 项目: saponas/hail
def debug_info():
    """Collect basic diagnostic information about the running Hail installation.

    Returns a dict with three keys: ``'spark_conf'`` (all Spark configuration
    entries when the current backend is a ``SparkBackend``, else ``None``),
    ``'hail_jar_path'`` (path of the bundled ``hail-all-spark.jar`` resource
    when it exists, else ``None``) and ``'version'``.
    """
    from hail.backend.spark_backend import SparkBackend

    jar_resource = "hail-all-spark.jar"
    if pkg_resources.resource_exists(__name__, jar_resource):
        hail_jar_path = pkg_resources.resource_filename(__name__, jar_resource)
    else:
        hail_jar_path = None

    if isinstance(Env.backend(), SparkBackend):
        spark_conf = spark_context()._conf.getAll()
    else:
        spark_conf = None

    info = {'spark_conf': spark_conf, 'hail_jar_path': hail_jar_path}
    info['version'] = version()
    return info
def _eval_many(*expressions, timed=False, name='_eval_many'):
    """Evaluate several expressions with one ``execute_many`` call on the backend.

    Each expression is analyzed under `name` and turned into an IR: a
    source-free expression contributes its own IR (after checking that its
    type matches the IR's type), while an expression with a source is
    selected as a temporary global on that source and read back from the
    indexed globals. `timed` is forwarded to the backend.

    Raises
    ------
    ExpressionException
        If a source-free expression's type differs from the type of its IR.
    """
    from hail.utils.java import Env

    irs = []
    for expression in expressions:
        analyze(name, expression, Indices(expression._indices.source))

        source = expression._indices.source
        if source is not None:
            # Evaluate through the source's globals under a unique field name.
            uid = Env.get_uid()
            with_global = source.select_globals(**{uid: expression})
            irs.append(with_global.index_globals()[uid]._ir)
            continue

        ir_type = expression._ir.typ
        expression_type = expression.dtype
        if ir_type != expression_type:
            raise ExpressionException(
                f'Expression type and IR type differed: \n{ir_type}\n vs \n{expression_type}'
            )
        irs.append(expression._ir)

    return Env.backend().execute_many(*irs, timed=timed)
示例#23
0
 def _compute_type(self):
     """Ask the backend for this node's block-matrix type and store it in ``self._type``."""
     self._type = Env.backend().blockmatrix_type(self)
示例#24
0
文件: table_ir.py 项目: jigold/hail
 def _compute_type(self):
     """Ask the backend for this node's table type and store it in ``self._type``."""
     self._type = Env.backend().table_type(self)