def from_fasta_file(cls, name, fasta_file, index_file, x_contigs=[], y_contigs=[], mt_contigs=[], par=[]):
    """Build a reference genome from a FASTA file and its index.

    Parameters
    ----------
    name : :obj:`str`
        Name to give the new reference genome.
    fasta_file : :obj:`str`
        Path to the FASTA file. May be GZIP-compressed or uncompressed.
    index_file : :obj:`str`
        Path to the FASTA index file. Must be uncompressed.
    x_contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as X chromosomes.
    y_contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as Y chromosomes.
    mt_contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as mitochondrial DNA.
    par : :obj:`list` of :obj:`tuple` of (str, int, int)
        Regions given as (contig, start, end) tuples.

    Returns
    -------
    :class:`.ReferenceGenome`
    """
    # The backend takes each region as a single "contig:start-end" string.
    par_strings = []
    for contig, start, end in par:
        par_strings.append("{}:{}-{}".format(contig, start, end))

    backend = Env.backend()
    backend.from_fasta_file(name, fasta_file, index_file, x_contigs, y_contigs, mt_contigs, par_strings)

    # The backend now knows the reference under `name`; wrap its config on the
    # Python side and mark the sequence as already loaded.
    config = backend.get_reference(name)
    rg = ReferenceGenome._from_config(config, _builtin=True)
    rg._has_sequence = True
    return rg
def write_block_matrices(bms: List[BlockMatrix],
                         path_prefix: str,
                         overwrite: bool = False,
                         force_row_major: bool = False,
                         stage_locally: bool = False):
    """Write several block matrices to disk in the format used by BlockMatrix.write.

    :param bms: :obj:`list` of :class:`BlockMatrix`
        Block matrices to write to disk.
    :param path_prefix: :obj:`str`
        Prefix of the paths the block matrices are written to.
    :param overwrite: :obj:`bool`
        If ``True``, overwrite any existing files with the same names as the
        block matrices being written.
    :param force_row_major: :obj:`bool`
        If ``True``, convert column-major blocks to row-major before writing.
        If ``False``, write blocks in their current format.
    :param stage_locally: :obj:`bool`
        If ``True``, output is first written to temporary local storage and
        then copied to its final destination.
    """
    # Lower every matrix to its IR node and submit them as one multi-write.
    block_matrix_irs = [bm._bmir for bm in bms]
    writer = BlockMatrixNativeMultiWriter(path_prefix, overwrite, force_row_major, stage_locally)
    multi_write = BlockMatrixMultiWrite(block_matrix_irs, writer)
    Env.backend().execute(multi_write)
def __del__(self):
    """Best-effort removal of the backend IR vector identified by ``self.jid``."""
    try:
        jhc = Env.backend()._jhc
        jhc.pyRemoveIrVector(self.jid)
    except Exception:
        # Deliberately swallowed: there is little that can be done if removing
        # the unused IR fails, and this finalizer is often invoked during
        # interpreter shutdown.
        pass
def eval_timed(expression):
    """Evaluate a Hail expression and report how long each evaluation stage took.

    Parameters
    ----------
    expression : :class:`.Expression`
        Any expression, or a Python value that can be implicitly interpreted
        as an expression.

    Returns
    -------
    (Any, dict)
        Result of evaluating `expression` and a dictionary of the timings.
    """
    from hail.utils.java import Env

    source = expression._indices.source
    analyze('eval_timed', expression, Indices(source))

    if source is not None:
        # The expression refers to a table's globals: attach it as a temporary
        # global field under a unique name and read its IR back from there.
        uid = Env.get_uid()
        with_global = source.select_globals(**{uid: expression})
        ir = with_global.index_globals()[uid]._ir
    else:
        # Source-free expression: check that the IR type agrees with the
        # declared expression type before executing the IR directly.
        ir_type = expression._ir.typ
        expression_type = expression.dtype
        if ir_type != expression_type:
            raise ExpressionException(
                f'Expression type and IR type differed: \n{ir_type}\n vs \n{expression_type}'
            )
        ir = expression._ir

    return Env.backend().execute(ir, True)
def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[], mt_contigs=[], par=[], _builtin=False):
    """Create a reference genome and register it on the Python side.

    Parameters
    ----------
    name : :obj:`str`
        Name of the reference genome; used as its key in the registry.
    contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contig names.
    lengths : :obj:`dict`
        Mapping from contig name to contig length. The ``'contigs'`` entry of
        the configuration is built from this mapping's items.
    x_contigs, y_contigs, mt_contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contigs stored under ``'xContigs'``, ``'yContigs'`` and
        ``'mtContigs'`` in the configuration.
    par : :obj:`list` of :obj:`tuple` of (str, int, int)
        Regions given as (contig, start, end) tuples.
    _builtin : :obj:`bool`
        If ``False``, the configuration is also added to the backend.
    """
    super(ReferenceGenome, self).__init__()

    contigs = wrap_to_list(contigs)
    x_contigs = wrap_to_list(x_contigs)
    y_contigs = wrap_to_list(y_contigs)
    mt_contigs = wrap_to_list(mt_contigs)

    contig_records = []
    for contig_name, contig_length in lengths.items():
        contig_records.append({'name': contig_name, 'length': contig_length})

    par_records = []
    for contig, start, end in par:
        par_records.append({
            'start': {'contig': contig, 'position': start},
            'end': {'contig': contig, 'position': end},
        })

    self._config = {
        'name': name,
        'contigs': contig_records,
        'xContigs': x_contigs,
        'yContigs': y_contigs,
        'mtContigs': mt_contigs,
        'par': par_records,
    }

    self._contigs = contigs
    self._lengths = lengths
    # NOTE(review): when `par` is omitted this stores the shared default list
    # object itself, not a copy.
    self._par_tuple = par

    # Built only after the attributes above are set, because each Locus is
    # constructed with this (partially initialised) genome as its reference.
    par_intervals = []
    for contig, start, end in par:
        par_intervals.append(hl.Interval(hl.Locus(contig, start, self), hl.Locus(contig, end, self)))
    self._par = par_intervals

    ReferenceGenome._references[name] = self

    if not _builtin:
        Env.backend().add_reference(self._config)

    hl.ir.register_reference_genome_functions(name)

    self._has_sequence = False
    self._liftovers = set()
def eval_timed(expression):
    """Evaluate a Hail expression, returning its value together with stage timings.

    Parameters
    ----------
    expression : :class:`.Expression`
        Any expression, or a Python value that can be implicitly interpreted
        as an expression.

    Returns
    -------
    (Any, dict)
        Result of evaluating `expression` and a dictionary of the timings.
    """
    from hail.utils.java import Env

    source = expression._indices.source
    analyze('eval_timed', expression, Indices(source))

    if source is None:
        # No table or matrix table involved: the expression's own IR is
        # executed, after checking that its type matches the declared dtype.
        ir_type = expression._ir.typ
        expression_type = expression.dtype
        if ir_type != expression_type:
            raise ExpressionException(f'Expression type and IR type differed: \n{ir_type}\n vs \n{expression_type}')
        ir_to_run = expression._ir
    else:
        # Route the expression through the source's globals under a unique
        # field name and execute the IR of that global.
        uid = Env.get_uid()
        globals_struct = source.select_globals(**{uid: expression}).index_globals()
        ir_to_run = globals_struct[uid]._ir

    return Env.backend().execute(ir_to_run, True)
def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[], mt_contigs=[], par=[], _builtin=False):
    """Create a reference genome and register it on the Python side.

    Parameters
    ----------
    name : :obj:`str`
        Name of the reference genome; used as its key in the registry.
    contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contig names.
    lengths : :obj:`dict`
        Mapping from contig name to contig length. The ``'contigs'`` entry of
        the configuration is built from this mapping's items.
    x_contigs, y_contigs, mt_contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contigs stored under ``'xContigs'``, ``'yContigs'`` and
        ``'mtContigs'`` in the configuration.
    par : :obj:`list` of :obj:`tuple` of (str, int, int)
        Regions given as (contig, start, end) tuples.
    _builtin : :obj:`bool`
        If ``False``, the configuration is also added to the backend.
    """
    super(ReferenceGenome, self).__init__()

    contigs = wrap_to_list(contigs)
    x_contigs = wrap_to_list(x_contigs)
    y_contigs = wrap_to_list(y_contigs)
    mt_contigs = wrap_to_list(mt_contigs)

    contig_records = []
    for contig_name, contig_length in lengths.items():
        contig_records.append({'name': contig_name, 'length': contig_length})

    par_records = []
    for contig, start, end in par:
        region_start = {'contig': contig, 'position': start}
        region_end = {'contig': contig, 'position': end}
        par_records.append({'start': region_start, 'end': region_end})

    self._config = {
        'name': name,
        'contigs': contig_records,
        'xContigs': x_contigs,
        'yContigs': y_contigs,
        'mtContigs': mt_contigs,
        'par': par_records,
    }

    self._contigs = contigs
    self._lengths = lengths
    # NOTE(review): when `par` is omitted this stores the shared default list
    # object itself, not a copy.
    self._par_tuple = par

    # Built only after the attributes above are set, because each Locus is
    # constructed with this (partially initialised) genome as its reference.
    par_intervals = []
    for contig, start, end in par:
        par_intervals.append(hl.Interval(hl.Locus(contig, start, self), hl.Locus(contig, end, self)))
    self._par = par_intervals

    self._global_positions = None

    ReferenceGenome._references[name] = self

    if not _builtin:
        Env.backend().add_reference(self._config)

    hl.ir.register_reference_genome_functions(name)

    # No sequence files and no liftovers are attached at construction time.
    self._sequence_files = None
    self._liftovers = dict()
def remove_sequence(self):
    """Remove the reference sequence.

    Clears the local "has sequence" flag and then asks the backend to remove
    the sequence registered under this genome's name. Nothing is returned.
    """
    # The flag is cleared before the backend call, so it stays cleared even
    # if that call raises.
    self._has_sequence = False
    backend = Env.backend()
    backend.remove_sequence(self.name)
def export_block_matrices(bms: List[BlockMatrix],
                          prefix: str,
                          overwrite: bool = False,
                          delimiter: str = '\t',
                          header: Optional[str] = None,
                          add_index: bool = False):
    """Export several block matrices through a single text multi-writer.

    All arguments after `bms` are passed unchanged to
    ``BlockMatrixTextMultiWriter``.

    :param bms: :obj:`list` of :class:`BlockMatrix`
        Block matrices to export.
    :param prefix: :obj:`str`
        Path prefix handed to the writer.
    :param overwrite: :obj:`bool`
        Overwrite flag handed to the writer.
    :param delimiter: :obj:`str`
        Delimiter handed to the writer (tab by default).
    :param header: :obj:`str` or :obj:`None`
        Header handed to the writer.
    :param add_index: :obj:`bool`
        Index flag handed to the writer.
    """
    block_matrix_irs = [bm._bmir for bm in bms]
    writer = BlockMatrixTextMultiWriter(prefix, overwrite, delimiter, header, add_index)
    multi_write = BlockMatrixMultiWrite(block_matrix_irs, writer)
    Env.backend().execute(multi_write)
def remove_liftover(self, dest_reference_genome):
    """Remove liftover to `dest_reference_genome`.

    Parameters
    ----------
    dest_reference_genome : :obj:`str` or :class:`.ReferenceGenome`
        Destination of the liftover to remove. The body reads ``.name`` from
        it, so a plain string is presumably resolved to a
        :class:`.ReferenceGenome` before this body runs (not visible here).
    """
    dest_name = dest_reference_genome.name

    # Local bookkeeping only changes when the liftover was recorded; the
    # backend is asked to remove it in either case.
    if dest_name in self._liftovers:
        self._liftovers.remove(dest_name)

    backend = Env.backend()
    backend.remove_liftover(self.name, dest_name)
def add_sequence(self, fasta_file, index_file=None):
    """Load the reference sequence from a FASTA file.

    Examples
    --------
    Access the GRCh37 reference genome using :func:`.get_reference`:

    >>> rg = hl.get_reference('GRCh37') # doctest: +SKIP

    Add a sequence file:

    >>> rg.add_sequence('gs://hail-common/references/human_g1k_v37.fasta.gz',
    ...                 'gs://hail-common/references/human_g1k_v37.fasta.fai') # doctest: +SKIP

    Add a sequence file with the default index location:

    >>> rg.add_sequence('gs://hail-common/references/human_g1k_v37.fasta.gz') # doctest: +SKIP

    Notes
    -----
    This method can only be run once per reference genome. Use
    :meth:`~has_sequence` to test whether a sequence is loaded.

    FASTA and index files are hosted on google cloud for some of Hail's
    built-in references:

    **GRCh37**

    - FASTA file: ``gs://hail-common/references/human_g1k_v37.fasta.gz``
    - Index file: ``gs://hail-common/references/human_g1k_v37.fasta.fai``

    **GRCh38**

    - FASTA file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.gz``
    - Index file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.fai``

    Public download links are available
    `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__.

    Parameters
    ----------
    fasta_file : :obj:`str`
        Path to FASTA file. Can be compressed (GZIP) or uncompressed.
    index_file : :obj:`None` or :obj:`str`
        Path to FASTA index file. Must be uncompressed. If `None`, the final
        extension of `fasta_file` (the last ``.`` and everything after it) is
        replaced with ``.fai``; a path containing no ``.`` is used unchanged.
    """
    if index_file is None:
        # Raw string: in a normal string literal '\.' is an invalid escape
        # sequence, which Python warns about. The pattern itself is unchanged:
        # a literal dot followed by non-dot characters up to the end.
        index_file = re.sub(r'\.[^.]*$', '.fai', fasta_file)

    Env.backend().add_sequence(self.name, fasta_file, index_file)
    # Only mark the sequence as present once the backend call has succeeded.
    self._has_sequence = True
def add_liftover(self, chain_file, dest_reference_genome):
    """Register a chain file for liftover.

    Examples
    --------
    Access GRCh37 and GRCh38 using :func:`.get_reference`:

    >>> rg37 = hl.get_reference('GRCh37') # doctest: +SKIP
    >>> rg38 = hl.get_reference('GRCh38') # doctest: +SKIP

    Add a chain file from 37 to 38:

    >>> rg37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38) # doctest: +SKIP

    Notes
    -----
    Only one chain file can be registered per destination reference genome;
    registering a second one for the same destination raises
    :class:`KeyError`. Use :meth:`~has_liftover` to test whether a chain file
    has been registered.

    The chain file format is described
    `here <https://genome.ucsc.edu/goldenpath/help/chain.html>`__.

    Chain files are hosted on google cloud for some of Hail's built-in
    references:

    **GRCh37 to GRCh38**
    gs://hail-common/references/grch37_to_grch38.over.chain.gz

    **GRCh38 to GRCh37**
    gs://hail-common/references/grch38_to_grch37.over.chain.gz

    Public download links are available
    `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__.

    Parameters
    ----------
    chain_file : :obj:`str`
        Path to chain file. Can be compressed (GZIP) or uncompressed.
    dest_reference_genome : :obj:`str` or :class:`.ReferenceGenome`
        Reference genome to convert to. The body reads ``.name`` from it, so
        a plain string is presumably resolved to a :class:`.ReferenceGenome`
        before this body runs (not visible here).

    Raises
    ------
    KeyError
        If a liftover to `dest_reference_genome` is already registered on
        this reference genome.
    """
    dest_name = dest_reference_genome.name

    # Check for a duplicate BEFORE touching the backend, so a rejected call
    # leaves both the backend and the local bookkeeping unchanged.
    if dest_name in self._liftovers:
        raise KeyError(
            f"Liftover already exists from {self.name} to {dest_reference_genome.name}."
        )

    Env.backend().add_liftover(self.name, chain_file, dest_name)
    # Record the chain file only once the backend has accepted it.
    self._liftovers[dest_name] = chain_file
    hl.ir.register_liftover_functions(self.name, dest_name)
def _compute_type(self):
    """Set ``self._type`` according to the configured function name."""
    name = self.config['name']
    if name in ('TableFilterPartitions', 'TableFilterIntervals'):
        # These two take their type directly from the child node.
        self._type = self.child.typ
        return
    # Any other name must be one of these two; its type is requested from
    # the backend.
    assert name in ('VEP', 'Nirvana'), name
    self._type = Env.backend().table_type(self)
def eval_typed(expression):
    """Evaluate a Hail expression, returning both the result and its type.

    This method is extremely useful for learning about Hail expressions and
    understanding how to compose them.

    The expression must have no indices, but can refer to the globals of a
    :class:`.hail.Table` or :class:`.hail.MatrixTable`.

    Examples
    --------
    Evaluate a conditional:

    >>> x = 6
    >>> hl.eval_typed(hl.cond(x % 2 == 0, 'Even', 'Odd'))
    ('Even', dtype('str'))

    Parameters
    ----------
    expression : :class:`.Expression`
        Any expression, or a Python value that can be implicitly interpreted
        as an expression.

    Returns
    -------
    (any, :class:`.HailType`)
        Result of evaluating `expression`, and its type.
    """
    source = expression._indices.source
    analyze('eval_typed', expression, Indices(source))

    if source is not None:
        # The expression refers to a table or matrix table: collect it and
        # take the first element.
        return expression.collect()[0], expression.dtype

    # Source-free expression: execute its IR directly on the backend.
    value = Env.backend().execute(expression._ir)
    return (value, expression.dtype)
def compute_and_annotate_ld_score(ht, r2_adj, radius, out_name, overwrite):
    """Compute LD scores from `r2_adj`, annotate them onto `ht`, and write the result.

    Parameters
    ----------
    ht
        Table with a ``locus`` field and a ``new_idx`` field; ``new_idx`` is
        used to join the scores back on.
    r2_adj
        Block matrix of adjusted r-squared values.
    radius
        Window radius passed to ``hl.linalg.utils.locus_windows``.
    out_name
        Path the annotated table is written to.
    overwrite
        Passed through to the table write.
    """
    windows = hl.linalg.utils.locus_windows(ht.locus, radius, _localize=False)

    # Lifted directly from https://github.com/hail-is/hail/blob/555e02d6c792263db2c3ed97db8002b489e2dacb/hail/python/hail/methods/statgen.py#L2595
    # for the time being, until efficient BlockMatrix filtering gets an easier interface.
    # This is required because the squaring/multiplication densifies the
    # matrix, so this step re-sparsifies it.
    filter_rows = r2_adj._jbm.filterRowIntervalsIR
    sparse_jbm = filter_rows(Env.backend()._to_java_ir(windows._ir), False)
    r2_adj = BlockMatrix._from_java(sparse_jbm)

    # Score = transposed axis-0 sums + axis-1 sums + 1.
    axis0_sums = r2_adj.sum(axis=0).T
    axis1_sums = r2_adj.sum(axis=1)
    ld_scores = axis0_sums + axis1_sums + 1

    # Round-trip through temporary files: block matrix -> TSV -> table.
    bm_tmp_path = new_temp_file()
    tsv_tmp_path = new_temp_file()
    ld_scores.write(bm_tmp_path, force_row_major=True)
    BlockMatrix.export(bm_tmp_path, tsv_tmp_path)

    scores_ht = hl.import_table(tsv_tmp_path, no_header=True, impute=True)
    scores_ht = scores_ht.add_index().rename({'f0': 'ld_score'}).key_by('idx')

    ht = ht.annotate(**scores_ht[ht.new_idx]).select_globals()
    scored = ht.filter(hl.is_defined(ht.ld_score))
    scored.write(out_name, overwrite)
def spark_context():
    """Return the active Spark context.

    Returns
    -------
    :class:`pyspark.SparkContext`
        The ``sc`` attribute of the current backend.
    """
    backend = Env.backend()
    return backend.sc
def add_liftover(self, chain_file, dest_reference_genome):
    """Register a chain file for liftover.

    Examples
    --------
    Access GRCh37 and GRCh38 using :func:`.get_reference`:

    >>> rg37 = hl.get_reference('GRCh37') # doctest: +SKIP
    >>> rg38 = hl.get_reference('GRCh38') # doctest: +SKIP

    Add a chain file from 37 to 38:

    >>> rg37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38) # doctest: +SKIP

    Notes
    -----
    This method can only be run once per reference genome. Use
    :meth:`~has_liftover` to test whether a chain file has been registered.

    The chain file format is described
    `here <https://genome.ucsc.edu/goldenpath/help/chain.html>`__.

    Chain files are hosted on google cloud for some of Hail's built-in
    references:

    **GRCh37 to GRCh38**
    gs://hail-common/references/grch37_to_grch38.over.chain.gz

    **GRCh38 to GRCh37**
    gs://hail-common/references/grch38_to_grch37.over.chain.gz

    Public download links are available
    `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__.

    Parameters
    ----------
    chain_file : :obj:`str`
        Path to chain file. Can be compressed (GZIP) or uncompressed.
    dest_reference_genome : :obj:`str` or :class:`.ReferenceGenome`
        Reference genome to convert to. The body reads ``.name`` from it, so
        a plain string is presumably resolved to a :class:`.ReferenceGenome`
        before this body runs (not visible here).
    """
    dest_name = dest_reference_genome.name

    backend = Env.backend()
    backend.add_liftover(self.name, chain_file, dest_name)

    # Remember the destination locally, then register the liftover functions
    # for this (source, destination) pair.
    self._liftovers.add(dest_name)
    hl.ir.register_liftover_functions(self.name, dest_name)
def define_function(f, *param_types, _name=None, type_args=()):
    """Register `f` as an IR function on the backend and return a callable wrapper.

    Parameters
    ----------
    f : callable
        Python function that builds the body expression from one argument
        expression per entry in `param_types`.
    *param_types
        Types of the function's parameters, in order.
    _name : :obj:`str`, optional
        Name to register under; a fresh unique id is generated when omitted.
    type_args : tuple
        Type arguments forwarded to the registration and to every call.

    Returns
    -------
    Function
        Wrapper built from a type-checked callable, `param_types`, the body's
        type, the registered name and `type_args`.
    """
    if _name is None:
        mname = Env.get_uid()
    else:
        mname = _name

    # One fresh parameter name per declared parameter type.
    param_names = [Env.get_uid(mname) for _unused in param_types]

    # Call the user's function on reference expressions to obtain the body.
    param_refs = [construct_expr(Ref(param_name), param_type)
                  for param_name, param_type in zip(param_names, param_types)]
    body = f(*param_refs)
    ret_type = body.dtype

    Env.backend().register_ir_function(mname, type_args, param_names, param_types, ret_type, body)

    # The wrapper intentionally reuses the name `f` (shadowing the parameter,
    # which is no longer needed) so its ``__name__`` is unchanged.
    @typecheck(args=expr_any)
    def f(*args):
        indices, aggregations = unify_all(*args)
        call_ir = Apply(mname, ret_type, *(a._ir for a in args), type_args=type_args)
        return construct_expr(call_ir, ret_type, indices, aggregations)

    return Function(f, param_types, ret_type, mname, type_args)
def generate_ld_scores_from_ld_matrix(pop_data, data_type, min_frequency=0.01, call_rate_cutoff=0.8,
                                      adj: bool = False, radius: int = 1000000, overwrite=False):
    """Compute and write LD scores for each population listed in `pop_data`.

    For every population (except ``'nfe'``, ``'fin'`` and ``'asj'``, which
    are skipped) the LD index table is filtered by allele frequency and call
    rate, the matching LD matrix is read, squared and adjusted, scores are
    summed within `radius`-sized locus windows, and the annotated table is
    written to ``ld_scores_path(data_type, pop, adj)``.

    Parameters
    ----------
    pop_data
        Something ``dict()`` accepts, whose values map population name to a
        number ``n`` used in the call-rate filter and the r-squared adjustment.
    data_type
        Passed through to the path helpers.
    min_frequency
        Variants are kept when AF lies in ``[min_frequency, 1 - min_frequency]``.
    call_rate_cutoff
        Variants are kept when ``AN / n >= 2 * call_rate_cutoff``.
    adj
        Passed through to the path helpers.
    radius
        Window radius passed to ``hl.linalg.utils.locus_windows``.
    overwrite
        Passed through to the final table write.
    """
    # This function required a decent number of high-mem machines (with an SSD for good measure) to complete the AFR
    # For the rest, on 20 n1-standard-8's, 1h15m to export block matrix, 15 mins to compute LD scores per population (~$150 total)
    for pops in dict(pop_data).values():
        for pop, n in pops.items():
            if pop in ('nfe', 'fin', 'asj'):
                continue

            ht = hl.read_table(ld_index_path(data_type, pop, adj=adj))
            frequency_ok = (ht.pop_freq.AF >= min_frequency) & (ht.pop_freq.AF <= 1 - min_frequency)
            call_rate_ok = ht.pop_freq.AN / n >= 2 * call_rate_cutoff
            ht = ht.filter(frequency_ok & call_rate_ok).add_index(name='new_idx')

            indices = ht.idx.collect()

            matrix_path = ld_matrix_path(data_type, pop, min_frequency >= COMMON_FREQ, adj=adj)
            r2 = BlockMatrix.read(matrix_path)
            r2 = r2.filter(indices, indices)**2
            r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

            windows = hl.linalg.utils.locus_windows(ht.locus, radius, _localize=False)

            # Lifted directly from https://github.com/hail-is/hail/blob/555e02d6c792263db2c3ed97db8002b489e2dacb/hail/python/hail/methods/statgen.py#L2595
            # for the time being, until efficient BlockMatrix filtering gets an easier interface
            filter_rows = r2_adj._jbm.filterRowIntervalsIR
            windowed_jbm = filter_rows(Env.backend()._to_java_ir(windows._ir), False)
            r2_adj = BlockMatrix._from_java(windowed_jbm)

            # Score = transposed axis-0 sums + axis-1 sums + 1.
            axis0_sums = r2_adj.sum(axis=0).T
            axis1_sums = r2_adj.sum(axis=1)
            ld_scores = axis0_sums + axis1_sums + 1

            # Round-trip through temporary files: block matrix -> TSV -> table.
            bm_tmp_path = new_temp_file()
            tsv_tmp_path = new_temp_file()
            ld_scores.write(bm_tmp_path, force_row_major=True)
            BlockMatrix.export(bm_tmp_path, tsv_tmp_path)

            scores_ht = hl.import_table(tsv_tmp_path, no_header=True, impute=True)
            scores_ht = scores_ht.add_index().rename({'f0': 'ld_score'}).key_by('idx')

            ht = ht.annotate(**scores_ht[ht.new_idx]).select_globals()
            scored = ht.filter(hl.is_defined(ht.ld_score))
            scored.write(ld_scores_path(data_type, pop, adj), overwrite)
def write_variant_datasets(vdss, paths, *, overwrite=False, stage_locally=False, codec_spec=None):
    """Write many `vdses` to their corresponding path in `paths`.

    The reference data of every dataset goes to ``<path>/reference_data`` and
    the variant data to ``<path>/variant_data``. Each kind is written by its
    own multi-write, reference data first.
    """
    reference_paths = [f"{p}/reference_data" for p in paths]
    ref_writer = ir.MatrixNativeMultiWriter(reference_paths, overwrite, stage_locally, codec_spec)

    variant_paths = [f"{p}/variant_data" for p in paths]
    var_writer = ir.MatrixNativeMultiWriter(variant_paths, overwrite, stage_locally, codec_spec)

    reference_irs = [vds.reference_data._mir for vds in vdss]
    Env.backend().execute(ir.MatrixMultiWrite(reference_irs, ref_writer))

    variant_irs = [vds.variant_data._mir for vds in vdss]
    Env.backend().execute(ir.MatrixMultiWrite(variant_irs, var_writer))
def debug_info():
    """Collect basic diagnostic information about the running installation.

    Returns
    -------
    :obj:`dict`
        ``'spark_conf'``: all Spark configuration entries when the backend is
        a ``SparkBackend``, otherwise ``None``. ``'hail_jar_path'``: path of
        the bundled ``hail-all-spark.jar`` resource when it exists, otherwise
        ``None``. ``'version'``: the result of ``version()``.
    """
    from hail.backend.spark_backend import SparkBackend

    if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"):
        hail_jar_path = pkg_resources.resource_filename(__name__, "hail-all-spark.jar")
    else:
        hail_jar_path = None

    if isinstance(Env.backend(), SparkBackend):
        spark_conf = spark_context()._conf.getAll()
    else:
        spark_conf = None

    return {
        'spark_conf': spark_conf,
        'hail_jar_path': hail_jar_path,
        'version': version(),
    }
def _eval_many(*expressions, timed=False, name='_eval_many'):
    """Lower each expression to IR and execute all of them in one backend call.

    Parameters
    ----------
    *expressions
        Expressions to evaluate.
    timed : :obj:`bool`
        Forwarded to the backend's ``execute_many``.
    name : :obj:`str`
        Caller name passed to ``analyze`` for each expression.

    Returns
    -------
    Whatever the backend's ``execute_many`` returns for the collected IRs.
    """
    from hail.utils.java import Env

    irs = []
    for expression in expressions:
        source = expression._indices.source
        analyze(name, expression, Indices(source))

        if source is not None:
            # Route the expression through the source's globals under a
            # unique field name and use the IR of that global.
            uid = Env.get_uid()
            globals_struct = source.select_globals(**{uid: expression}).index_globals()
            irs.append(globals_struct[uid]._ir)
            continue

        # Source-free expression: its IR type must match its declared dtype.
        ir_type = expression._ir.typ
        expression_type = expression.dtype
        if ir_type != expression_type:
            raise ExpressionException(
                f'Expression type and IR type differed: \n{ir_type}\n vs \n{expression_type}'
            )
        irs.append(expression._ir)

    return Env.backend().execute_many(*irs, timed=timed)
def _compute_type(self):
    """Ask the backend for this node's block-matrix type and store it in ``self._type``."""
    backend = Env.backend()
    self._type = backend.blockmatrix_type(self)
def _compute_type(self):
    """Ask the backend for this node's table type and store it in ``self._type``."""
    backend = Env.backend()
    self._type = backend.table_type(self)