from hail.typecheck import enumeration


class ExportType:
    CONCATENATED = "concatenated"
    PARALLEL_SEPARATE_HEADER = "separate_header"
    PARALLEL_HEADER_IN_SHARD = "header_per_shard"

    checker = enumeration("concatenated", "separate_header", "header_per_shard")

    @staticmethod
    def default(export_type):
        if export_type is None:
            return ExportType.CONCATENATED
        else:
            return export_type
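A minimal usage sketch, assuming the ExportType class above is in scope as written; it only shows how default() resolves a missing export type to the concatenated mode.

# Illustrative calls: None falls back to the concatenated mode,
# while an explicit value passes through unchanged.
assert ExportType.default(None) == ExportType.CONCATENATED
assert ExportType.default(ExportType.PARALLEL_HEADER_IN_SHARD) == "header_per_shard"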
                                     matrix_table_source)
from hail.expr.types import tarray
from hail import ir
from hail.linalg import BlockMatrix
from hail.table import Table
from hail.typecheck import typecheck, nullable, numeric, enumeration

from ..pca import hwe_normalized_pca


@typecheck(call_expr=expr_call,
           min_individual_maf=numeric,
           k=nullable(int),
           scores_expr=nullable(expr_array(expr_float64)),
           min_kinship=nullable(numeric),
           statistics=enumeration('kin', 'kin2', 'kin20', 'all'),
           block_size=nullable(int),
           include_self_kinship=bool)
def pc_relate(call_expr, min_individual_maf, *, k=None, scores_expr=None,
              min_kinship=None, statistics="all", block_size=None,
              include_self_kinship=False) -> Table:
    r"""Compute relatedness estimates between individuals using a variant of the
    PC-Relate method.

    .. include:: ../_templates/req_diploid_gt.rst
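A hedged usage sketch of the pc_relate signature above; the simulated dataset, maf threshold, and kinship cutoff are illustrative choices, not values taken from the source.

import hail as hl

# Illustrative dataset: a small simulated MatrixTable with a diploid GT call field.
mt = hl.balding_nichols_model(n_populations=3, n_samples=50, n_variants=1000)
rel = hl.pc_relate(mt.GT, min_individual_maf=0.01, k=4, statistics='kin')
# Keep pairs estimated to be second-degree relatives or closer.
related_pairs = rel.filter(rel.kin > 0.125)
related_pairs.show()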
    Env._seed_generator = None

    hail.ir.clear_session_functions()
    ReferenceGenome._references = {}


@typecheck(sc=nullable(SparkContext),
           app_name=str,
           master=nullable(str),
           local=str,
           log=nullable(str),
           quiet=bool,
           append=bool,
           min_block_size=int,
           branching_factor=int,
           tmp_dir=nullable(str),
           default_reference=enumeration(*BUILTIN_REFERENCES),
           idempotent=bool,
           global_seed=nullable(int),
           spark_conf=nullable(dictof(str, str)),
           skip_logging_configuration=bool,
           local_tmpdir=nullable(str),
           _optimizer_iterations=nullable(int))
def init(sc=None, app_name='Hail', master=None, local='local[*]',
         log=None, quiet=False, append=False,
         min_block_size=0, branching_factor=50,
import atexit
import shutil
import tempfile

from hail.typecheck import enumeration


def new_local_temp_dir(suffix=None, prefix=None, dir=None):
    local_temp_dir = tempfile.mkdtemp(suffix, prefix, dir)
    atexit.register(shutil.rmtree, local_temp_dir)
    return local_temp_dir


def new_local_temp_file(filename="temp"):
    local_temp_dir = new_local_temp_dir()
    path = local_temp_dir + "/" + filename
    return path


storage_level = enumeration('NONE', 'DISK_ONLY', 'DISK_ONLY_2', 'MEMORY_ONLY',
                            'MEMORY_ONLY_2', 'MEMORY_ONLY_SER', 'MEMORY_ONLY_SER_2',
                            'MEMORY_AND_DISK', 'MEMORY_AND_DISK_2', 'MEMORY_AND_DISK_SER',
                            'MEMORY_AND_DISK_SER_2', 'OFF_HEAP')


def run_command(args):
    import subprocess as sp
    try:
        sp.check_output(args, stderr=sp.STDOUT)
    except sp.CalledProcessError as e:
        print(e.output)
        raise e


def plural(orig, n, alternate=None):
    if n == 1:
        return orig
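A small, hedged sketch of how the helpers above compose: new_local_temp_file returns a path inside a temp directory that is removed at interpreter exit, and run_command prints a failing subprocess's output before re-raising. The file name and shell command are placeholders.

# Illustrative only: write to a temp file registered for cleanup at exit,
# then shell out through run_command.
path = new_local_temp_file("example.txt")
with open(path, "w") as f:
    f.write("hello\n")
run_command(["cat", path])  # on failure, prints the output and re-raises CalledProcessError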
@typecheck(bms=sequenceof(BlockMatrix),
           prefix=str,
           overwrite=bool)
def block_matrices_tofiles(bms: List[BlockMatrix], prefix: str, overwrite: bool = False):
    writer = BlockMatrixBinaryMultiWriter(prefix, overwrite)
    Env.backend().execute(
        BlockMatrixMultiWrite([bm._bmir for bm in bms], writer))


@typecheck(bms=sequenceof(BlockMatrix),
           prefix=str,
           overwrite=bool,
           delimiter=str,
           header=nullable(str),
           add_index=bool,
           compression=nullable(enumeration('gz', 'bgz')),
           custom_filenames=nullable(sequenceof(str)))
def export_block_matrices(bms: List[BlockMatrix],
                          prefix: str,
                          overwrite: bool = False,
                          delimiter: str = '\t',
                          header: Optional[str] = None,
                          add_index: bool = False,
                          compression: Optional[str] = None,
                          custom_filenames=None):
    if custom_filenames:
        assert len(custom_filenames) == len(bms), \
            "Number of block matrices and number of custom filenames must be equal"
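A hedged usage sketch of export_block_matrices as declared above; it assumes an initialized Hail session and that the function itself is in scope, and the output prefix and file names are placeholders.

import numpy as np
from hail.linalg import BlockMatrix

# Two small block matrices exported as bgzipped, tab-delimited text files
# under a hypothetical output prefix.
bm_a = BlockMatrix.from_numpy(np.arange(6, dtype=np.float64).reshape(2, 3))
bm_b = BlockMatrix.from_numpy(np.ones((2, 3)))
export_block_matrices([bm_a, bm_b], prefix='/tmp/bm_export',
                      compression='bgz', add_index=True,
                      custom_filenames=['a.tsv.bgz', 'b.tsv.bgz'])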
from hail.utils.java import Env
from hail.typecheck import typecheck, enumeration
from typing import Dict, List


@typecheck(path=str,
           mode=enumeration('r', 'w', 'x', 'rb', 'wb', 'xb'),
           buffer_size=int)
def hadoop_open(path: str, mode: str = 'r', buffer_size: int = 8192):
    """Open a file through the Hadoop filesystem API. Supports distributed
    file systems like hdfs, gs, and s3.

    Warning
    -------
    Due to an implementation limitation, :func:`hadoop_open` may be quite
    slow for large data sets (anything larger than 50 MB).

    Examples
    --------
    Write a Pandas DataFrame as a CSV directly into Google Cloud Storage:

    >>> with hadoop_open('gs://my-bucket/df.csv', 'w') as f: # doctest: +SKIP
    ...     pandas_df.to_csv(f)

    Read and print the lines of a text file stored in Google Cloud Storage:

    >>> with hadoop_open('gs://my-bucket/notes.txt') as f: # doctest: +SKIP
    ...     for line in f:
    ...         print(line.strip())

    Write two lines directly to a file in Google Cloud Storage:
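    >>> with hadoop_open('gs://my-bucket/notes.txt', 'w') as f: # doctest: +SKIP
    ...     f.write('first line\n')   # illustrative content, not from the source
    ...     f.write('second line\n')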
    Env._seed_generator = None

    hail.ir.clear_session_functions()
    ReferenceGenome._references = {}


@typecheck(sc=nullable(SparkContext),
           app_name=str,
           master=nullable(str),
           local=str,
           log=nullable(str),
           quiet=bool,
           append=bool,
           min_block_size=int,
           branching_factor=int,
           tmp_dir=str,
           default_reference=enumeration('GRCh37', 'GRCh38', 'GRCm38', 'CanFam3'),
           idempotent=bool,
           global_seed=nullable(int),
           spark_conf=nullable(dictof(str, str)),
           skip_logging_configuration=bool,
           local_tmpdir=nullable(str),
           _optimizer_iterations=nullable(int))
def init(sc=None, app_name='Hail', master=None, local='local[*]',
         log=None, quiet=False, append=False,
         min_block_size=0, branching_factor=50,
        Env._seed_generator = None

    def upload_log(self):
        self._jhc.uploadLog()


@typecheck(sc=nullable(SparkContext),
           app_name=str,
           master=nullable(str),
           local=str,
           log=nullable(str),
           quiet=bool,
           append=bool,
           min_block_size=int,
           branching_factor=int,
           tmp_dir=str,
           default_reference=enumeration('GRCh37', 'GRCh38'),
           idempotent=bool,
           global_seed=nullable(int),
           _backend=nullable(Backend))
def init(sc=None, app_name='Hail', master=None, local='local[*]',
         log=None, quiet=False, append=False,
         min_block_size=1, branching_factor=50,
         tmp_dir='/tmp', default_reference='GRCh37', idempotent=False,
         global_seed=6348563392232659379, _backend=None):
    """Initialize Hail and Spark.

    Parameters
    ----------
    sc : pyspark.SparkContext, optional
        Spark context. By default, a Spark context will be created.
    app_name : :obj:`str`
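A hedged usage sketch based on the init signature and parameter docs above; in a user script the function is normally reached as hl.init, and the application name, log path, and reference build here are placeholders.

import hail as hl

# Illustrative call: start a local Spark-backed session with an explicit
# log file and reference build.
hl.init(app_name='my-analysis', log='/tmp/hail-my-analysis.log',
        default_reference='GRCh38', quiet=True)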
    Returns
    -------
    :class:`.VariantDataset`
    """
    if keep:
        variant_data = vds.variant_data.semi_join_rows(variants_table)
    else:
        variant_data = vds.variant_data.anti_join_rows(variants_table)
    return VariantDataset(vds.reference_data, variant_data)


@typecheck(vds=VariantDataset,
           intervals=oneof(Table, expr_array(expr_interval(expr_any))),
           keep=bool,
           mode=enumeration('variants_only', 'split_at_boundaries', 'unchecked_filter_both'))
def _parameterized_filter_intervals(vds: 'VariantDataset', intervals, keep: bool, mode: str) -> 'VariantDataset':
    intervals_table = None
    if isinstance(intervals, Table):
        expected = hl.tinterval(hl.tlocus(vds.reference_genome))
        if len(intervals.key) != 1 or intervals.key[0].dtype != hl.tinterval(hl.tlocus(vds.reference_genome)):
            raise ValueError(
                f"'filter_intervals': expect a table with a single key of type {expected}; "
                f"found {list(intervals.key.dtype.values())}")
        intervals_table = intervals
        intervals = intervals.aggregate(hl.agg.collect(intervals.key[0]))

    if mode == 'variants_only':
        variant_data = hl.filter_intervals(vds.variant_data, intervals, keep)
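A hedged usage sketch of the public wrapper around the helper above; `vds` is assumed to be an existing VariantDataset on GRCh38, and the interval string is a placeholder.

import hail as hl

# Keep only variants falling inside one locus interval via hl.vds.filter_intervals.
intervals = [hl.parse_locus_interval('chr1:1M-2M', reference_genome='GRCh38')]
filtered_vds = hl.vds.filter_intervals(vds, intervals, keep=True)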