def rename_duplicates(dataset, name='unique_id') -> MatrixTable:
    """Rename duplicate column keys.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    >>> renamed = hl.rename_duplicates(dataset).cols()
    >>> duplicate_samples = (renamed.filter(renamed.s != renamed.unique_id)
    ...                             .select()
    ...                             .collect())

    Notes
    -----
    This method produces a new column field from the string column key by
    appending a unique suffix ``_N`` as necessary. For example, if the column
    key "NA12878" appears three times in the dataset, the first will produce
    "NA12878", the second will produce "NA12878_1", and the third will produce
    "NA12878_2". The name of this new field is parameterized by `name`.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name of new field.

    Returns
    -------
    :class:`.MatrixTable`
    """
    require_col_key_str(dataset, 'rename_duplicates')
    ids = dataset.col_key[0].collect()
    uniques = set()
    mapping = []
    new_ids = []

    def fmt(s, i):
        return '{}_{}'.format(s, i)

    for s in ids:
        s_ = s
        i = 0
        while s_ in uniques:
            i += 1
            s_ = fmt(s, i)

        if s_ != s:
            mapping.append((s, s_))
        uniques.add(s_)
        new_ids.append(s_)

    if mapping:
        info(f'Renamed {len(mapping)} duplicate {plural("sample ID", len(mapping))}. Mangled IDs as follows:'
             + ''.join(f'\n "{pre}" => "{post}"' for pre, post in mapping))
    else:
        info('No duplicate sample IDs found.')
    uid = Env.get_uid()
    return dataset.annotate_cols(**{name: hl.literal(new_ids)[hl.int(hl.scan.count())]})
def hwe_normalize(call_expr):
    mt = matrix_table_source('hwe_normalize/call_expr', call_expr)
    mt = mt.select_entries(__gt=call_expr.n_alt_alleles())
    mt = mt.annotate_rows(__AC=agg.sum(mt.__gt),
                          __n_called=agg.count_where(hl.is_defined(mt.__gt)))
    mt = mt.filter_rows((mt.__AC > 0) & (mt.__AC < 2 * mt.__n_called))

    n_variants = mt.count_rows()
    if n_variants == 0:
        raise FatalError("hwe_normalize: found 0 variants after filtering out monomorphic sites.")
    info(f"hwe_normalize: found {n_variants} variants after filtering out monomorphic sites.")

    mt = mt.annotate_rows(__mean_gt=mt.__AC / mt.__n_called)
    mt = mt.annotate_rows(__hwe_scaled_std_dev=hl.sqrt(mt.__mean_gt * (2 - mt.__mean_gt) * n_variants / 2))
    mt = mt.unfilter_entries()

    normalized_gt = hl.or_else((mt.__gt - mt.__mean_gt) / mt.__hwe_scaled_std_dev, 0.0)
    return normalized_gt
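# Usage sketch (not part of the original module): assuming `hwe_normalize` and its
# module imports are in scope as defined above, the returned entry expression can be
# materialized as a BlockMatrix. The toy dataset and its dimensions are arbitrary
# illustrative choices.
import hail as hl
from hail.linalg import BlockMatrix

demo_mt = hl.balding_nichols_model(n_populations=3, n_samples=50, n_variants=100)
demo_normalized = hwe_normalize(demo_mt.GT)               # entry expression over demo_mt
demo_bm = BlockMatrix.from_entry_expr(demo_normalized)    # variants x samples, HWE-normalized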
def plan(self, n_inputs: int) -> CombinerPlan:
    assert n_inputs > 0

    def int_ceil(x):
        return int(math.ceil(x))

    tree_height = int_ceil(math.log(n_inputs, self.branch_factor))
    phases: List[Phase] = []
    file_size: List[List[int]] = []  # list of file sizes per phase
    file_size.append([1 for _ in range(n_inputs)])

    while len(file_size[-1]) > 1:
        last_stage_files = file_size[-1]
        n = len(last_stage_files)
        i = 0
        jobs = []
        while i < n:
            job = []
            job_i = 0
            while job_i < self.batch_size and i < n:
                merge = []
                merge_i = 0
                merge_size = 0
                while merge_i < self.branch_factor and i < n:
                    merge_size += last_stage_files[i]
                    merge.append(i)
                    merge_i += 1
                    i += 1
                job.append(Merge(merge, merge_size))
                job_i += 1
            jobs.append(Job(job))
        file_size.append([merge.input_total_size
                          for job in jobs
                          for merge in job.merges])
        phases.append(Phase(jobs))

    assert len(phases) == tree_height
    for layer in file_size:
        assert sum(layer) == n_inputs

    phase_strs = []
    total_jobs = 0
    for i, phase in enumerate(phases):
        n = len(phase.jobs)
        job_str = hl.utils.misc.plural('job', n)
        n_files_produced = len(file_size[i + 1])
        adjective = 'final' if n_files_produced == 1 else 'intermediate'
        file_str = hl.utils.misc.plural('file', n_files_produced)
        phase_strs.append(f'\n Phase {i + 1}: {n} {job_str} corresponding to {n_files_produced} {adjective} output {file_str}.')
        total_jobs += n

    info(f"GVCF combiner plan:\n"
         f" Branch factor: {self.branch_factor}\n"
         f" Batch size: {self.batch_size}\n"
         f" Combining {n_inputs} input files in {tree_height} phases with {total_jobs} total jobs.{''.join(phase_strs)}\n")
    return CombinerPlan(file_size, phases)
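# Worked example of the plan arithmetic used above (pure-Python sketch; the input
# count, branch factor, and batch size are illustrative values, not real settings).
import math

demo_n_inputs, demo_branch_factor, demo_batch_size = 250, 16, 8
demo_tree_height = math.ceil(math.log(demo_n_inputs, demo_branch_factor))   # 2 phases
demo_phase1_merges = math.ceil(demo_n_inputs / demo_branch_factor)          # 16 intermediate files
demo_phase1_jobs = math.ceil(demo_phase1_merges / demo_batch_size)          # 2 jobs in phase 1
assert (demo_tree_height, demo_phase1_merges, demo_phase1_jobs) == (2, 16, 2)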
def hailBlanczos(A, G, k, q, compute_U=False):
    G_i = hl.nd.qr(G)[0]
    g_list = [G_i]

    for j in range(0, q):
        info(f"blanczos_pca: Beginning iteration {j + 1}/{q}")
        G_i = A.aggregate(hl.agg.ndarray_sum(A.ndarray.T @ (A.ndarray @ G_i)), _localize=False)
        G_i = hl.nd.qr(G_i)[0]._persist()
        g_list.append(G_i)

    info("blanczos_pca: Iterations complete. Computing local QR")
    G = hl.nd.hstack(g_list)
    V = hl.nd.qr(G)[0]._persist()
    AV = A.select(ndarray=A.ndarray @ V)

    if compute_U:
        AV_local = hl.nd.vstack(AV.aggregate(hl.agg.collect(AV.ndarray)))
        U, R = hl.nd.qr(AV_local)._persist()
        return U, R, V
    else:
        Rs = hl.nd.vstack(AV.aggregate(hl.agg.collect(hl.nd.qr(AV.ndarray)[1])))
        R = hl.nd.qr(Rs)[1]._persist()
        return R, V
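# For intuition, the same randomized subspace iteration written against NumPy.
# This is an explanatory sketch only, not Hail code: shapes, the seed, and the
# iteration count are toy values, and unlike the Hail version above it keeps only
# the final block rather than stacking every iterate into a block Krylov basis.
import numpy as np

rng = np.random.default_rng(0)
A_local = rng.normal(size=(1000, 200))                   # stand-in for the distributed matrix
G_local, _ = np.linalg.qr(rng.normal(size=(200, 12)))    # random starting block, like G above
for _ in range(3):                                       # q power iterations of A^T A
    G_local, _ = np.linalg.qr(A_local.T @ (A_local @ G_local))
# The columns of G_local now approximately span the top right-singular subspace of A_local.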
def _step_vdses(self):
    current_bin = original_bin = min(self.vdses)
    files_to_merge = self.vdses[current_bin][:self.branch_factor]
    if len(files_to_merge) == len(self.vdses[current_bin]):
        del self.vdses[current_bin]
    else:
        self.vdses[current_bin] = self.vdses[current_bin][self.branch_factor:]

    remaining = self.branch_factor - len(files_to_merge)
    while self._num_vdses > 0 and remaining > 0:
        current_bin = min(self.vdses)
        extra = self.vdses[current_bin][-remaining:]
        if len(extra) == len(self.vdses[current_bin]):
            del self.vdses[current_bin]
        else:
            self.vdses[current_bin] = self.vdses[current_bin][:-remaining]
        files_to_merge = extra + files_to_merge
        remaining = self.branch_factor - len(files_to_merge)

    new_n_samples = sum(f.n_samples for f in files_to_merge)
    info(f'VDS Combine (job {self._job_id}): merging {len(files_to_merge)} datasets with {new_n_samples} samples')

    temp_path = self._temp_out_path(f'vds-combine_job{self._job_id}')
    largest_vds = max(files_to_merge, key=lambda vds: vds.n_samples)
    vds = hl.vds.read_vds(largest_vds.path)

    interval_bin = floor(log(new_n_samples, self.branch_factor))
    intervals, intervals_dtype = self.__intervals_cache.get(interval_bin, (None, None))

    if intervals is None:
        # we use the reference data since it generally has more rows than the variant data
        intervals, intervals_dtype = calculate_new_intervals(vds.reference_data, self.target_records,
                                                             os.path.join(temp_path, 'interval_checkpoint.ht'))
        self.__intervals_cache[interval_bin] = (intervals, intervals_dtype)

    paths = [f.path for f in files_to_merge]
    vdss = read_variant_datasets(paths, intervals, intervals_dtype)
    combined = combine_variant_datasets(vdss)

    if self.finished:
        combined.write(self.output_path)
        return

    new_path = os.path.join(temp_path, 'dataset.vds')
    combined.write(new_path, overwrite=True)
    new_bin = floor(log(new_n_samples, self.branch_factor))
    # this ensures that we don't somehow stick a vds at the end of
    # the same bin, ending up with a weird ordering issue
    if new_bin <= original_bin:
        new_bin = original_bin + 1
    self.vdses[new_bin].append(VDSMetadata(path=new_path, n_samples=new_n_samples))
def _step_gvcfs(self):
    step = self.branch_factor
    files_to_merge = self.gvcfs[:self.gvcf_batch_size * step]
    self.gvcfs = self.gvcfs[self.gvcf_batch_size * step:]

    info(f'GVCF combine (job {self._job_id}): merging {len(files_to_merge)} GVCFs into '
         f'{(len(files_to_merge) + step - 1) // step} datasets')

    if self.gvcf_external_header is not None:
        sample_names = self.gvcf_sample_names[:self.gvcf_batch_size * step]
        self.gvcf_sample_names = self.gvcf_sample_names[self.gvcf_batch_size * step:]
    else:
        sample_names = None

    merge_vds = []
    merge_n_samples = []
    vcfs = [transform_gvcf(vcf,
                           reference_entry_fields_to_keep=self.gvcf_reference_entry_fields_to_keep,
                           info_to_keep=self.gvcf_info_to_keep)
            for vcf in hl.import_gvcfs(files_to_merge,
                                       self.gvcf_import_intervals,
                                       array_elements_required=False,
                                       _external_header=self.gvcf_external_header,
                                       _external_sample_ids=[[name] for name in sample_names]
                                       if sample_names is not None else None,
                                       reference_genome=self.reference_genome,
                                       contig_recoding=self.contig_recoding)]
    while vcfs:
        merging, vcfs = vcfs[:step], vcfs[step:]
        merge_vds.append(combine_variant_datasets(merging))
        merge_n_samples.append(len(merging))
    if self.finished and len(merge_vds) == 1:
        merge_vds[0].write(self.output_path)
        return

    temp_path = self._temp_out_path(f'gvcf-combine_job{self._job_id}/dataset_')
    pad = len(str(len(merge_vds) - 1))
    merge_metadata = [VDSMetadata(path=temp_path + str(count).rjust(pad, '0') + '.vds',
                                  n_samples=n_samples)
                      for count, n_samples in enumerate(merge_n_samples)]
    paths = [md.path for md in merge_metadata]
    hl.vds.write_variant_datasets(merge_vds, paths, overwrite=True, codec_spec=FAST_CODEC_SPEC)
    for md in merge_metadata:
        self.vdses[max(1, floor(log(md.n_samples, self.branch_factor)))].append(md)
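# Small arithmetic sketch of the sample-count binning used on the last line above
# (the branch factor and sample counts here are illustrative, not real settings).
from math import floor, log

demo_branch_factor = 16
for demo_n_samples in (5, 20, 300, 5000):
    demo_bin = max(1, floor(log(demo_n_samples, demo_branch_factor)))
    print(demo_n_samples, '-> bin', demo_bin)   # 5 -> bin 1, 20 -> bin 1, 300 -> bin 2, 5000 -> bin 3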
def value_type():
    code = flask.request.json
    info(f'value type: {code}')
    try:
        jir = Env.hail().expr.ir.IRParser.parse_value_ir(code, {}, {})
        result = jir.typ().toString()
        info(f'result: {result}')
        return flask.jsonify(result)
    except FatalError as e:
        return flask.jsonify({'message': e.args[0]}), 400
def copy_log(self, path: str) -> None:
    log = Env.hc()._log
    try:
        if self.is_dir(path):
            _, tail = os.path.split(log)
            path = os.path.join(path, tail)
        info(f"copying log to {repr(path)}...")
        self.copy(local_path_uri(Env.hc()._log), path)
    except Exception as e:
        sys.stderr.write(f'Could not copy log: encountered error:\n {e}')
def value_type():
    code = flask.request.json
    info(f'value type: {code}')
    jir = Env.hail().expr.ir.IRParser.parse_value_ir(code, {}, {})
    result = jir.typ().toString()
    info(f'result: {result}')
    return flask.jsonify(result)
async def value_type(request):
    code = await request.json()
    info(f'value type: {code}')
    try:
        result = await run(blocking_value_type, code)
        info(f'result: {result}')
        return web.json_response(result)
    except FatalError as e:
        return web.json_response({'message': e.args[0]}, status=400)
async def table_type(request, userdata):
    code = await request.json()
    info(f'table type: {code}')
    try:
        result = await run(blocking_table_type, code)
        info(f'result: {result}')
        return web.json_response(result)
    except FatalError as e:
        return web.json_response({'message': e.args[0]}, status=400)
def get_gene_intervals(gene_symbols=None, gene_ids=None, transcript_ids=None,
                       verbose=True, reference_genome=None, gtf_file=None):
    """Get intervals of genes or transcripts.

    Get the boundaries of genes or transcripts from a GTF file, for quick filtering of a Table or MatrixTable.

    On Google Cloud platform:
    Gencode v19 (GRCh37) GTF available at: gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz
    Gencode v29 (GRCh38) GTF available at: gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz

    Example
    -------
    >>> hl.filter_intervals(ht, get_gene_intervals(gene_symbols=['PCSK9'], reference_genome='GRCh37'))  # doctest: +SKIP

    Parameters
    ----------
    gene_symbols : :obj:`list` of :obj:`str`, optional
        Gene symbols (e.g. PCSK9).
    gene_ids : :obj:`list` of :obj:`str`, optional
        Gene IDs (e.g. ENSG00000223972).
    transcript_ids : :obj:`list` of :obj:`str`, optional
        Transcript IDs (e.g. ENST00000456328).
    verbose : :obj:`bool`
        If ``True``, print which genes and transcripts were matched in the GTF file.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use (passed along to import_gtf).
    gtf_file : :obj:`str`
        GTF file to load. If none is provided, but `reference_genome` is one of
        `GRCh37` or `GRCh38`, a default will be used (on Google Cloud Platform).

    Returns
    -------
    :obj:`list` of :class:`.Interval`
    """
    if gene_symbols is None and gene_ids is None and transcript_ids is None:
        raise ValueError('get_gene_intervals requires at least one of gene_symbols, gene_ids, or transcript_ids')

    ht = _load_gencode_gtf(gtf_file, reference_genome)

    criteria = []
    if gene_symbols:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_name == y), gene_symbols))
    if gene_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_id == y.split('\\.')[0]), gene_ids))
    if transcript_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'transcript') & (ht.transcript_id == y.split('\\.')[0]), transcript_ids))

    ht = ht.filter(functools.reduce(operator.ior, criteria))
    gene_info = ht.aggregate(hl.agg.collect((ht.feature, ht.gene_name, ht.gene_id, ht.transcript_id, ht.interval)))
    if verbose:
        info(f'get_gene_intervals found {len(gene_info)} entries:\n'
             + "\n".join(map(lambda x: f'{x[0]}: {x[1]} ({x[2] if x[0] == "gene" else x[3]})', gene_info)))
    intervals = list(map(lambda x: x[-1], gene_info))
    return intervals
def execute():
    code = flask.request.json
    info(f'execute: {code}')
    try:
        jir = Env.hail().expr.ir.IRParser.parse_value_ir(code, {}, {})
        typ = hl.dtype(jir.typ().toString())
        value = Env.hail().backend.spark.SparkBackend.executeJSON(jir)
        result = {'type': str(typ), 'value': value}
        info(f'result: {result}')
        return flask.jsonify(result)
    except FatalError as e:
        return flask.jsonify({'message': e.args[0]}), 400
def table_type():
    code = flask.request.json
    info(f'table type: {code}')
    jir = Env.hail().expr.ir.IRParser.parse_table_ir(code, {}, {})
    ttyp = hl.ttable._from_java(jir.typ())
    result = {'global': str(ttyp.global_type),
              'row': str(ttyp.row_type),
              'row_key': ttyp.row_key}
    info(f'result: {result}')
    return flask.jsonify(result)
def execute():
    code = flask.request.json
    info(f'execute: {code}')
    jir = Env.hail().expr.ir.IRParser.parse_value_ir(code, {}, {})
    typ = hl.dtype(jir.typ().toString())
    value = Env.hail().expr.ir.Interpret.interpretJSON(jir)
    result = {'type': str(typ), 'value': value}
    info(f'result: {result}')
    return flask.jsonify(result)
def table_type():
    code = flask.request.json
    info(f'table type: {code}')
    try:
        jir = Env.hail().expr.ir.IRParser.parse_table_ir(code, {}, {})
        ttyp = hl.ttable._from_java(jir.typ())
        result = {'global': str(ttyp.global_type),
                  'row': str(ttyp.row_type),
                  'row_key': ttyp.row_key}
        info(f'result: {result}')
        return flask.jsonify(result)
    except FatalError as e:
        return flask.jsonify({'message': e.args[0]}), 400
def blockmatrix_type():
    code = flask.request.json
    info(f'blockmatrix type: {code}')
    try:
        jir = Env.hail().expr.ir.IRParser.parse_blockmatrix_ir(code, {}, {})
        bmtyp = hl.tblockmatrix._from_java(jir.typ())
        result = {'element_type': str(bmtyp.element_type),
                  'shape': bmtyp.shape,
                  'is_row_vector': bmtyp.is_row_vector,
                  'block_size': bmtyp.block_size}
        info(f'result: {result}')
        return flask.jsonify(result)
    except FatalError as e:
        return flask.jsonify({'message': e.args[0]}), 400
def matrix_type():
    code = flask.request.json
    info(f'matrix type: {code}')
    try:
        jir = Env.hail().expr.ir.IRParser.parse_matrix_ir(code, {}, {})
        mtyp = hl.tmatrix._from_java(jir.typ())
        result = {'global': str(mtyp.global_type),
                  'col': str(mtyp.col_type),
                  'col_key': mtyp.col_key,
                  'row': str(mtyp.row_type),
                  'row_key': mtyp.row_key,
                  'entry': str(mtyp.entry_type)}
        info(f'result: {result}')
        return flask.jsonify(result)
    except FatalError as e:
        return flask.jsonify({'message': e.args[0]}), 400
def execute():
    code = flask.request.json
    info(f'execute: {code}')
    jir = Env.hail().expr.Parser.parse_value_ir(code, {}, {})
    typ = hl.HailType._from_java(jir.typ())
    value = Env.hail().expr.ir.Interpret.interpretPyIR(code, {}, {})
    result = {'type': str(typ),
              'value': value}
    info(f'result: {result}')
    return flask.jsonify(result)
def execute():
    code = flask.request.json
    info(f'execute: {code}')
    jir = Env.hail().expr.ir.IRParser.parse_value_ir(code, {}, {})
    typ = hl.HailType._from_java(jir.typ())
    value = Env.hail().expr.ir.Interpret.interpretPyIR(code, {}, {})
    result = {'type': str(typ),
              'value': value}
    info(f'result: {result}')
    return flask.jsonify(result)
def run(self):
    flagname = 'no_ir_logging'
    prev_flag_value = hl._get_flags(flagname).get(flagname)
    hl._set_flags(**{flagname: '1'})

    vds_samples = sum(vds.n_samples for vdses in self.vdses.values() for vds in vdses)
    info('Running VDS combiner:\n'
         f' VDS arguments: {self._num_vdses} datasets with {vds_samples} samples\n'
         f' GVCF arguments: {len(self.gvcfs)} inputs/samples\n'
         f' Branch factor: {self.branch_factor}\n'
         f' GVCF merge batch size: {self.gvcf_batch_size}')
    while not self.finished:
        self.save()
        self.step()
    self.save()
    info('Finished VDS combiner!')

    hl._set_flags(**{flagname: prev_flag_value})
def _reduced_svd(A: TallSkinnyMatrix, k=10, compute_U=False, iterations=2, iteration_size=None):
    # Set Parameters
    q = iterations
    if iteration_size is None:
        L = k + 2
    else:
        L = iteration_size
    assert ((q + 1) * L >= k)
    n = A.ncols

    # Generate random matrix G
    G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1))
    G = hl.nd.qr(G)[0]._persist()

    fact = _krylov_factorization(A, G, q, compute_U)
    info("_reduced_svd: Computing local SVD")
    return fact.reduced_svd(k)
def _make_tsm_from_call(call_expr, block_size, mean_center=False, hwe_normalize=False):
    mt = matrix_table_source('_make_tsm/entry_expr', call_expr)
    mt = mt.select_entries(__gt=call_expr.n_alt_alleles())
    if mean_center or hwe_normalize:
        mt = mt.annotate_rows(__AC=agg.sum(mt.__gt),
                              __n_called=agg.count_where(hl.is_defined(mt.__gt)))
        mt = mt.filter_rows((mt.__AC > 0) & (mt.__AC < 2 * mt.__n_called))

        n_variants = mt.count_rows()
        if n_variants == 0:
            raise FatalError("_make_tsm: found 0 variants after filtering out monomorphic sites.")
        info(f"_make_tsm: found {n_variants} variants after filtering out monomorphic sites.")

        mt = mt.annotate_rows(__mean_gt=mt.__AC / mt.__n_called)
        mt = mt.unfilter_entries()

        mt = mt.select_entries(__x=hl.or_else(mt.__gt - mt.__mean_gt, 0.0))

        if hwe_normalize:
            mt = mt.annotate_rows(__hwe_scaled_std_dev=hl.sqrt(mt.__mean_gt * (2 - mt.__mean_gt) * n_variants / 2))
            mt = mt.select_entries(__x=mt.__x / mt.__hwe_scaled_std_dev)
    else:
        mt = mt.select_entries(__x=mt.__gt)

    A, ht = mt_to_table_of_ndarray(mt.__x, block_size, return_checkpointed_table_also=True)
    A = A.persist()
    return TallSkinnyMatrix(A, A.ndarray, ht, list(mt.col_key))
def copy_log(path: str) -> None:
    """Attempt to copy the session log to a hadoop-API-compatible location.

    Examples
    --------
    Specify a manual path:

    >>> hl.copy_log('gs://my-bucket/analysis-10-jan19.log')  # doctest: +SKIP
    INFO: copying log to 'gs://my-bucket/analysis-10-jan19.log'...

    Copy to a directory:

    >>> hl.copy_log('gs://my-bucket/')  # doctest: +SKIP
    INFO: copying log to 'gs://my-bucket/hail-20180924-2018-devel-46e5fad57524.log'...

    Notes
    -----
    Since Hail cannot currently log directly to distributed file systems, this
    function is provided as a utility for offloading logs from ephemeral nodes.

    If `path` is a directory, then the log file will be copied using its
    base name to the directory (e.g. ``/home/hail.log`` would be copied as
    ``gs://my-bucket/hail.log`` if `path` is ``gs://my-bucket``).

    Parameters
    ----------
    path: :obj:`str`
    """
    log = Env.hc()._log
    try:
        if hadoop_is_dir(path):
            _, tail = os.path.split(log)
            path = os.path.join(path, tail)
        info(f"copying log to {repr(path)}...")
        hadoop_copy(local_path_uri(Env.hc()._log), path)
    except Exception as e:
        sys.stderr.write(f'Could not copy log: encountered error:\n {e}')
def hailBlanczos(A, G, k, q):
    h_list = []
    G_i = hl.nd.qr(G)[0]

    for j in range(0, q):
        info(f"blanczos_pca: Beginning iteration {j + 1}/{q + 1}")
        temp = A.annotate(H_i=A.ndarray @ G_i)
        temp = temp.annotate(G_i_intermediate=temp.ndarray.T @ temp.H_i)
        result = temp.aggregate(hl.struct(Hi_chunks=hl.agg.collect(temp.H_i),
                                          G_i=hl.agg.ndarray_sum(temp.G_i_intermediate)),
                                _localize=False)._persist()
        localized_H_i = hl.nd.vstack(result.Hi_chunks)
        h_list.append(localized_H_i)
        G_i = hl.nd.qr(result.G_i)[0]

    info(f"blanczos_pca: Beginning iteration {q + 1}/{q + 1}")
    temp = A.annotate(H_i=A.ndarray @ G_i)
    result = temp.aggregate(hl.agg.collect(temp.H_i), _localize=False)._persist()
    info("blanczos_pca: Iterations complete. Computing local QR")
    localized_H_i = hl.nd.vstack(result)
    h_list.append(localized_H_i)

    H = hl.nd.hstack(h_list)
    Q = hl.nd.qr(H)[0]._persist()
    A = A.annotate(part_size=A.ndarray.shape[0])
    A = A.annotate(rows_preceeding=hl.int32(hl.scan.sum(A.part_size)))
    A = A.annotate_globals(Qt=Q.T)
    T = A.annotate(ndarray=A.Qt[:, A.rows_preceeding:A.rows_preceeding + A.part_size] @ A.ndarray)
    arr_T = T.aggregate(hl.agg.ndarray_sum(T.ndarray), _localize=False)

    info("blanczos_pca: QR Complete. Computing local SVD")
    U, S, W = hl.nd.svd(arr_T, full_matrices=False)._persist()

    V = Q @ U
    truncV = V[:, :k]
    truncS = S[:k]
    truncW = W[:k, :]

    return truncV, truncS, truncW
def multi_way_union_mts(mts: list, tmp_dir: str, chunk_size: int) -> hl.MatrixTable:
    """Joins MatrixTables in the provided list.

    :param list mts: list of MatrixTables to join together
    :param str tmp_dir: path to temporary directory for intermediate results
    :param int chunk_size: number of MatrixTables to join per chunk
    :return: joined MatrixTable
    :rtype: MatrixTable
    """
    staging = [mt.localize_entries("__entries", "__cols") for mt in mts]
    stage = 0
    while len(staging) > 1:
        n_jobs = int(math.ceil(len(staging) / chunk_size))
        info(f"multi_way_union_mts: stage {stage}: {n_jobs} total jobs")
        next_stage = []
        for i in range(n_jobs):
            to_merge = staging[chunk_size * i:chunk_size * (i + 1)]
            info(f"multi_way_union_mts: stage {stage} / job {i}: merging {len(to_merge)} inputs")
            merged = hl.Table.multi_way_zip_join(to_merge, "__entries", "__cols")
            merged = merged.annotate(__entries=hl.flatten(
                hl.range(hl.len(merged.__entries)).map(lambda i: hl.coalesce(
                    merged.__entries[i].__entries,
                    hl.range(hl.len(merged.__cols[i].__cols)).map(
                        lambda j: hl.null(merged.__entries.__entries.dtype.element_type.element_type))))))
            merged = merged.annotate_globals(__cols=hl.flatten(merged.__cols.map(lambda x: x.__cols)))
            next_stage.append(
                merged.checkpoint(os.path.join(tmp_dir, f"stage_{stage}_job_{i}.ht"), overwrite=True))
        info(f"done stage {stage}")
        stage += 1
        staging.clear()
        staging.extend(next_stage)

    return (staging[0]
            ._unlocalize_entries("__entries", "__cols", list(mts[0].col_key))
            .unfilter_entries())
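# Hedged usage sketch for the helper above; the paths and chunk size are placeholders,
# not real datasets or recommended settings.
import hail as hl

demo_paths = ['chunk1.mt', 'chunk2.mt', 'chunk3.mt']               # placeholder paths
demo_mts = [hl.read_matrix_table(p) for p in demo_paths]
demo_joined = multi_way_union_mts(demo_mts, tmp_dir='/tmp/mw_union', chunk_size=2)
demo_joined.write('combined.mt', overwrite=True)                   # placeholder output path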
def from_random_effects(cls, y, x, z, p_path=None, overwrite=False, max_condition_number=1e-10, complexity_bound=8192): r"""Initializes a model from :math:`y`, :math:`X`, and :math:`Z`. Examples -------- >>> from hail.stats import LinearMixedModel >>> y = np.array([0.0, 1.0, 8.0, 9.0]) >>> x = np.array([[1.0, 0.0], ... [1.0, 2.0], ... [1.0, 1.0], ... [1.0, 4.0]]) >>> z = np.array([[0.0, 0.0, 1.0], ... [0.0, 1.0, 2.0], ... [1.0, 2.0, 4.0], ... [2.0, 4.0, 8.0]]) >>> model, p = LinearMixedModel.from_random_effects(y, x, z) >>> model.fit() >>> model.h_sq 0.38205307244271675 Notes ----- If :math:`n \leq m`, the returned model is full rank. If :math:`n > m`, the returned model is low rank. In this case only, eigenvalues less than or equal to `max_condition_number` times the top eigenvalue are dropped from :math:`S`, with the corresponding eigenvectors dropped from :math:`P`. This guards against precision loss on left eigenvectors computed via the right gramian :math:`Z^T Z` in :meth:`BlockMatrix.svd`. In either case, one can truncate to a rank :math:`r` model as follows. If `p` is an ndarray: >>> p_r = p[:r, :] # doctest: +SKIP >>> s_r = model.s[:r] # doctest: +SKIP >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x) # doctest: +SKIP If `p` is a block matrix: >>> p[:r, :].write(p_r_path) # doctest: +SKIP >>> p_r = BlockMatrix.read(p_r_path) # doctest: +SKIP >>> s_r = model.s[:r] # doctest: +SKIP >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x, p_r_path) # doctest: +SKIP This method applies no standardization to `z`. Warning ------- If `z` is a block matrix, then ideally `z` should be the result of directly reading from disk (and possibly a transpose). This is most critical if :math:`n > m`, because in this case multiplication by `z` will result in all preceding transformations being repeated ``n / block_size`` times, as explained in :class:`.BlockMatrix`. At least one dimension must be less than or equal to 46300. See the warning in :meth:`.BlockMatrix.svd` for performance considerations. Parameters ---------- y: :class:`ndarray` :math:`n` vector of observations :math:`y`. x: :class:`ndarray` :math:`n \times p` matrix of fixed effects :math:`X`. z: :class:`ndarray` or :class:`BlockMatrix` :math:`n \times m` matrix of random effects :math:`Z`. p_path: :obj:`str`, optional Path at which to write :math:`P` as a block matrix. Required if `z` is a block matrix. overwrite: :obj:`bool` If ``True``, overwrite an existing file at `p_path`. max_condition_number: :obj:`float` Maximum condition number. Must be greater than 1e-16. complexity_bound: :obj:`int` Complexity bound for :meth:`.BlockMatrix.svd` when `z` is a block matrix. Returns ------- model: :class:`LinearMixedModel` Model constructed from :math:`y`, :math:`X`, and :math:`Z`. p: :class:`ndarray` or :class:`.BlockMatrix` Matrix :math:`P` whose rows are the eigenvectors of :math:`K`. The type is block matrix if `z` is a block matrix and :meth:`.BlockMatrix.svd` of `z` returns :math:`U` as a block matrix. 
""" z_is_bm = isinstance(z, BlockMatrix) if z_is_bm and p_path is None: raise ValueError("from_random_effects: 'p_path' required when 'z'" "is a block matrix.") if max_condition_number < 1e-16: raise ValueError("from_random_effects: 'max_condition_number' must " f"be at least 1e-16, found {max_condition_number}") _check_dims(y, "y", 1) _check_dims(x, "x", 2) _check_dims(z, "z", 2) n, m = z.shape if y.shape[0] != n: raise ValueError("from_random_effects: 'y' and 'z' must have the " "same number of rows") if x.shape[0] != n: raise ValueError("from_random_effects: 'x' and 'z' must have the " "same number of rows") if z_is_bm: u, s0, _ = z.svd(complexity_bound=complexity_bound) p = u.T p_is_bm = isinstance(p, BlockMatrix) else: u, s0, _ = hl.linalg._svd(z, full_matrices=False) p = u.T p_is_bm = False s = s0 ** 2 low_rank = n > m if low_rank: assert np.all(np.isfinite(s)) r = np.searchsorted(-s, -max_condition_number * s[0]) if r < m: info(f'from_random_effects: model rank reduced from {m} to {r} ' f'due to ill-condition.' f'\n Largest dropped eigenvalue was {s[r]}.') s = s[:r] p = p[:r, :] if p_path is not None: if p_is_bm: p.write(p_path, overwrite=overwrite) p = BlockMatrix.read(p_path) else: BlockMatrix.from_numpy(p).write(p_path, overwrite=overwrite) if p_is_bm: py, px = (p @ y).to_numpy(), (p @ x).to_numpy() else: py, px = p @ y, p @ x if low_rank: model = LinearMixedModel(py, px, s, y, x, p_path) else: model = LinearMixedModel(py, px, s, p_path=p_path) return model, p
def concordance( left, right, *, _localize_global_statistics=True ) -> Tuple[List[List[int]], Table, Table]: """Calculate call concordance with another dataset. .. include:: ../_templates/req_tstring.rst .. include:: ../_templates/req_tvariant.rst .. include:: ../_templates/req_biallelic.rst .. include:: ../_templates/req_unphased_diploid_gt.rst Examples -------- Compute concordance between two datasets and output the global concordance statistics and two tables with concordance computed per column key and per row key: >>> global_conc, cols_conc, rows_conc = hl.concordance(dataset, dataset2) Notes ----- This method computes the genotype call concordance (from the entry field **GT**) between two biallelic variant datasets. It requires unique sample IDs and performs an inner join on samples (only samples in both datasets will be considered). In addition, all genotype calls must be **diploid** and **unphased**. It performs an ordered zip join of the variants. That means the variants of each dataset are sorted, with duplicate variants appearing in some random relative order, and then zipped together. When a variant appears a different number of times between the two datasets, the dataset with the fewer number of instances is padded with "no data". For example, if a variant is only in one dataset, then each genotype is treated as "no data" in the other. This method returns a tuple of three objects: a nested list of list of int with global concordance summary statistics, a table with concordance statistics per column key, and a table with concordance statistics per row key. **Using the global summary result** The global summary is a list of list of int (conceptually a 5 by 5 matrix), where the indices have special meaning: 0. No Data (missing variant) 1. No Call (missing genotype call) 2. Hom Ref 3. Heterozygous 4. Hom Var The first index is the state in the left dataset and the second index is the state in the right dataset. Typical uses of the summary list are shown below. >>> summary, samples, variants = hl.concordance(dataset, dataset2) >>> left_homref_right_homvar = summary[2][4] >>> left_het_right_missing = summary[3][1] >>> left_het_right_something_else = sum(summary[3][:]) - summary[3][3] >>> total_concordant = summary[2][2] + summary[3][3] + summary[4][4] >>> total_discordant = sum([sum(s[2:]) for s in summary[2:]]) - total_concordant **Using the table results** Table 1: Concordance statistics by column This table contains the column key field of `left`, and the following fields: - `n_discordant` (:py:data:`.tint64`) -- Count of discordant calls (see below for full definition). - `concordance` (:class:`.tarray` of :class:`.tarray` of :py:data:`.tint64`) -- Array of concordance per state on left and right, matching the structure of the global summary defined above. Table 2: Concordance statistics by row This table contains the row key fields of `left`, and the following fields: - `n_discordant` (:py:data:`.tfloat64`) -- Count of discordant calls (see below for full definition). - `concordance` (:class:`.tarray` of :class:`.tarray` of :py:data:`.tint64`) -- Array of concordance per state on left and right, matching the structure of the global summary defined above. In these tables, the column **n_discordant** is provided as a convenience, because this is often one of the most useful concordance statistics. This value is the number of genotypes which were called (homozygous reference, heterozygous, or homozygous variant) in both datasets, but where the call did not match between the two. 
The column `concordance` matches the structure of the global summmary, which is detailed above. Once again, the first index into this array is the state on the left, and the second index is the state on the right. For example, ``concordance[1][4]`` is the number of "no call" genotypes on the left that were called homozygous variant on the right. Parameters ---------- left : :class:`.MatrixTable` First dataset to compare. right : :class:`.MatrixTable` Second dataset to compare. Returns ------- (list of list of int, :class:`.Table`, :class:`.Table`) The global concordance statistics, a table with concordance statistics per column key, and a table with concordance statistics per row key. """ require_col_key_str(left, 'concordance, left') require_col_key_str(right, 'concordance, right') left_sample_counter = left.aggregate_cols(hl.agg.counter(left.col_key[0])) right_sample_counter = right.aggregate_cols( hl.agg.counter(right.col_key[0])) left_bad = [f'{k!r}: {v}' for k, v in left_sample_counter.items() if v > 1] right_bad = [ f'{k!r}: {v}' for k, v in right_sample_counter.items() if v > 1 ] if left_bad or right_bad: raise ValueError(f"Found duplicate sample IDs:\n" f" left: {', '.join(left_bad)}\n" f" right: {', '.join(right_bad)}") included = set(left_sample_counter.keys()).intersection( set(right_sample_counter.keys())) info( f"concordance: including {len(included)} shared samples " f"({len(left_sample_counter)} total on left, {len(right_sample_counter)} total on right)" ) left = require_biallelic(left, 'concordance, left') right = require_biallelic(right, 'concordance, right') lit = hl.literal(included, dtype=hl.tset(hl.tstr)) left = left.filter_cols(lit.contains(left.col_key[0])) right = right.filter_cols(lit.contains(right.col_key[0])) left = left.select_entries('GT').select_rows().select_cols() right = right.select_entries('GT').select_rows().select_cols() joined = hl.experimental.full_outer_join_mt(left, right) def get_idx(struct): return hl.cond(hl.is_missing(struct), 0, hl.coalesce(2 + struct.GT.n_alt_alleles(), 1)) aggr = hl.agg.counter( get_idx(joined.left_entry) + 5 * get_idx(joined.right_entry)) def concordance_array(counter): return hl.range(0, 5).map( lambda i: hl.range(0, 5).map(lambda j: counter.get(i + 5 * j, 0))) def n_discordant(counter): return hl.sum( hl.array(counter).filter(lambda tup: ~hl.literal( {i**2 for i in range(5)}).contains(tup[0])).map(lambda tup: tup[1])) glob = joined.aggregate_entries(concordance_array(aggr), _localize=_localize_global_statistics) if _localize_global_statistics: total_conc = [x[1:] for x in glob[1:]] on_diag = sum(total_conc[i][i] for i in range(len(total_conc))) total_obs = sum(sum(x) for x in total_conc) info(f"concordance: total concordance {on_diag/total_obs * 100:.2f}%") per_variant = joined.annotate_rows(concordance=aggr) per_variant = per_variant.annotate_rows( concordance=concordance_array(per_variant.concordance), n_discordant=n_discordant(per_variant.concordance)) per_sample = joined.annotate_cols(concordance=aggr) per_sample = per_sample.annotate_cols( concordance=concordance_array(per_sample.concordance), n_discordant=n_discordant(per_sample.concordance)) return glob, per_sample.cols(), per_variant.rows()
def _blanczos_pca(entry_expr, k=10, compute_loadings=False, q_iterations=2, oversampling_param=2, block_size=128): r"""Run randomized principal component analysis approximation (PCA) on numeric columns derived from a matrix table. Implements the Blanczos algorithm found by Rokhlin, Szlam, and Tygert. Examples -------- For a matrix table with variant rows, sample columns, and genotype entries, compute the top 2 PC sample scores and eigenvalues of the matrix of 0s and 1s encoding missingness of genotype calls. >>> eigenvalues, scores, _ = hl._blanczos_pca(hl.int(hl.is_defined(dataset.GT)), ... k=2) Warning ------- This method does **not** automatically mean-center or normalize each column. If desired, such transformations should be incorporated in `entry_expr`. Hail will return an error if `entry_expr` evaluates to missing, nan, or infinity on any entry. Notes ----- PCA is run on the columns of the numeric matrix obtained by evaluating `entry_expr` on each entry of the matrix table, or equivalently on the rows of the **transposed** numeric matrix :math:`M` referenced below. PCA computes the SVD .. math:: M = USV^T where columns of :math:`U` are left singular vectors (orthonormal in :math:`\mathbb{R}^n`), columns of :math:`V` are right singular vectors (orthonormal in :math:`\mathbb{R}^m`), and :math:`S=\mathrm{diag}(s_1, s_2, \ldots)` with ordered singular values :math:`s_1 \ge s_2 \ge \cdots \ge 0`. Typically one computes only the first :math:`k` singular vectors and values, yielding the best rank :math:`k` approximation :math:`U_k S_k V_k^T` of :math:`M`; the truncations :math:`U_k`, :math:`S_k` and :math:`V_k` are :math:`n \times k`, :math:`k \times k` and :math:`m \times k` respectively. From the perspective of the rows of :math:`M` as samples (data points), :math:`V_k` contains the loadings for the first :math:`k` PCs while :math:`MV_k = U_k S_k` contains the first :math:`k` PC scores of each sample. The loadings represent a new basis of features while the scores represent the projected data on those features. The eigenvalues of the Gramian :math:`MM^T` are the squares of the singular values :math:`s_1^2, s_2^2, \ldots`, which represent the variances carried by the respective PCs. By default, Hail only computes the loadings if the ``loadings`` parameter is specified. Scores are stored in a :class:`.Table` with the column key of the matrix table as key and a field `scores` of type ``array<float64>`` containing the principal component scores. Loadings are stored in a :class:`.Table` with the row key of the matrix table as key and a field `loadings` of type ``array<float64>`` containing the principal component loadings. The eigenvalues are returned in descending order, with scores and loadings given the corresponding array order. Parameters ---------- entry_expr : :class:`.Expression` Numeric expression for matrix entries. k : :obj:`int` Number of principal components. compute_loadings : :obj:`bool` If ``True``, compute row loadings. q_iterations : :obj:`int` Number of rounds of power iteration to amplify singular values. oversampling_param : :obj:`int` Amount of oversampling to use when approximating the singular values. Usually a value between `0 <= oversampling_param <= k`. Returns ------- (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`) List of eigenvalues, table with column scores, table with row loadings. 
""" check_entry_indexed('mt_to_table_of_ndarray/entry_expr', entry_expr) mt = matrix_table_source('pca/entry_expr', entry_expr) A, ht = mt_to_table_of_ndarray(entry_expr, block_size, return_checkpointed_table_also=True) A = A.persist() # Set Parameters q = q_iterations L = k + oversampling_param n = A.take(1)[0].ndarray.shape[1] # Generate random matrix G G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1)) def hailBlanczos(A, G, k, q): h_list = [] G_i = hl.nd.qr(G)[0] for j in range(0, q): info(f"blanczos_pca: Beginning iteration {j + 1}/{q+1}") temp = A.annotate(H_i=A.ndarray @ G_i) temp = temp.annotate(G_i_intermediate=temp.ndarray.T @ temp.H_i) result = temp.aggregate(hl.struct( Hi_chunks=hl.agg.collect(temp.H_i), G_i=hl.agg.ndarray_sum(temp.G_i_intermediate)), _localize=False)._persist() localized_H_i = hl.nd.vstack(result.Hi_chunks) h_list.append(localized_H_i) G_i = hl.nd.qr(result.G_i)[0] info(f"blanczos_pca: Beginning iteration {q+ 1}/{q+1}") temp = A.annotate(H_i=A.ndarray @ G_i) result = temp.aggregate(hl.agg.collect(temp.H_i), _localize=False)._persist() info("blanczos_pca: Iterations complete. Computing local QR") localized_H_i = hl.nd.vstack(result) h_list.append(localized_H_i) H = hl.nd.hstack(h_list) Q = hl.nd.qr(H)[0]._persist() A = A.annotate(part_size=A.ndarray.shape[0]) A = A.annotate(rows_preceeding=hl.int32(hl.scan.sum(A.part_size))) A = A.annotate_globals(Qt=Q.T) T = A.annotate(ndarray=A.Qt[:, A.rows_preceeding:A.rows_preceeding + A.part_size] @ A.ndarray) arr_T = T.aggregate(hl.agg.ndarray_sum(T.ndarray), _localize=False) info("blanczos_pca: QR Complete. Computing local SVD") U, S, W = hl.nd.svd(arr_T, full_matrices=False)._persist() V = Q @ U truncV = V[:, :k] truncS = S[:k] truncW = W[:k, :] return truncV, truncS, truncW U, S, V = hailBlanczos(A, G, k, q) scores = V.transpose() * S eigens = hl.eval(S * S) info("blanczos_pca: SVD Complete. Computing conversion to PCs.") hail_array_scores = scores._data_array() cols_and_scores = hl.zip( A.index_globals().cols, hail_array_scores).map(lambda tup: tup[0].annotate(scores=tup[1])) st = hl.Table.parallelize(cols_and_scores, key=list(mt.col_key)) lt = ht.select() lt = lt.annotate_globals(U=U) idx_name = '_tmp_pca_loading_index' lt = lt.add_index(idx_name) lt = lt.annotate( loadings=lt.U[lt[idx_name], :]._data_array()).select_globals() lt = lt.drop(lt[idx_name]) if compute_loadings: return eigens, st, lt else: return eigens, st, None
def from_merged_representation(mt, *, ref_block_fields=(), infer_ref_block_fields: bool = True):
    """Create a VariantDataset from a sparse MatrixTable containing variant and reference data."""

    if 'END' not in mt.entry:
        raise ValueError("VariantDataset.from_merged_representation: expect field 'END' in matrix table entry")

    if 'LA' not in mt.entry:
        raise ValueError("VariantDataset.from_merged_representation: expect field 'LA' in matrix table entry")

    if 'GT' not in mt.entry and 'LGT' not in mt.entry:
        raise ValueError("VariantDataset.from_merged_representation: expect field 'LGT' or 'GT' in matrix table entry")

    n_rows_to_use = 100
    info(f"inferring reference block fields from missingness patterns in first {n_rows_to_use} rows")
    used_ref_block_fields = set(ref_block_fields)
    used_ref_block_fields.add('END')

    if infer_ref_block_fields:
        mt_head = mt.head(n_rows=n_rows_to_use)
        for k, any_present in zip(list(mt_head.entry),
                                  mt_head.aggregate_entries(
                                      hl.agg.filter(hl.is_defined(mt_head.END),
                                                    tuple(hl.agg.any(hl.is_defined(mt_head[x]))
                                                          for x in mt_head.entry)))):
            if any_present:
                used_ref_block_fields.add(k)

    gt_field = 'LGT' if 'LGT' in mt.entry else 'GT'

    # remove LGT/GT and LA fields, which are trivial for reference blocks and do not need to be represented
    if gt_field in used_ref_block_fields:
        used_ref_block_fields.remove(gt_field)
    if 'LA' in used_ref_block_fields:
        used_ref_block_fields.remove('LA')

    info("Including the following fields in reference block table:"
         + "".join(f"\n  {k!r}" for k in mt.entry if k in used_ref_block_fields))

    rmt = mt.filter_entries(hl.case()
                            .when(hl.is_missing(mt.END), False)
                            .when(hl.is_defined(mt.END) & mt[gt_field].is_hom_ref(), True)
                            .or_error(hl.str('cannot create VDS from merged representation -'
                                             ' found END field with non-reference genotype at ')
                                      + hl.str(mt.locus) + hl.str(' / ') + hl.str(mt.col_key[0])))
    rmt = rmt.select_entries(*(x for x in rmt.entry if x in used_ref_block_fields))
    rmt = rmt.filter_rows(hl.agg.count() > 0)

    # drop other alleles
    rmt = rmt.key_rows_by(rmt.locus)
    rmt = rmt.select_rows(ref_allele=rmt.alleles[0][0])

    vmt = mt.filter_entries(hl.is_missing(mt.END))
    vmt = vmt.filter_rows(hl.agg.count() > 0)

    return VariantDataset(rmt, vmt)
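# Hedged usage sketch: assuming the function above is exposed as
# hl.vds.VariantDataset.from_merged_representation, a sparse (GVCF-merged)
# MatrixTable with END/LA/LGT entry fields can be split into a VDS.
# The paths below are placeholders, not real datasets.
import hail as hl

demo_sparse_mt = hl.read_matrix_table('gs://my-bucket/sparse.mt')            # placeholder path
demo_vds = hl.vds.VariantDataset.from_merged_representation(demo_sparse_mt)
demo_vds.write('gs://my-bucket/dataset.vds')                                 # placeholder path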
def run_combiner(sample_paths: List[str], out_file: str, tmp_path: str, intervals: Optional[List[hl.utils.Interval]] = None, header: Optional[str] = None, sample_names: Optional[List[str]] = None, branch_factor: int = CombinerConfig.default_branch_factor, batch_size: int = CombinerConfig.default_batch_size, target_records: int = CombinerConfig.default_target_records, overwrite: bool = False, reference_genome: str = 'default', contig_recoding: Optional[Dict[str, str]] = None, key_by_locus_and_alleles: bool = False): """Run the Hail VCF combiner, performing a hierarchical merge to create a combined sparse matrix table. Parameters ---------- sample_paths : :obj:`list` of :obj:`str` Paths to individual GVCFs. out_file : :obj:`str` Path to final combined matrix table. tmp_path : :obj:`str` Path for intermediate output. intervals : list of :class:`.Interval` or None Partitioning with which to import GVCFs in first phase of combiner. header : :obj:`str` or None External header file to use as GVCF header for all inputs. If defined, `sample_names` must be defined as well. sample_names: list of :obj:`str` or None Sample names, to be used with `header`. branch_factor : :obj:`int` Combiner branch factor. batch_size : :obj:`int` Combiner batch size. target_records : :obj:`int` Target records per partition in each combiner phase after the first. overwrite : :obj:`bool` Overwrite output file, if it exists. reference_genome : :obj:`str` Reference genome for GVCF import. contig_recoding: :obj:`dict` of (:obj:`str`, :obj:`str`), optional Mapping from contig name in gVCFs to contig name the reference genome. All contigs must be present in the `reference_genome`, so this is useful for mapping differently-formatted data onto known references. key_by_locus_and_alleles : :obj:`bool` Key by both locus and alleles in the final output. Returns ------- None """ tmp_path += f'/combiner-temporary/{uuid.uuid4()}/' if header is not None: assert sample_names is not None assert len(sample_names) == len(sample_paths) # FIXME: this should be hl.default_reference().even_intervals_contig_boundary intervals = intervals or default_exome_intervals(reference_genome) config = CombinerConfig(branch_factor=branch_factor, batch_size=batch_size, target_records=target_records) plan = config.plan(len(sample_paths)) files_to_merge = sample_paths n_phases = len(plan.phases) total_ops = len(files_to_merge) * n_phases total_work_done = 0 for phase_i, phase in enumerate(plan.phases): phase_i += 1 # used for info messages, 1-indexed for readability n_jobs = len(phase.jobs) merge_str = 'input GVCFs' if phase_i == 1 else 'intermediate sparse matrix tables' job_str = hl.utils.misc.plural('job', n_jobs) info( f"Starting phase {phase_i}/{n_phases}, merging {len(files_to_merge)} {merge_str} in {n_jobs} {job_str}." ) if phase_i > 1: intervals = calculate_new_intervals( hl.read_matrix_table(files_to_merge[0]).rows(), config.target_records, reference_genome=reference_genome) new_files_to_merge = [] for job_i, job in enumerate(phase.jobs): job_i += 1 # used for info messages, 1-indexed for readability n_merges = len(job.merges) merge_str = hl.utils.misc.plural('file', n_merges) pct_total = 100 * job.input_total_size / total_ops info( f"Starting phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)} to create {n_merges} merged {merge_str}, corresponding to ~{pct_total:.1f}% of total I/O." 
) merge_mts: List[MatrixTable] = [] for merge in job.merges: inputs = [files_to_merge[i] for i in merge.inputs] if phase_i == 1: mts = [ transform_gvcf(vcf) for vcf in hl.import_gvcfs( inputs, intervals, array_elements_required=False, _external_header=header, _external_sample_ids=[ sample_names[i] for i in merge.inputs ] if header is not None else None, reference_genome=reference_genome, contig_recoding=contig_recoding) ] else: mts = [ hl.read_matrix_table(path, _intervals=intervals) for path in inputs ] merge_mts.append(combine_gvcfs(mts)) if phase_i == n_phases: # final merge! assert n_jobs == 1 assert len(merge_mts) == 1 [final_mt] = merge_mts if key_by_locus_and_alleles: final_mt = MatrixTable( MatrixKeyRowsBy(final_mt._mir, ['locus', 'alleles'], is_sorted=True)) final_mt.write(out_file, overwrite=overwrite) new_files_to_merge = [out_file] info( f"Finished phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, 100% of total I/O finished." ) break tmp = f'{tmp_path}_phase{phase_i}_job{job_i}/' hl.experimental.write_matrix_tables(merge_mts, tmp, overwrite=True) pad = len(str(len(merge_mts))) new_files_to_merge.extend(tmp + str(n).zfill(pad) + '.mt' for n in range(len(merge_mts))) total_work_done += job.input_total_size info( f"Finished {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, {100 * total_work_done / total_ops:.1f}% of total I/O finished." ) info(f"Finished phase {phase_i}/{n_phases}.") files_to_merge = new_files_to_merge assert files_to_merge == [out_file] info("Finished!")
def get_gene_intervals(gene_symbols=None, gene_ids=None, transcript_ids=None,
                       verbose=True, reference_genome=None, gtf_file=None):
    """Get intervals of genes or transcripts.

    Get the boundaries of genes or transcripts from a GTF file, for quick filtering of a Table or MatrixTable.

    On Google Cloud platform:
    Gencode v19 (GRCh37) GTF available at: gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz
    Gencode v29 (GRCh38) GTF available at: gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz

    Example
    -------
    >>> hl.filter_intervals(ht, get_gene_intervals(gene_symbols=['PCSK9'], reference_genome='GRCh37'))  # doctest: +SKIP

    Parameters
    ----------
    gene_symbols : :obj:`list` of :obj:`str`, optional
        Gene symbols (e.g. PCSK9).
    gene_ids : :obj:`list` of :obj:`str`, optional
        Gene IDs (e.g. ENSG00000223972).
    transcript_ids : :obj:`list` of :obj:`str`, optional
        Transcript IDs (e.g. ENST00000456328).
    verbose : :obj:`bool`
        If ``True``, print which genes and transcripts were matched in the GTF file.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use (passed along to import_gtf).
    gtf_file : :obj:`str`
        GTF file to load. If none is provided, but `reference_genome` is one of
        `GRCh37` or `GRCh38`, a default will be used (on Google Cloud Platform).

    Returns
    -------
    :obj:`list` of :class:`.Interval`
    """
    GTFS = {
        'GRCh37': 'gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz',
        'GRCh38': 'gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz',
    }

    if reference_genome is None:
        reference_genome = hl.default_reference().name
    if gtf_file is None:
        gtf_file = GTFS.get(reference_genome)
        if gtf_file is None:
            raise ValueError('get_gene_intervals requires a GTF file, or the reference genome to be '
                             'one of GRCh37 or GRCh38 (when on Google Cloud Platform)')
    if gene_symbols is None and gene_ids is None and transcript_ids is None:
        raise ValueError('get_gene_intervals requires at least one of gene_symbols, gene_ids, or transcript_ids')

    ht = hl.experimental.import_gtf(gtf_file, reference_genome=reference_genome,
                                    skip_invalid_contigs=True, min_partitions=12)
    ht = ht.annotate(gene_id=ht.gene_id.split('\\.')[0],
                     transcript_id=ht.transcript_id.split('\\.')[0])

    criteria = []
    if gene_symbols:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_name == y), gene_symbols))
    if gene_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_id == y.split('\\.')[0]), gene_ids))
    if transcript_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'transcript') & (ht.transcript_id == y.split('\\.')[0]), transcript_ids))

    ht = ht.filter(functools.reduce(operator.ior, criteria))
    gene_info = ht.aggregate(hl.agg.collect((ht.feature, ht.gene_name, ht.gene_id, ht.transcript_id, ht.interval)))
    if verbose:
        info(f'get_gene_intervals found {len(gene_info)} entries:\n'
             + "\n".join(map(lambda x: f'{x[0]}: {x[1]} ({x[2] if x[0] == "gene" else x[3]})', gene_info)))
    intervals = list(map(lambda x: x[-1], gene_info))
    return intervals
def from_random_effects(cls, y, x, z, p_path=None, overwrite=False, max_condition_number=1e-10, complexity_bound=8192): r"""Initializes a model from :math:`y`, :math:`X`, and :math:`Z`. Examples -------- >>> from hail.stats import LinearMixedModel >>> y = np.array([0.0, 1.0, 8.0, 9.0]) >>> x = np.array([[1.0, 0.0], ... [1.0, 2.0], ... [1.0, 1.0], ... [1.0, 4.0]]) >>> z = np.array([[0.0, 0.0, 1.0], ... [0.0, 1.0, 2.0], ... [1.0, 2.0, 4.0], ... [2.0, 4.0, 8.0]]) >>> model, p = LinearMixedModel.from_random_effects(y, x, z) >>> model.fit() >>> model.h_sq 0.38205307244271675 Notes ----- If :math:`n \leq m`, the returned model is full rank. If :math:`n > m`, the returned model is low rank. In this case only, eigenvalues less than or equal to `max_condition_number` times the top eigenvalue are dropped from :math:`S`, with the corresponding eigenvectors dropped from :math:`P`. This guards against precision loss on left eigenvectors computed via the right gramian :math:`Z^T Z` in :meth:`BlockMatrix.svd`. In either case, one can truncate to a rank :math:`r` model as follows. If `p` is an ndarray: >>> p_r = p[:r, :] # doctest: +SKIP >>> s_r = model.s[:r] # doctest: +SKIP >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x) # doctest: +SKIP If `p` is a block matrix: >>> p[:r, :].write(p_r_path) # doctest: +SKIP >>> p_r = BlockMatrix.read(p_r_path) # doctest: +SKIP >>> s_r = model.s[:r] # doctest: +SKIP >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x, p_r_path) # doctest: +SKIP This method applies no standardization to `z`. Warning ------- If `z` is a block matrix, then ideally `z` should be the result of directly reading from disk (and possibly a transpose). This is most critical if :math:`n > m`, because in this case multiplication by `z` will result in all preceding transformations being repeated ``n / block_size`` times, as explained in :class:`.BlockMatrix`. At least one dimension must be less than or equal to 46300. See the warning in :meth:`.BlockMatrix.svd` for performance considerations. Parameters ---------- y: :class:`ndarray` :math:`n` vector of observations :math:`y`. x: :class:`ndarray` :math:`n \times p` matrix of fixed effects :math:`X`. z: :class:`ndarray` or :class:`BlockMatrix` :math:`n \times m` matrix of random effects :math:`Z`. p_path: :obj:`str`, optional Path at which to write :math:`P` as a block matrix. Required if `z` is a block matrix. overwrite: :obj:`bool` If ``True``, overwrite an existing file at `p_path`. max_condition_number: :obj:`float` Maximum condition number. Must be greater than 1e-16. complexity_bound: :obj:`int` Complexity bound for :meth:`.BlockMatrix.svd` when `z` is a block matrix. Returns ------- model: :class:`LinearMixedModel` Model constructed from :math:`y`, :math:`X`, and :math:`Z`. p: :class:`ndarray` or :class:`.BlockMatrix` Matrix :math:`P` whose rows are the eigenvectors of :math:`K`. The type is block matrix if `z` is a block matrix and :meth:`.BlockMatrix.svd` of `z` returns :math:`U` as a block matrix. 
""" z_is_bm = isinstance(z, BlockMatrix) if z_is_bm and p_path is None: raise ValueError("from_random_effects: 'p_path' required when 'z'" "is a block matrix.") if max_condition_number < 1e-16: raise ValueError("from_random_effects: 'max_condition_number' must " f"be at least 1e-16, found {max_condition_number}") _check_dims(y, "y", 1) _check_dims(x, "x", 2) _check_dims(z, "z", 2) n, m = z.shape if y.shape[0] != n: raise ValueError("from_random_effects: 'y' and 'z' must have the " "same number of rows") if x.shape[0] != n: raise ValueError("from_random_effects: 'x' and 'z' must have the " "same number of rows") if z_is_bm: u, s0, _ = z.svd(complexity_bound=complexity_bound) p = u.T p_is_bm = isinstance(p, BlockMatrix) else: u, s0, _ = hl.linalg._svd(z, full_matrices=False) p = u.T p_is_bm = False s = s0 ** 2 low_rank = n > m if low_rank: assert np.all(np.isfinite(s)) r = np.searchsorted(-s, -max_condition_number * s[0]) if r < m: info(f'from_random_effects: model rank reduced from {m} to {r} ' f'due to ill-condition.' f'\n Largest dropped eigenvalue was {s[r]}.') s = s[:r] p = p[:r, :] if p_path is not None: if p_is_bm: p.write(p_path, overwrite=overwrite) p = BlockMatrix.read(p_path) else: BlockMatrix.from_numpy(p).write(p_path, overwrite=overwrite) if p_is_bm: py, px = (p @ y.reshape(n, 1)).to_numpy().flatten(), (p @ x).to_numpy() else: py, px = p @ y, p @ x if low_rank: model = LinearMixedModel(py, px, s, y, x, p_path) else: model = LinearMixedModel(py, px, s, p_path=p_path) return model, p
def from_mixed_effects(cls, y, x, z, max_condition_number=1e-10):
    r"""Initializes a model from :math:`y`, :math:`X`, and :math:`Z`.

    Examples
    --------
    >>> from hail.stats import LinearMixedModel
    >>> y = np.array([0.0, 1.0, 8.0, 9.0])
    >>> x = np.array([[1.0, 0.0],
    ...               [1.0, 2.0],
    ...               [1.0, 1.0],
    ...               [1.0, 4.0]])
    >>> z = np.array([[0.0, 0.0, 1.0],
    ...               [0.0, 1.0, 2.0],
    ...               [1.0, 2.0, 4.0],
    ...               [2.0, 4.0, 8.0]])
    >>> model, p = LinearMixedModel.from_mixed_effects(y, x, z)
    >>> model.fit()
    >>> model.h_sq
    0.38205307244271675

    Notes
    -----
    If :math:`n \leq m`, the returned model is full rank.

    If :math:`n > m`, the returned model is low rank. In this case only,
    eigenvalues less than or equal to `max_condition_number` times the top
    eigenvalue are dropped from :math:`S`, with the corresponding eigenvectors
    dropped from :math:`P`. This guards against precision loss on left
    eigenvectors computed via the right gramian :math:`Z^T Z` in
    :meth:`BlockMatrix.svd`.

    In either case, one can truncate to a rank :math:`r` model as follows:

    >>> s_r = model.s[:r]  # doctest: +SKIP
    >>> p_r = p[:r, :]  # doctest: +SKIP
    >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x)  # doctest: +SKIP

    No standardization is applied to `z`.

    Warning
    -------
    If `z` is a block matrix, then ideally `z` should be the result of directly
    reading from disk (and possibly a transpose). This is most critical if
    :math:`n > m`, because in this case multiplication by `z` will result in
    all preceding transformations being repeated ``n / block_size`` times, as
    explained in :class:`BlockMatrix`.

    Parameters
    ----------
    y: :class:`ndarray`
        :math:`n` vector of observations :math:`y`.
    x: :class:`ndarray`
        :math:`n \times p` matrix of fixed effects :math:`X`.
    z: :class:`ndarray` or :class:`BlockMatrix`
        :math:`n \times m` matrix of random effects :math:`Z`.
    max_condition_number: :obj:`float`
        Maximum condition number. Must be greater than 1e-16.

    Returns
    -------
    model: :class:`LinearMixedModel`
        Model constructed from :math:`y`, :math:`X`, and :math:`Z`.
    p: :class:`ndarray`
        Matrix :math:`P` whose rows are the eigenvectors of :math:`K`.
    """
    if max_condition_number < 1e-16:
        raise ValueError("from_mixed_effects: 'max_condition_number' must "
                         f"be at least 1e-16, found {max_condition_number}")

    _check_dims(y, "y", 1)
    _check_dims(x, "x", 2)
    _check_dims(z, "z", 2)

    n, m = z.shape

    if y.shape[0] != n:
        raise ValueError("from_mixed_effects: 'y' and 'z' must have the "
                         "same number of rows")
    if x.shape[0] != n:
        raise ValueError("from_mixed_effects: 'x' and 'z' must have the "
                         "same number of rows")

    if isinstance(z, np.ndarray):
        u, s0, _ = hl.linalg._svd(z, full_matrices=False)
        p = u.T
        py, px = p @ y, p @ x
    else:
        u, s0, _ = z.svd()
        p = u.T
        py, px = (p @ y).to_numpy(), (p @ x).to_numpy()

    s = s0 ** 2

    full_rank = n <= m
    if full_rank:
        model = LinearMixedModel(py, px, s)
    else:
        assert np.all(np.isfinite(s))
        r = np.searchsorted(-s, -max_condition_number * s[0])
        if r < m:
            info(f'from_mixed_effects: model rank reduced from {m} to {r} '
                 f'due to ill-condition.'
                 f'\n Largest dropped eigenvalue was {s[r]}.')
        s = s[:r]
        p = p[:r, :]
        model = LinearMixedModel(py, px, s, y, x)

    return model, p