def _group_by(self, group, agg_expr):
    """Key an aggregation (or scan) expression by ``group``.

    The IR of ``agg_expr`` is rewritten so that its aggregator signature
    becomes ``Keyed(<op>)`` and every ``SeqOp`` found in its body receives
    the IR of ``group`` as an extra leading argument.

    :param group: Expression to key by; it must not carry aggregations.
    :param agg_expr: Expression whose IR is an ``ApplyAggOp`` or an
        ``ApplyScanOp``.
    :return: Expression of type ``dict<group.dtype, agg_expr.dtype>`` built
        with the indices and aggregations of ``agg_expr``.
    :raises ExpressionException: If ``group`` already has aggregations.
    :raises TypeError: If ``agg_expr`` is not an aggregation, or if its
        flavour (scan vs. non-scan) does not match ``self._as_scan``.
    """
    if group._aggregations:
        raise ExpressionException(
            "'group_by' does not support an already-aggregated expression as the argument to 'group'"
        )

    source_ir = agg_expr._ir

    # The scan check comes first, exactly as in the flavour validation order
    # callers have always observed.
    if isinstance(source_ir, ApplyScanOp):
        if not self._as_scan:
            raise TypeError(
                "'agg.group_by' requires a non-scan aggregation expression (agg.*) as the argument to 'agg_expr'"
            )
    elif isinstance(source_ir, ApplyAggOp):
        if self._as_scan:
            raise TypeError(
                "'scan.group_by' requires a scan aggregation expression (scan.*) as the argument to 'agg_expr'"
            )
    else:
        # Neither an ApplyScanOp nor an ApplyAggOp.
        raise TypeError(
            "'group_by' requires an aggregation expression as the argument to 'agg_expr'"
        )

    base_sig = source_ir.agg_sig
    keyed_sig = AggSignature(
        f'Keyed({base_sig.op})',
        base_sig.ctor_arg_types,
        base_sig.initop_arg_types,
        [group.dtype] + base_sig.seqop_arg_types,
    )

    def add_group_key(node):
        # Prepend the group key to each SeqOp; recurse through anything else.
        if isinstance(node, SeqOp):
            return SeqOp(node.i, [group._ir] + node.args, keyed_sig)
        return node.map_ir(add_group_key)

    keyed_body = add_group_key(source_ir.a)

    # Rebuild the same kind of node that was passed in.
    if isinstance(source_ir, ApplyAggOp):
        node_cls = ApplyAggOp
    else:
        assert isinstance(source_ir, ApplyScanOp)
        node_cls = ApplyScanOp
    keyed_ir = node_cls(
        keyed_body,
        source_ir.constructor_args,
        source_ir.init_op_args,
        keyed_sig,
    )

    return construct_expr(
        keyed_ir,
        hl.tdict(group.dtype, agg_expr.dtype),
        agg_expr._indices,
        agg_expr._aggregations,
    )
def values(self):
    """Return a list of ``(hail type, sample Python value)`` pairs.

    The pairs appear in a fixed order: primitive types first, then
    struct/collection/interval types, then locus and call.
    """
    primitive_pairs = [
        (hl.tbool, True),
        (hl.tint32, 0),
        (hl.tint64, 0),
        (hl.tfloat32, 0.5),
        (hl.tfloat64, 0.5),
        (hl.tstr, "foo"),
    ]
    compound_pairs = [
        (hl.tstruct(x=hl.tint32), hl.Struct(x=0)),
        (hl.tarray(hl.tint32), [0, 1, 4]),
        (hl.tset(hl.tint32), {0, 1, 4}),
        (hl.tdict(hl.tstr, hl.tint32), {"a": 0, "b": 1, "c": 4}),
        (hl.tinterval(hl.tint32), hl.Interval(0, 1, True, False)),
    ]
    genetic_pairs = [
        (hl.tlocus(hl.default_reference()), hl.Locus("1", 1)),
        (hl.tcall, hl.Call([0, 1])),
    ]
    return primitive_pairs + compound_pairs + genetic_pairs
def values(self):
    """Return a list of ``(hail type, sample Python value)`` pairs.

    The pairs appear in a fixed order: primitive types first, then
    struct/collection/interval types, then locus and call.
    """
    primitive_pairs = [
        (hl.tbool, True),
        (hl.tint32, 0),
        (hl.tint64, 0),
        (hl.tfloat32, 0.5),
        (hl.tfloat64, 0.5),
        (hl.tstr, "foo"),
    ]
    compound_pairs = [
        (hl.tstruct(x=hl.tint32), hl.Struct(x=0)),
        (hl.tarray(hl.tint32), [0, 1, 4]),
        (hl.tset(hl.tint32), {0, 1, 4}),
        (hl.tdict(hl.tstr, hl.tint32), {"a": 0, "b": 1, "c": 4}),
        (hl.tinterval(hl.tint32), hl.Interval(0, 1, True, False)),
    ]
    genetic_pairs = [
        (hl.tlocus(hl.default_reference()), hl.Locus("1", 1)),
        (hl.tcall, hl.Call([0, 1])),
    ]
    return primitive_pairs + compound_pairs + genetic_pairs
def create_all_values():
    """Return an ``hl.struct`` mixing defined and missing (``hl.null``) fields.

    The struct has sixteen fields covering float32, int64, float64, struct,
    set, dict, locus, interval, call and tuple values; several of them are
    typed missing values.
    """
    # Field order is significant: it becomes the field order of the struct.
    fields = {
        'f32': hl.float32(3.14),
        'i64': hl.int64(-9),
        'm': hl.null(hl.tfloat64),
        'astruct': hl.struct(a=hl.null(hl.tint32), b=5.5),
        'mstruct': hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        'aset': hl.set(['foo', 'bar', 'baz']),
        'mset': hl.null(hl.tset(hl.tfloat64)),
        'd': hl.dict({
            hl.array(['a', 'b']): 0.5,
            hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
        }),
        'md': hl.null(hl.tdict(hl.tint32, hl.tstr)),
        'h38': hl.locus('chr22', 33878978, 'GRCh38'),
        'ml': hl.null(hl.tlocus('GRCh37')),
        'i': hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        'c': hl.call(0, 1),
        'mc': hl.null(hl.tcall),
        't': hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        'mt': hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    }
    return hl.struct(**fields)
def create_all_values_datasets():
    """Build a cached Table and MatrixTable annotated with an "all values" struct.

    The struct mixes defined and missing (``hl.null``) fields of many Hail
    types. The table (5 rows, 3 partitions) gets the fields as row fields and,
    prefixed with ``global_``, as globals. The matrix table (3 rows x 2 cols,
    2 partitions) gets them prefixed with ``global_``, ``row_``, ``col_`` and
    ``entry_`` on the corresponding axes.

    :return: ``(all_values_table, all_values_matrix_table)``
    """
    # Field order is significant: it becomes the field order of the struct.
    fields = {
        'f32': hl.float32(3.14),
        'i64': hl.int64(-9),
        'm': hl.null(hl.tfloat64),
        'astruct': hl.struct(a=hl.null(hl.tint32), b=5.5),
        'mstruct': hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        'aset': hl.set(['foo', 'bar', 'baz']),
        'mset': hl.null(hl.tset(hl.tfloat64)),
        'd': hl.dict({
            hl.array(['a', 'b']): 0.5,
            hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
        }),
        'md': hl.null(hl.tdict(hl.tint32, hl.tstr)),
        'h38': hl.locus('chr22', 33878978, 'GRCh38'),
        'ml': hl.null(hl.tlocus('GRCh37')),
        'i': hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        'c': hl.call(0, 1),
        'mc': hl.null(hl.tcall),
        't': hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        'mt': hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    }
    all_values = hl.struct(**fields)

    def with_prefix(struct_expr, tag):
        # Same fields as `struct_expr`, each renamed to `tag + name`.
        renamed = {tag + name: struct_expr[name] for name in struct_expr}
        return hl.struct(**renamed)

    table = hl.utils.range_table(5, n_partitions=3)
    table = table.annotate_globals(**with_prefix(all_values, 'global_'))
    table = table.annotate(**all_values)
    all_values_table = table.cache()

    matrix = hl.utils.range_matrix_table(3, 2, n_partitions=2)
    matrix = matrix.annotate_globals(**with_prefix(all_values, 'global_'))
    matrix = matrix.annotate_rows(**with_prefix(all_values, 'row_'))
    matrix = matrix.annotate_cols(**with_prefix(all_values, 'col_'))
    matrix = matrix.annotate_entries(**with_prefix(all_values, 'entry_'))
    all_values_matrix_table = matrix.cache()

    return all_values_table, all_values_matrix_table
def create_all_values_datasets():
    """Build a cached Table and MatrixTable annotated with an "all values" struct.

    The struct mixes defined and missing (``hl.null``) fields of many Hail
    types. The table (5 rows, 3 partitions) gets the fields as row fields and,
    prefixed with ``global_``, as globals. The matrix table (3 rows x 2 cols,
    2 partitions) gets them prefixed with ``global_``, ``row_``, ``col_`` and
    ``entry_`` on the corresponding axes.

    :return: ``(all_values_table, all_values_matrix_table)``
    """
    # Field order is significant: it becomes the field order of the struct.
    fields = {
        'f32': hl.float32(3.14),
        'i64': hl.int64(-9),
        'm': hl.null(hl.tfloat64),
        'astruct': hl.struct(a=hl.null(hl.tint32), b=5.5),
        'mstruct': hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        'aset': hl.set(['foo', 'bar', 'baz']),
        'mset': hl.null(hl.tset(hl.tfloat64)),
        'd': hl.dict({
            hl.array(['a', 'b']): 0.5,
            hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
        }),
        'md': hl.null(hl.tdict(hl.tint32, hl.tstr)),
        'h38': hl.locus('chr22', 33878978, 'GRCh38'),
        'ml': hl.null(hl.tlocus('GRCh37')),
        'i': hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        'c': hl.call(0, 1),
        'mc': hl.null(hl.tcall),
        't': hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        'mt': hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    }
    all_values = hl.struct(**fields)

    def with_prefix(struct_expr, tag):
        # Same fields as `struct_expr`, each renamed to `tag + name`.
        renamed = {tag + name: struct_expr[name] for name in struct_expr}
        return hl.struct(**renamed)

    table = hl.utils.range_table(5, n_partitions=3)
    table = table.annotate_globals(**with_prefix(all_values, 'global_'))
    table = table.annotate(**all_values)
    all_values_table = table.cache()

    matrix = hl.utils.range_matrix_table(3, 2, n_partitions=2)
    matrix = matrix.annotate_globals(**with_prefix(all_values, 'global_'))
    matrix = matrix.annotate_rows(**with_prefix(all_values, 'row_'))
    matrix = matrix.annotate_cols(**with_prefix(all_values, 'col_'))
    matrix = matrix.annotate_entries(**with_prefix(all_values, 'entry_'))
    all_values_matrix_table = matrix.cache()

    return all_values_table, all_values_matrix_table
def visit_dict(self, node, visited_children):
    """Build an ``hl.tdict`` from the visited children of a dict node.

    ``visited_children`` must unpack into exactly seven items; only the
    fourth (key type) and the sixth (value type) are used.
    """
    _, _, _, key_type, _, value_type, _ = visited_children
    return hl.tdict(key_type, value_type)
def _impute_type(x, partial_type):
    """Infer the Hail type of ``x``, optionally guided by ``partial_type``.

    The checks run in a fixed order, and the order matters: ``bool`` is tested
    before ``int``, and ``Struct`` / struct-hinted ``dict`` before the generic
    ``Mapping`` branch.

    :param x: A Hail ``Expression`` or a Python / NumPy value.
    :param partial_type: Optional, possibly incomplete, type hint for ``x``
        (``None`` when there is no hint). It is consulted for containers and
        is what gets returned for values that carry no type information of
        their own (``None`` / NA values, empty lists, sets and mappings).
    :return: The imputed type. This is ``partial_type`` itself (possibly
        ``None``) when ``x`` is ``None`` or ``pd.isna(x)`` is true.
    :raises ValueError: If ``x`` is an int outside the int64 range.
    :raises ExpressionException: If ``partial_type`` is incompatible with a
        container value, a list/set/dict is heterogeneous, ``x`` is an
        unfinished ``case``/``switch`` builder, or the type cannot be imputed.
    """
    from hail.genetics import Locus, Call
    from hail.utils import Interval, Struct

    def refine(t, refined):
        # Without a hint, use the placeholder type for this kind of container.
        if t is None:
            return refined
        # With a hint, it must be the same kind of type as the placeholder.
        if not isinstance(t, type(refined)):
            raise ExpressionException(
                "Incompatible partial_type, {}, for value {}".format(
                    partial_type, x))
        return t

    if isinstance(x, Expression):
        return x.dtype
    elif isinstance(x, bool):
        # Must precede the int check: bool is a subclass of int.
        return tbool
    elif isinstance(x, int):
        if hl.tint32.min_value <= x <= hl.tint32.max_value:
            return tint32
        elif hl.tint64.min_value <= x <= hl.tint64.max_value:
            return tint64
        else:
            raise ValueError(
                "Hail has no integer data type large enough to store {}".format(x))
    elif isinstance(x, float):
        return tfloat64
    elif isinstance(x, str):
        return tstr
    elif isinstance(x, Locus):
        return tlocus(x.reference_genome)
    elif isinstance(x, Interval):
        return tinterval(x.point_type)
    elif isinstance(x, Call):
        return tcall
    elif isinstance(x, Struct) or (isinstance(x, dict)
                                   and isinstance(partial_type, tstruct)):
        # A plain dict is only treated as a struct when the hint says so.
        partial_type = refine(partial_type, hl.tstruct())
        t = tstruct(**{k: _impute_type(x[k], partial_type.get(k)) for k in x})
        return t
    elif isinstance(x, tuple):
        partial_type = refine(partial_type, hl.ttuple())
        # Elements beyond the end of the hint are imputed without a hint.
        return ttuple(*[
            _impute_type(
                element,
                partial_type[index] if index < len(partial_type) else None)
            for index, element in enumerate(x)
        ])
    elif isinstance(x, list):
        partial_type = refine(partial_type, hl.tarray(None))
        if len(x) == 0:
            return partial_type
        ts = {
            _impute_type(element, partial_type.element_type)
            for element in x
        }
        unified_type = super_unify_types(*ts)
        if unified_type is None:
            raise ExpressionException(
                "Hail does not support heterogeneous arrays: "
                "found list with elements of types {} ".format(list(ts)))
        return tarray(unified_type)
    elif is_setlike(x):
        partial_type = refine(partial_type, hl.tset(None))
        if len(x) == 0:
            return partial_type
        ts = {
            _impute_type(element, partial_type.element_type)
            for element in x
        }
        unified_type = super_unify_types(*ts)
        if not unified_type:
            raise ExpressionException(
                "Hail does not support heterogeneous sets: "
                "found set with elements of types {} ".format(list(ts)))
        return tset(unified_type)
    elif isinstance(x, Mapping):
        # Remember whether the caller gave a hint before refine() replaces it.
        user_partial_type = partial_type
        partial_type = refine(partial_type, hl.tdict(None, None))
        if len(x) == 0:
            return partial_type
        kts = {
            _impute_type(element, partial_type.key_type)
            for element in x.keys()
        }
        vts = {
            _impute_type(element, partial_type.value_type)
            for element in x.values()
        }
        unified_key_type = super_unify_types(*kts)
        unified_value_type = super_unify_types(*vts)
        if not unified_key_type:
            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with keys {} of types {} ".format(
                    list(x.keys()), list(kts)))
        if not unified_value_type:
            # String-keyed mapping with mixed value types and no hint:
            # impute it as a struct instead of failing.
            if unified_key_type == hl.tstr and user_partial_type is None:
                return tstruct(**{k: _impute_type(x[k], None) for k in x})
            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with values of types {} ".format(list(vts)))
        return tdict(unified_key_type, unified_value_type)
    elif isinstance(x, np.generic):
        return from_numpy(x.dtype)
    elif isinstance(x, np.ndarray):
        element_type = from_numpy(x.dtype)
        return tndarray(element_type, x.ndim)
    elif x is None or pd.isna(x):
        # Nothing to infer from the value itself; fall back to the hint.
        return partial_type
    elif isinstance(
            x, (hl.expr.builders.CaseBuilder, hl.expr.builders.SwitchBuilder)):
        # The trailing space after "either" matters: the two literals are
        # concatenated, and without it the message reads "either'default'".
        raise ExpressionException(
            "'switch' and 'case' expressions must end with a call to either "
            "'default' or 'or_missing'")
    else:
        raise ExpressionException(
            "Hail cannot automatically impute type of {}: {}".format(
                type(x), x))
def compute_quantile_bin(
    ht: hl.Table,
    score_expr: hl.expr.NumericExpression,
    bin_expr: Dict[str, hl.expr.BooleanExpression] = {"bin": True},  # only read/rebound below, never mutated
    compute_snv_indel_separately: bool = True,
    n_bins: int = 100,
    k: int = 1000,
    desc: bool = True,
) -> hl.Table:
    r"""
    Returns a table with a bin for each row based on quantiles of `score_expr`.

    The bin is computed by dividing the `score_expr` into `n_bins` bins containing an equal number of elements.
    This is done based on quantiles computed with hl.agg.approx_quantiles. If a single value in `score_expr` spans more
    than one bin, the rows with this value are distributed randomly across the bins it spans.

    If `compute_snv_indel_separately` is True all items in `bin_expr` will be stratified by snv / indels for the bin
    calculation. Because SNV and indel rows are mutually exclusive, they are re-combined into a single annotation. For
    example if we have the following four variants and scores and `n_bins` of 2:

    ========   =======   ======   =================   =================
    Variant    Type      Score    bin - `compute_snv_indel_separately`:
    --------   -------   ------   -------------------------------------
    \          \         \        False               True
    ========   =======   ======   =================   =================
    Var1       SNV       0.1      1                   1
    Var2       SNV       0.2      1                   2
    Var3       Indel     0.3      2                   1
    Var4       Indel     0.4      2                   2
    ========   =======   ======   =================   =================

    .. note::

        The `bin_expr` defines which data the bin(s) should be computed on. E.g., to get a biallelic quantile bin and an
        singleton quantile bin, the following could be used:

        .. code-block:: python

            bin_expr={
                'biallelic_bin': ~ht.was_split,
                'singleton_bin': ht.singleton
            }

    :param ht: Input Table
    :param score_expr: Expression containing the score
    :param bin_expr: Quantile bin(s) to be computed (see notes)
    :param compute_snv_indel_separately: Should all `bin_expr` items be stratified by snv / indels
    :param n_bins: Number of bins to bin the data into
    :param k: The `k` parameter of approx_quantiles
    :param desc: Whether to bin the score in descending order
    :return: Table with the quantile bins
    """
    import math

    def quantiles_to_bin_boundaries(quantiles: List[float]) -> Dict:
        """
        Merges bins with the same boundaries into a unique bin while keeping track of which bins have been merged
        and the global index of all bins.

        :param quantiles: Original bins boundaries
        :return: Dict with three entries: `merged_bins` (index of each bin for which multiple bins were collapsed ->
            number of bins collapsed), `global_bin_indices` (global indices of merged bins) and `bin_boundaries`
            (merged bins boundaries)
        """
        # Pad the quantiles to create boundaries for the first and last bins
        bin_boundaries = [-math.inf] + quantiles + [math.inf]
        merged_bins = defaultdict(int)

        # If every quantile has a unique value, then bin boundaries are unique
        # and can be passed to binary_search as-is
        if len(quantiles) == len(set(quantiles)):
            return dict(
                merged_bins=merged_bins,
                global_bin_indices=list(range(len(bin_boundaries))),
                bin_boundaries=bin_boundaries,
            )

        # Collapse runs of equal boundaries in place. `i` only advances when
        # nothing was removed, so `i - 1` is always the position of the
        # surviving boundary in the collapsed list.
        indexed_bins = list(enumerate(bin_boundaries))
        i = 1
        while i < len(indexed_bins):
            if indexed_bins[i - 1][1] == indexed_bins[i][1]:
                merged_bins[i - 1] += 1
                indexed_bins.pop(i)
            else:
                i += 1

        return dict(
            merged_bins=merged_bins,
            global_bin_indices=[x[0] for x in indexed_bins],
            bin_boundaries=[x[1] for x in indexed_bins],
        )

    # SNV / indel indicator, reused for stratification and for the `snv` row field
    is_snv = hl.is_snp(ht.alleles[0], ht.alleles[1])

    if compute_snv_indel_separately:
        # For each bin, add a SNV / indel stratification
        bin_expr = {
            f"{bin_id}_{variant_type}": (bin_filter & variant_type_expr)
            for bin_id, bin_filter in bin_expr.items()
            for variant_type, variant_type_expr in [
                ("snv", is_snv),
                ("indel", ~is_snv),
            ]
        }

    bin_ht = ht.annotate(
        **{
            f"_filter_{bin_id}": bin_filter
            for bin_id, bin_filter in bin_expr.items()
        },
        _score=score_expr,
        snv=is_snv,
    )

    logger.info(
        f"Adding quantile bins using approximate_quantiles binned into {n_bins}, using k={k}"
    )
    bin_stats = bin_ht.aggregate(
        hl.struct(
            **{
                bin_id: hl.agg.filter(
                    bin_ht[f"_filter_{bin_id}"],
                    hl.struct(
                        n=hl.agg.count(),
                        quantiles=hl.agg.approx_quantiles(
                            bin_ht._score,
                            [x / (n_bins) for x in range(1, n_bins)],
                            k=k,
                        ),
                    ),
                )
                for bin_id in bin_expr
            }
        )
    )

    # Take care of bins with duplicated boundaries
    bin_stats = bin_stats.annotate(
        **{
            rname: bin_stats[rname].annotate(
                **quantiles_to_bin_boundaries(bin_stats[rname].quantiles)
            )
            for rname in bin_stats
        }
    )

    bin_ht = bin_ht.annotate_globals(
        bin_stats=hl.literal(
            bin_stats,
            dtype=hl.tstruct(
                **{
                    bin_id: hl.tstruct(
                        n=hl.tint64,
                        quantiles=hl.tarray(hl.tfloat64),
                        bin_boundaries=hl.tarray(hl.tfloat64),
                        global_bin_indices=hl.tarray(hl.tint32),
                        merged_bins=hl.tdict(hl.tint32, hl.tint32),
                    )
                    for bin_id in bin_expr
                }
            ),
        )
    )

    # Annotate the bin as the index in the unique boundaries array
    bin_ht = bin_ht.annotate(
        **{
            bin_id: hl.or_missing(
                bin_ht[f"_filter_{bin_id}"],
                hl.binary_search(
                    bin_ht.bin_stats[bin_id].bin_boundaries, bin_ht._score
                ),
            )
            for bin_id in bin_expr
        }
    )

    # Convert the bin to global bin by expanding merged bins, that is:
    # If a value falls in a bin that needs expansion, assign it randomly to one of the expanded bins
    # Otherwise, simply modify the bin to its global index (with expanded bins that is)
    bin_ht = bin_ht.select(
        "snv",
        **{
            bin_id: hl.if_else(
                bin_ht.bin_stats[bin_id].merged_bins.contains(bin_ht[bin_id]),
                bin_ht.bin_stats[bin_id].global_bin_indices[bin_ht[bin_id]]
                + hl.int(
                    hl.rand_unif(
                        0,
                        bin_ht.bin_stats[bin_id].merged_bins[bin_ht[bin_id]] + 1,
                    )
                ),
                bin_ht.bin_stats[bin_id].global_bin_indices[bin_ht[bin_id]],
            )
            for bin_id in bin_expr
        },
    )

    if desc:
        bin_ht = bin_ht.annotate(
            **{bin_id: n_bins - bin_ht[bin_id] for bin_id in bin_expr}
        )

    # Because SNV and indel rows are mutually exclusive, re-combine them into a single bin.
    # Update the global bin_stats struct to reflect the change in bin names in the table
    if compute_snv_indel_separately:
        # Strip the trailing "_snv" / "_indel" suffix to recover the original bin names
        bin_expr_no_snv = {
            bin_id.rsplit("_", 1)[0] for bin_id in bin_ht.bin_stats
        }
        bin_ht = bin_ht.annotate_globals(
            bin_stats=hl.struct(
                **{
                    bin_id: hl.struct(
                        **{
                            snv: bin_ht.bin_stats[f"{bin_id}_{snv}"]
                            for snv in ["snv", "indel"]
                        }
                    )
                    for bin_id in bin_expr_no_snv
                }
            )
        )

        bin_ht = bin_ht.transmute(
            **{
                bin_id: hl.if_else(
                    bin_ht.snv,
                    bin_ht[f"{bin_id}_snv"],
                    bin_ht[f"{bin_id}_indel"],
                )
                for bin_id in bin_expr_no_snv
            }
        )

    return bin_ht