def svd(nd, full_matrices=True, compute_uv=True):
    """Performs a singular value decomposition.

    :param nd: :class:`.NDArrayExpression`
        A 2 dimensional ndarray, shape(M, N).
    :param full_matrices: `bool`
        If True (default), u and vt have dimensions (M, M) and (N, N) respectively.
        Otherwise, they have dimensions (M, K) and (K, N), where K = min(M, N)
    :param compute_uv: `bool`
        If True (default), compute the singular vectors u and v.
        Otherwise, only return a single ndarray, s.

    Returns
    -------
    - u: :class:`.NDArrayExpression`
        The left singular vectors.
    - s: :class:`.NDArrayExpression`
        The singular values.
    - vt: :class:`.NDArrayExpression`
        The right singular vectors.

    When `compute_uv` is False, only `s` is returned (a single 1 dimensional
    ndarray expression rather than a tuple).
    """
    # The decomposition is always performed on float64 elements.
    float_nd = nd.map(lambda x: hl.float64(x))
    ir = NDArraySVD(float_nd._ir, full_matrices, compute_uv)

    if compute_uv:
        # (u, s, vt): two matrices around a vector of singular values.
        return_type = ttuple(tndarray(tfloat64, 2), tndarray(tfloat64, 1), tndarray(tfloat64, 2))
    else:
        return_type = tndarray(tfloat64, 1)

    # Carry the input's indices and aggregations over to the result, exactly
    # as `qr` does, so the result stays tied to the same source as `nd`.
    return construct_expr(ir, return_type, nd._indices, nd._aggregations)
def qr(nd, mode="reduced"):
    """Performs a QR decomposition.

    :param nd: A 2 dimensional ndarray, shape(M, N)
    :param mode: One of "reduced", "complete", "r", or "raw".

        With K = min(M, N), the modes behave as follows:

        - `reduced`: returns q and r with dimensions (M, K), (K, N)
        - `complete`: returns q and r with dimensions (M, M), (M, N)
        - `r`: returns only r with dimensions (K, N)
        - `raw`: returns h, tau with dimensions (N, M), (K,)

    Returns
    -------
    - q: ndarray of float64
        A matrix with orthonormal columns.
    - r: ndarray of float64
        The upper-triangular matrix R.
    - (h, tau): ndarrays of float64
        The array h contains the Householder reflectors that generate q along with r.
        The tau array contains scaling factors for the reflectors
    """
    assert nd.ndim == 2, "QR decomposition requires 2 dimensional ndarray"

    # Reject unknown modes before building any IR.
    if mode not in ("reduced", "r", "raw", "complete"):
        raise ValueError(f"Unrecognized mode '{mode}' for QR decomposition")

    # The decomposition is always performed on float64 elements.
    as_float = nd.map(lambda elt: hl.float64(elt))
    qr_ir = NDArrayQR(as_float._ir, mode)

    # Choose the result type for the requested mode.
    if mode == "raw":
        # (h, tau): a matrix and a vector.
        result_type = ttuple(tndarray(tfloat64, 2), tndarray(tfloat64, 1))
    elif mode == "r":
        # r alone: a single matrix, not wrapped in a tuple.
        result_type = tndarray(tfloat64, 2)
    else:
        # "complete" and "reduced" both yield the pair (q, r).
        result_type = ttuple(tndarray(tfloat64, 2), tndarray(tfloat64, 2))

    return construct_expr(qr_ir, result_type, nd._indices, nd._aggregations)
def _impute_type(x, partial_type):
    """Impute the Hail type of `x`, using `partial_type` to fill in gaps.

    `partial_type` is a (possibly incomplete, possibly ``None``) type that is
    consulted wherever the value alone is not enough: it is returned as-is
    for ``None``/NA values, it is the result for empty lists, sets and
    mappings (after being defaulted to a container type with unknown
    element types), and its components are passed down when recursing into
    containers.

    Parameters
    ----------
    x
        A Python value or an :class:`Expression`.
    partial_type
        Partial type information for `x`, or ``None``.

    Returns
    -------
    The imputed type. May be `partial_type` itself (including ``None``) when
    `x` is ``None``/NA.

    Raises
    ------
    ValueError
        If `x` is an integer outside the 64-bit range.
    ExpressionException
        If `partial_type` is incompatible with the container kind of `x`, if
        a container is heterogeneous, if `x` is an unfinished ``case`` /
        ``switch`` builder, or if the type of `x` cannot be imputed.
    """
    from hail.genetics import Locus, Call
    from hail.utils import Interval, Struct

    def refine(t, refined):
        # With no partial type, fall back to the default `refined`; otherwise
        # the partial type must be the same kind of type as `refined`.
        if t is None:
            return refined
        if not isinstance(t, type(refined)):
            raise ExpressionException(
                "Incompatible partial_type, {}, for value {}".format(partial_type, x))
        return t

    if isinstance(x, Expression):
        return x.dtype
    elif isinstance(x, bool):
        # Checked before `int` because Python bools are also ints.
        return tbool
    elif isinstance(x, int):
        # Use the narrowest integer type that can hold the value.
        if hl.tint32.min_value <= x <= hl.tint32.max_value:
            return tint32
        elif hl.tint64.min_value <= x <= hl.tint64.max_value:
            return tint64
        else:
            raise ValueError(
                "Hail has no integer data type large enough to store {}".format(x))
    elif isinstance(x, float):
        return tfloat64
    elif isinstance(x, str):
        return tstr
    elif isinstance(x, Locus):
        return tlocus(x.reference_genome)
    elif isinstance(x, Interval):
        return tinterval(x.point_type)
    elif isinstance(x, Call):
        return tcall
    elif isinstance(x, Struct) or (isinstance(x, dict) and isinstance(partial_type, tstruct)):
        # A plain dict is treated as a struct only when the partial type says so.
        partial_type = refine(partial_type, hl.tstruct())
        t = tstruct(**{k: _impute_type(x[k], partial_type.get(k)) for k in x})
        return t
    elif isinstance(x, tuple):
        partial_type = refine(partial_type, hl.ttuple())
        # Positions beyond the end of the partial tuple type get no hint.
        return ttuple(*[
            _impute_type(element, partial_type[index] if index < len(partial_type) else None)
            for index, element in enumerate(x)
        ])
    elif isinstance(x, list):
        partial_type = refine(partial_type, hl.tarray(None))
        if len(x) == 0:
            return partial_type
        ts = {_impute_type(element, partial_type.element_type) for element in x}
        unified_type = super_unify_types(*ts)
        if unified_type is None:
            raise ExpressionException(
                "Hail does not support heterogeneous arrays: "
                "found list with elements of types {} ".format(list(ts)))
        return tarray(unified_type)
    elif is_setlike(x):
        partial_type = refine(partial_type, hl.tset(None))
        if len(x) == 0:
            return partial_type
        ts = {_impute_type(element, partial_type.element_type) for element in x}
        unified_type = super_unify_types(*ts)
        if not unified_type:
            raise ExpressionException(
                "Hail does not support heterogeneous sets: "
                "found set with elements of types {} ".format(list(ts)))
        return tset(unified_type)
    elif isinstance(x, Mapping):
        # Remember whether the caller supplied a type before `refine` replaces
        # a missing one with a default dict type.
        user_partial_type = partial_type
        partial_type = refine(partial_type, hl.tdict(None, None))
        if len(x) == 0:
            return partial_type
        kts = {_impute_type(element, partial_type.key_type) for element in x.keys()}
        vts = {_impute_type(element, partial_type.value_type) for element in x.values()}
        unified_key_type = super_unify_types(*kts)
        unified_value_type = super_unify_types(*vts)
        if not unified_key_type:
            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with keys {} of types {} ".format(list(x.keys()), list(kts)))
        if not unified_value_type:
            # String-keyed mapping with mixed value types and no caller hint:
            # impute a struct type instead of failing.
            if unified_key_type == hl.tstr and user_partial_type is None:
                return tstruct(**{k: _impute_type(x[k], None) for k in x})
            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with values of types {} ".format(list(vts)))
        return tdict(unified_key_type, unified_value_type)
    elif isinstance(x, np.generic):
        return from_numpy(x.dtype)
    elif isinstance(x, np.ndarray):
        element_type = from_numpy(x.dtype)
        return tndarray(element_type, x.ndim)
    elif x is None or pd.isna(x):
        # Nothing can be learned from a missing value; defer to the hint.
        return partial_type
    elif isinstance(x, (hl.expr.builders.CaseBuilder, hl.expr.builders.SwitchBuilder)):
        # The adjacent literals need an explicit separating space; without it
        # the message reads "either'default'".
        raise ExpressionException(
            "'switch' and 'case' expressions must end with a call to either "
            "'default' or 'or_missing'")
    else:
        raise ExpressionException(
            "Hail cannot automatically impute type of {}: {}".format(type(x), x))
def impute_type(x):
    """Impute the Hail type of a Python value or :class:`Expression`.

    Containers are handled recursively. Lists, sets and mappings must be
    non-empty and their element (or key / value) types must unify to a
    single type.

    Parameters
    ----------
    x
        A Python value or an :class:`Expression`.

    Returns
    -------
    The imputed Hail type of `x`.

    Raises
    ------
    ValueError
        If `x` is an integer outside the 64-bit range.
    ExpressionException
        If `x` is ``None``, an empty list / set / mapping, a heterogeneous
        container, an unfinished ``case`` / ``switch`` builder, or a value
        whose type cannot be imputed.
    """
    from hail.genetics import Locus, Call
    from hail.utils import Interval, Struct

    if isinstance(x, Expression):
        return x.dtype
    elif isinstance(x, bool):
        # Checked before `int` because Python bools are also ints.
        return tbool
    elif isinstance(x, int):
        # Use the narrowest integer type that can hold the value.
        if hl.tint32.min_value <= x <= hl.tint32.max_value:
            return tint32
        elif hl.tint64.min_value <= x <= hl.tint64.max_value:
            return tint64
        else:
            raise ValueError(
                "Hail has no integer data type large enough to store {}".format(x))
    elif isinstance(x, float):
        return tfloat64
    elif isinstance(x, str):
        return tstr
    elif isinstance(x, Locus):
        return tlocus(x.reference_genome)
    elif isinstance(x, Interval):
        return tinterval(x.point_type)
    elif isinstance(x, Call):
        return tcall
    elif isinstance(x, Struct):
        return tstruct(**{k: impute_type(x[k]) for k in x})
    elif isinstance(x, tuple):
        return ttuple(*(impute_type(element) for element in x))
    elif isinstance(x, list):
        if len(x) == 0:
            raise ExpressionException(
                "Cannot impute type of empty list. Use 'hl.empty_array' to create an empty array.")
        ts = {impute_type(element) for element in x}
        unified_type = unify_types_limited(*ts)
        if unified_type is None:
            raise ExpressionException(
                "Hail does not support heterogeneous arrays: "
                "found list with elements of types {} ".format(list(ts)))
        return tarray(unified_type)
    elif isinstance(x, set):
        if len(x) == 0:
            raise ExpressionException(
                "Cannot impute type of empty set. Use 'hl.empty_set' to create an empty set.")
        ts = {impute_type(element) for element in x}
        unified_type = unify_types_limited(*ts)
        if not unified_type:
            raise ExpressionException(
                "Hail does not support heterogeneous sets: "
                "found set with elements of types {} ".format(list(ts)))
        return tset(unified_type)
    elif isinstance(x, Mapping):
        if len(x) == 0:
            raise ExpressionException(
                "Cannot impute type of empty dict. Use 'hl.empty_dict' to create an empty dict.")
        kts = {impute_type(element) for element in x.keys()}
        vts = {impute_type(element) for element in x.values()}
        unified_key_type = unify_types_limited(*kts)
        unified_value_type = unify_types_limited(*vts)
        if not unified_key_type:
            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with keys of types {} ".format(list(kts)))
        if not unified_value_type:
            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with values of types {} ".format(list(vts)))
        return tdict(unified_key_type, unified_value_type)
    elif isinstance(x, np.generic):
        return from_numpy(x.dtype)
    elif isinstance(x, np.ndarray):
        element_type = from_numpy(x.dtype)
        return tndarray(element_type, x.ndim)
    elif x is None:
        raise ExpressionException("Hail cannot impute the type of 'None'")
    elif isinstance(x, (hl.expr.builders.CaseBuilder, hl.expr.builders.SwitchBuilder)):
        # The adjacent literals need an explicit separating space; without it
        # the message reads "either'default'".
        raise ExpressionException(
            "'switch' and 'case' expressions must end with a call to either "
            "'default' or 'or_missing'")
    else:
        raise ExpressionException(
            "Hail cannot automatically impute type of {}: {}".format(type(x), x))
def maximal_independent_set(i, j, keep=True, tie_breaker=None, keyed=True) -> Table:
    """Return a table containing the vertices in a near
    `maximal independent set <https://en.wikipedia.org/wiki/Maximal_independent_set>`_
    of an undirected graph whose edges are given by a two-column table.

    Examples
    --------
    Run PC-relate and compute pairs of closely related individuals:

    >>> pc_rel = hl.pc_relate(dataset.GT, 0.001, k=2, statistics='kin')
    >>> pairs = pc_rel.filter(pc_rel['kin'] > 0.125)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain:

    >>> related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False)
    >>> result = dataset.filter_cols(
    ...     hl.is_defined(related_samples_to_remove[dataset.col_key]), keep=False)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain, preferring to keep cases over controls:

    >>> samples = dataset.cols()
    >>> pairs_with_case = pairs.key_by(
    ...     i=hl.struct(id=pairs.i, is_case=samples[pairs.i].is_case),
    ...     j=hl.struct(id=pairs.j, is_case=samples[pairs.j].is_case))
    >>> def tie_breaker(l, r):
    ...     return hl.cond(l.is_case & ~r.is_case, -1,
    ...                    hl.cond(~l.is_case & r.is_case, 1, 0))
    >>> related_samples_to_remove = hl.maximal_independent_set(
    ...     pairs_with_case.i, pairs_with_case.j, False, tie_breaker)
    >>> result = dataset.filter_cols(hl.is_defined(
    ...     related_samples_to_remove.key_by(
    ...         s = related_samples_to_remove.node.id.s)[dataset.col_key]), keep=False)

    Notes
    -----
    The graph's vertex set is implicitly every value taken by `i` and `j` on
    the rows of the table. Each row contributes one undirected edge between
    the vertices obtained by evaluating `i` and `j` on that row. Repeating an
    undirected edge in the table does not affect the output. Vertices with
    self-edges are removed, since they are not independent of themselves.

    The expressions for `i` and `j` must have the same type.

    `keep` selects whether the returned vertices are those in the maximal
    independent set or those in its complement. The complement is useful
    when filtering a table without removing vertices that never appear in
    the graph.

    The method implements a greedy algorithm which iteratively removes a
    vertex of highest degree until the graph contains no edges. The result is
    always an independent set, though it may not always be perfectly maximal.

    `tie_breaker` is a Python function of two arguments---say `l` and
    `r`---each of which is an :class:`Expression` of the same type as `i` and
    `j`. It returns a :class:`NumericExpression` defining an ordering on
    nodes. A pair of nodes can be ordered in one of three ways, and
    `tie_breaker` must encode the relationship as follows:

     - if ``l < r`` then ``tie_breaker`` evaluates to some negative integer
     - if ``l == r`` then ``tie_breaker`` evaluates to 0
     - if ``l > r`` then ``tie_breaker`` evaluates to some positive integer

    For example, the usual ordering on the integers is defined by: ``l - r``.

    The `tie_breaker` function must satisfy the following property:
    ``tie_breaker(l, r) == -tie_breaker(r, l)``.

    When multiple nodes have the same degree, this algorithm will order the
    nodes according to ``tie_breaker`` and remove the *largest* node.

    If `keyed` is ``False``, then a node may appear twice in the resulting
    table.

    Parameters
    ----------
    i : :class:`.Expression`
        Expression to compute one endpoint of an edge.
    j : :class:`.Expression`
        Expression to compute another endpoint of an edge.
    keep : :obj:`bool`
        If ``True``, return vertices in set. If ``False``, return vertices removed.
    tie_breaker : function
        Function used to order nodes with equal degree.
    keyed : :obj:`bool`
        If ``True``, key the resulting table by the `node` field, this requires
        a sort.

    Returns
    -------
    :class:`.Table`
        Table with the set of independent vertices. The table schema is one row
        field `node` which has the same type as input expressions `i` and `j`.
    """
    # --- Argument validation -------------------------------------------------
    if i.dtype != j.dtype:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to have same type. "
            "Found {} and {}.".format(i.dtype, j.dtype))

    source = i._indices.source
    if not isinstance(source, Table):
        if source is not None:
            found = "expression of '{}'".format(source.__class__)
        else:
            found = 'scalar expression'
        raise ValueError(
            "'maximal_independent_set' expects an expression of 'Table'. Found {}".format(found))

    if i._indices.source != j._indices.source:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to be expressions of the same Table. "
            "Found\n{}\n{}".format(i, j))

    node_t = i.dtype

    # --- Optional tie breaker ------------------------------------------------
    if tie_breaker:
        # The user's function is applied to element 0 of two tuple-typed
        # variables named 'l' and 'r'; its IR is passed on as a string.
        boxed_node_t = ttuple(node_t)
        lhs = construct_variable('l', boxed_node_t)
        rhs = construct_variable('r', boxed_node_t)
        ordering = hl.float64(tie_breaker(lhs[0], rhs[0]))
        joined, _ = source._process_joins(i, j, ordering)
        tie_breaker_str = str(ordering._ir)
    else:
        joined, _ = source._process_joins(i, j)
        tie_breaker_str = None

    # --- Write the edge list out and read it back ----------------------------
    edge_table = joined.select(__i=i, __j=j).key_by().select('__i', '__j')
    tmp_path = new_temp_file()
    edge_table.write(tmp_path)
    edge_table = hl.read_table(tmp_path)

    # --- Compute the independent set through the Java Graph utility ----------
    graph = Env.hail().utils.Graph
    backend = Env.spark_backend('maximal_independent_set')
    edges_java_ir = backend._to_java_value_ir(edge_table.collect(_localize=False)._ir)
    mis_java_ir = graph.pyMaximalIndependentSet(
        edges_java_ir, node_t._parsable_string(), tie_breaker_str)
    mis_nodes = construct_expr(ir.JavaIR(mis_java_ir), hl.tset(node_t))

    # --- One row per edge endpoint, filtered on set membership ---------------
    result = edge_table.select(node=[edge_table.__i, edge_table.__j])
    result = result.explode(result.node)
    result = result.annotate_globals(mis_nodes=mis_nodes)
    result = result.filter(result.mis_nodes.contains(result.node), keep)
    result = result.select_globals()

    if not keyed:
        return result
    return result.key_by('node').distinct()