def visit_tuple(self, node, visited_children):
    """Build an ``hl.ttuple`` from the visited children of a tuple node.

    ``visited_children`` must unpack into five items; only the fourth is
    used, and it must itself be a one-element sequence wrapping the
    (possibly empty) element-type match.

    When that match is falsy an empty tuple type is returned. Otherwise it
    is unpacked as ``[first, rest]``, where ``rest`` is an iterable of
    ``(comma, type)`` pairs, and the tuple type is built from ``first``
    followed by every type in ``rest``.
    """
    _keyword, _, _open_paren, [maybe_types], _close_paren = visited_children

    # Guard clause: no element types were matched.
    if not maybe_types:
        return hl.ttuple()

    [first, rest] = maybe_types
    element_types = [first]
    for _comma, element_type in rest:
        element_types.append(element_type)
    return hl.ttuple(*element_types)
def blockmatrix_irs(self):
    """Return a list of BlockMatrix IR nodes covering many node kinds.

    The list contains reads (native and persisted), element-wise maps,
    value-to-block-matrix conversions, broadcasts, a dot product, the three
    sparsifier variants, a densify and a slice. Intermediate value IRs
    (scalars, literals, element-wise bodies) are built locally and are not
    part of the returned list.
    """
    # Value IR operands.
    f64_scalar = ir.F64(2)
    f64_vector = ir.MakeArray([ir.F64(3), ir.F64(2)], hl.tarray(hl.tfloat64))

    # Base block matrix read from the example resource.
    native_reader = ir.BlockMatrixNativeReader(
        resource('blockmatrix_example/0'))
    bm = ir.BlockMatrixRead(native_reader)

    # Element-wise maps over the read matrix.
    sum_body = ir.ApplyBinaryPrimOp('+', ir.Ref('l'), ir.Ref('r'))
    bm_plus_bm = ir.BlockMatrixMap2(bm, bm, 'l', 'r', sum_body, "Union")

    negate_body = ir.ApplyUnaryPrimOp('-', ir.Ref('element'))
    bm_negated = ir.BlockMatrixMap(bm, 'element', negate_body, False)

    sqrt_body = hl.sqrt(construct_expr(ir.Ref('element'), hl.tfloat64))._ir
    bm_sqrt = ir.BlockMatrixMap(bm, 'element', sqrt_body, False)

    bm_persisted = ir.BlockMatrixRead(ir.BlockMatrixPersistReader('x', bm))

    # Values lifted into block matrices.
    bm_from_scalar = ir.ValueToBlockMatrix(f64_scalar, [1, 1], 1)
    bm_from_col = ir.ValueToBlockMatrix(f64_vector, [2, 1], 1)
    bm_from_row = ir.ValueToBlockMatrix(f64_vector, [1, 2], 1)

    # Broadcasts of those matrices to shape [2, 2].
    bcast_scalar = ir.BlockMatrixBroadcast(bm_from_scalar, [], [2, 2], 256)
    bcast_col = ir.BlockMatrixBroadcast(bm_from_col, [0], [2, 2], 256)
    bcast_row = ir.BlockMatrixBroadcast(bm_from_row, [1], [2, 2], 256)
    transposed = ir.BlockMatrixBroadcast(bcast_scalar, [1, 0], [2, 2], 256)
    product = ir.BlockMatrixDot(bcast_scalar, transposed)

    # Literals that parameterise the sparsifiers.
    rectangle_lit = ir.Literal(hl.tarray(hl.tint64), [0, 1, 5, 6])
    band_lit = ir.Literal(hl.ttuple(hl.tint64, hl.tint64), (-1, 1))
    intervals_type = hl.ttuple(hl.tarray(hl.tint64), hl.tarray(hl.tint64))
    intervals_lit = ir.Literal(intervals_type, ([0, 1, 5, 6], [5, 6, 8, 9]))

    sparse_rectangle = ir.BlockMatrixSparsify(
        bm, rectangle_lit, ir.RectangleSparsifier)
    sparse_band = ir.BlockMatrixSparsify(
        bm, band_lit, ir.BandSparsifier(True))
    sparse_intervals = ir.BlockMatrixSparsify(
        bm, intervals_lit, ir.RowIntervalSparsifier(True))

    dense = ir.BlockMatrixDensify(bm)

    # l ** r, built through the expression layer and unwrapped to IR.
    lhs = construct_expr(ir.Ref('l'), hl.tfloat64)
    rhs = construct_expr(ir.Ref('r'), hl.tfloat64)
    power_body = (lhs ** rhs)._ir
    bm_squared = ir.BlockMatrixMap2(
        bm_from_scalar, bm_from_scalar, 'l', 'r', power_body, "NeedsDense")

    sliced = ir.BlockMatrixSlice(product, [slice(0, 2, 1), slice(0, 1, 1)])

    return [
        bm,
        bm_persisted,
        bm_plus_bm,
        bm_negated,
        bm_sqrt,
        bm_from_scalar,
        bm_from_col,
        bm_from_row,
        bcast_scalar,
        bcast_col,
        bcast_row,
        bm_squared,
        transposed,
        sparse_rectangle,
        sparse_band,
        sparse_intervals,
        dense,
        product,
        sliced,
    ]
def parse_as_ranksum(string, has_non_ref):
    """Parse a delimited string into an array of (float64, int32) tuples.

    ``string`` is split on the pattern ``r'\\|'``. When ``has_non_ref`` is
    true the last item is dropped. Each remaining item is mapped to:

    - a missing tuple if it is empty or equal to ``'.'``;
    - a missing tuple if splitting it on ``','`` does not give exactly
      two parts;
    - otherwise ``(float64(parts[0]), int32(parts[1]))``.
    """
    typ = hl.ttuple(hl.tfloat64, hl.tint32)

    def parse_pair(ss):
        return hl.cond(
            hl.len(ss) != 2,
            # bad field, possibly 'NaN', just set it null
            hl.null(hl.ttuple(hl.tfloat64, hl.tint32)),
            hl.tuple([hl.float64(ss[0]), hl.int32(ss[1])]))

    def parse_item(s):
        is_blank = (hl.len(s) == 0) | (s == '.')
        return hl.cond(
            is_blank,
            hl.null(typ),
            hl.rbind(s.split(','), parse_pair))

    items = string.split(r'\|')
    items = hl.cond(has_non_ref, items[:-1], items)
    return items.map(parse_item)
def test_parses(self):
    """Check that every value IR from ``self.value_irs()`` parses.

    Each IR is stringified and passed to the JVM ``IRParser`` together
    with an environment mapping reference names to the parsable string
    form of their types.
    """
    ref_types = {
        'c': hl.tbool,
        'a': hl.tarray(hl.tint32),
        'aa': hl.tarray(hl.tarray(hl.tint32)),
        'da': hl.tarray(hl.ttuple(hl.tint32, hl.tstr)),
        'v': hl.tint32,
        's': hl.tstruct(x=hl.tint32, y=hl.tint64, z=hl.tfloat64),
        't': hl.ttuple(hl.tint32, hl.tint64, hl.tfloat64),
        'call': hl.tcall,
        'x': hl.tint32,
    }

    # The parser is given type strings rather than Python type objects.
    env = {}
    for name, typ in ref_types.items():
        env[name] = typ._parsable_string()

    for value_ir in self.value_irs():
        Env.hail().expr.ir.IRParser.parse_value_ir(str(value_ir), env, {})
def test_parses(self):
    """Check that every value IR from ``self.value_irs()`` parses.

    Each IR is stringified and passed to the JVM ``IRParser`` together
    with an environment mapping reference names to the ``_jtype`` of
    their types.
    """
    ref_types = {
        'c': hl.tbool,
        'a': hl.tarray(hl.tint32),
        'aa': hl.tarray(hl.tarray(hl.tint32)),
        'da': hl.tarray(hl.ttuple(hl.tint32, hl.tstr)),
        'v': hl.tint32,
        's': hl.tstruct(x=hl.tint32, y=hl.tint64, z=hl.tfloat64),
        't': hl.ttuple(hl.tint32, hl.tint64, hl.tfloat64),
        'call': hl.tcall,
        'x': hl.tint32,
    }

    # The parser is given each type's ``_jtype`` attribute.
    env = {}
    for name, typ in ref_types.items():
        env[name] = typ._jtype

    for value_ir in self.value_irs():
        Env.hail().expr.ir.IRParser.parse_value_ir(str(value_ir), env, {})
def test_parses(self):
    """Check that every value IR from ``self.value_irs()`` parses.

    Each IR is stringified and handed to the Spark backend's
    ``_parse_value_ir`` along with an environment mapping reference names
    to their Hail types.
    """
    env = {}
    env['c'] = hl.tbool
    env['a'] = hl.tarray(hl.tint32)
    env['st'] = hl.tstream(hl.tint32)
    env['aa'] = hl.tarray(hl.tarray(hl.tint32))
    env['sta'] = hl.tstream(hl.tarray(hl.tint32))
    env['da'] = hl.tarray(hl.ttuple(hl.tint32, hl.tstr))
    env['nd'] = hl.tndarray(hl.tfloat64, 1)
    env['v'] = hl.tint32
    env['s'] = hl.tstruct(x=hl.tint32, y=hl.tint64, z=hl.tfloat64)
    env['t'] = hl.ttuple(hl.tint32, hl.tint64, hl.tfloat64)
    env['call'] = hl.tcall
    env['x'] = hl.tint32

    for value_ir in self.value_irs():
        backend = Env.spark_backend('ValueIRTests.test_parses')
        backend._parse_value_ir(str(value_ir), env)
def qr(nd, mode="reduced"):
    """Compute a QR decomposition of a 2-dimensional ndarray.

    :param nd: A 2 dimensional ndarray, shape(M, N)
    :param mode: One of "reduced", "complete", "r", or "raw".

        With K = min(M, N):

        - `reduced`: returns q and r with dimensions (M, K), (K, N)
        - `complete`: returns q and r with dimensions (M, M), (M, N)
        - `r`: returns only r with dimensions (K, N)
        - `raw`: returns h, tau with dimensions (N, M), (K,)

    Returns
    -------
    - q: ndarray of float64
        A matrix with orthonormal columns.
    - r: ndarray of float64
        The upper-triangular matrix R.
    - (h, tau): ndarrays of float64
        The array h contains the Householder reflectors that generate q
        along with r. The tau array contains scaling factors for the
        reflectors

    Raises
    ------
    ValueError
        If ``mode`` is not one of the four recognised values.
    """
    assert nd.ndim == 2, "QR decomposition requires 2 dimensional ndarray"

    if mode not in ["reduced", "r", "raw", "complete"]:
        raise ValueError(f"Unrecognized mode '{mode}' for QR decomposition")

    # Elements are converted to float64 before the IR node is built.
    as_float = nd.map(lambda x: hl.float64(x))
    qr_ir = NDArrayQR(as_float._ir, mode)

    # Only the result type differs between modes.
    if mode == "raw":
        result_type = hl.ttuple(hl.tndarray(hl.tfloat64, 2),
                                hl.tndarray(hl.tfloat64, 1))
    elif mode == "r":
        result_type = hl.tndarray(hl.tfloat64, 2)
    else:
        # "complete" or "reduced" -- the validation above leaves no others.
        result_type = hl.ttuple(hl.tndarray(hl.tfloat64, 2),
                                hl.tndarray(hl.tfloat64, 2))

    return construct_expr(qr_ir, result_type)
def test_ndarray_mixed():
    """Check that a missing value makes a chained ndarray pipeline None.

    Three cases: a missing ndarray at the start of the chain, a missing
    shape tuple passed to ``reshape`` mid-chain, and an ``or_missing`` with
    a false condition wrapped around an ndarray.
    """
    # Case 1: the source ndarray itself is missing.
    missing_nd = hl.null(hl.tndarray(hl.tint64, 2))
    squared = missing_nd.map(lambda x: x * x)
    assert hl.eval(squared.reshape((4, 5)).T) is None

    # Case 2: a defined ndarray is reshaped with a missing shape.
    shifted_zeros = hl.nd.zeros((5, 10)).map(lambda x: x - 2)
    shifted_ones = hl.nd.ones((5, 10)).map(lambda x: x + 5)
    missing_shape = hl.null(hl.ttuple(hl.tint64, hl.tint64))
    total = shifted_zeros + shifted_ones
    assert hl.eval(total.reshape(missing_shape).T.reshape((10, 5))) is None

    # Case 3: or_missing(False, ...) hides an otherwise defined ndarray.
    doubled = hl.nd.array(np.arange(10)).reshape((5, 2)).map(lambda x: x * 2)
    hidden = hl.or_missing(False, doubled)
    assert hl.eval(hidden.map(lambda y: y * 2)) is None
def create_all_values():
    """Return a struct expression with one field per covered value kind.

    The struct holds numeric, struct, set, dict, locus, interval, call and
    tuple fields. Most kinds appear twice: once with a defined value and
    once as a missing value (the ``m``-prefixed field).
    """
    # Insertion order of this dict becomes the field order of the struct.
    fields = {}
    fields['f32'] = hl.float32(3.14)
    fields['i64'] = hl.int64(-9)
    fields['m'] = hl.null(hl.tfloat64)
    fields['astruct'] = hl.struct(a=hl.null(hl.tint32), b=5.5)
    fields['mstruct'] = hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr))
    fields['aset'] = hl.set(['foo', 'bar', 'baz'])
    fields['mset'] = hl.null(hl.tset(hl.tfloat64))
    fields['d'] = hl.dict({
        hl.array(['a', 'b']): 0.5,
        hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
    })
    fields['md'] = hl.null(hl.tdict(hl.tint32, hl.tstr))
    fields['h38'] = hl.locus('chr22', 33878978, 'GRCh38')
    fields['ml'] = hl.null(hl.tlocus('GRCh37'))
    fields['i'] = hl.interval(hl.locus('1', 999), hl.locus('1', 1001))
    fields['c'] = hl.call(0, 1)
    fields['mc'] = hl.null(hl.tcall)
    fields['t'] = hl.tuple(
        [hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)])
    fields['mt'] = hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool))
    return hl.struct(**fields)
def create_all_values_datasets():
    """Build a table and a matrix table annotated with an all-values struct.

    The struct has one field per covered value kind, most in both defined
    and missing form. The table (5 rows, 3 partitions) carries the fields as
    ``global_``-prefixed globals and as unprefixed row fields. The matrix
    table (3 rows by 2 columns, 2 partitions) carries them as globals, row,
    column and entry fields with matching prefixes. Both are cached.

    Returns
    -------
    tuple
        ``(all_values_table, all_values_matrix_table)``
    """
    # Insertion order of this dict becomes the field order of the struct.
    fields = {}
    fields['f32'] = hl.float32(3.14)
    fields['i64'] = hl.int64(-9)
    fields['m'] = hl.null(hl.tfloat64)
    fields['astruct'] = hl.struct(a=hl.null(hl.tint32), b=5.5)
    fields['mstruct'] = hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr))
    fields['aset'] = hl.set(['foo', 'bar', 'baz'])
    fields['mset'] = hl.null(hl.tset(hl.tfloat64))
    fields['d'] = hl.dict({
        hl.array(['a', 'b']): 0.5,
        hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
    })
    fields['md'] = hl.null(hl.tdict(hl.tint32, hl.tstr))
    fields['h38'] = hl.locus('chr22', 33878978, 'GRCh38')
    fields['ml'] = hl.null(hl.tlocus('GRCh37'))
    fields['i'] = hl.interval(hl.locus('1', 999), hl.locus('1', 1001))
    fields['c'] = hl.call(0, 1)
    fields['mc'] = hl.null(hl.tcall)
    fields['t'] = hl.tuple(
        [hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)])
    fields['mt'] = hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool))
    all_values = hl.struct(**fields)

    def prefix(s, p):
        # Copy every field of ``s`` under the name ``p + <field name>``.
        renamed = {}
        for k in s:
            renamed[p + k] = s[k]
        return hl.struct(**renamed)

    table = hl.utils.range_table(5, n_partitions=3)
    table = table.annotate_globals(**prefix(all_values, 'global_'))
    table = table.annotate(**all_values)
    all_values_table = table.cache()

    matrix = hl.utils.range_matrix_table(3, 2, n_partitions=2)
    matrix = matrix.annotate_globals(**prefix(all_values, 'global_'))
    matrix = matrix.annotate_rows(**prefix(all_values, 'row_'))
    matrix = matrix.annotate_cols(**prefix(all_values, 'col_'))
    matrix = matrix.annotate_entries(**prefix(all_values, 'entry_'))
    all_values_matrix_table = matrix.cache()

    return all_values_table, all_values_matrix_table
def create_all_values_datasets():
    """Build a table and a matrix table annotated with an all-values struct.

    The struct has one field per covered value kind, most in both defined
    and missing form. The table (5 rows, 3 partitions) carries the fields as
    ``global_``-prefixed globals and as unprefixed row fields. The matrix
    table (3 rows by 2 columns, 2 partitions) carries them as globals, row,
    column and entry fields with matching prefixes. Both are cached.

    Returns
    -------
    tuple
        ``(all_values_table, all_values_matrix_table)``
    """
    # Field values are built one by one in struct order, then assembled.
    f32 = hl.float32(3.14)
    i64 = hl.int64(-9)
    m = hl.null(hl.tfloat64)
    astruct = hl.struct(a=hl.null(hl.tint32), b=5.5)
    mstruct = hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr))
    aset = hl.set(['foo', 'bar', 'baz'])
    mset = hl.null(hl.tset(hl.tfloat64))
    d = hl.dict({
        hl.array(['a', 'b']): 0.5,
        hl.array(['x', hl.null(hl.tstr), 'z']): 0.3,
    })
    md = hl.null(hl.tdict(hl.tint32, hl.tstr))
    h38 = hl.locus('chr22', 33878978, 'GRCh38')
    ml = hl.null(hl.tlocus('GRCh37'))
    i = hl.interval(hl.locus('1', 999), hl.locus('1', 1001))
    c = hl.call(0, 1)
    mc = hl.null(hl.tcall)
    t = hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)])
    mt = hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool))

    all_values = hl.struct(
        f32=f32, i64=i64, m=m,
        astruct=astruct, mstruct=mstruct,
        aset=aset, mset=mset,
        d=d, md=md,
        h38=h38, ml=ml,
        i=i,
        c=c, mc=mc,
        t=t, mt=mt,
    )

    def prefix(s, p):
        # Copy every field of ``s`` under the name ``p + <field name>``.
        renamed = {}
        for k in s:
            renamed[p + k] = s[k]
        return hl.struct(**renamed)

    base_table = hl.utils.range_table(5, n_partitions=3)
    with_globals = base_table.annotate_globals(
        **prefix(all_values, 'global_'))
    all_values_table = with_globals.annotate(**all_values).cache()

    base_matrix = hl.utils.range_matrix_table(3, 2, n_partitions=2)
    with_globals = base_matrix.annotate_globals(
        **prefix(all_values, 'global_'))
    with_rows = with_globals.annotate_rows(**prefix(all_values, 'row_'))
    with_cols = with_rows.annotate_cols(**prefix(all_values, 'col_'))
    with_entries = with_cols.annotate_entries(
        **prefix(all_values, 'entry_'))
    all_values_matrix_table = with_entries.cache()

    return all_values_table, all_values_matrix_table
def _impute_type(x, partial_type):
    """Impute the Hail type of the Python value ``x``.

    Parameters
    ----------
    x
        Value to inspect. Handled kinds, in the order they are tested:
        ``Expression``, ``bool``, ``int``, ``float``, ``str``, ``Locus``,
        ``Interval``, ``Call``, ``Struct`` (or ``dict`` when
        ``partial_type`` is a ``tstruct``), ``tuple``, ``list``, set-like
        values, ``Mapping``, numpy scalars, numpy arrays, ``None`` / NA,
        and case/switch builders (which are rejected).
    partial_type
        Optional type hint, or ``None``. For containers it must be an
        instance of the matching container type; its element, key, value
        and field types are passed down as hints for nested values. It is
        returned as-is for empty containers and for ``None`` / NA values.

    Returns
    -------
    The imputed Hail type. This may be ``partial_type`` itself, including
    ``None`` when no hint was given and ``x`` is ``None`` or NA.

    Raises
    ------
    ValueError
        If ``x`` is an ``int`` outside the ``tint64`` range.
    ExpressionException
        If ``partial_type`` is incompatible with ``x``, if a list, set or
        dict has elements whose types cannot be unified, if ``x`` is an
        unfinished case/switch builder, or if no rule matches ``x``.
    """
    from hail.genetics import Locus, Call
    from hail.utils import Interval, Struct

    def refine(t, refined):
        # Use ``refined`` when there is no hint; otherwise the hint must be
        # an instance of the same type class as ``refined``.
        if t is None:
            return refined
        if not isinstance(t, type(refined)):
            raise ExpressionException(
                "Incompatible partial_type, {}, for value {}".format(
                    partial_type, x))
        return t

    if isinstance(x, Expression):
        return x.dtype
    elif isinstance(x, bool):
        # Tested before ``int`` because ``bool`` is a subclass of ``int``.
        return tbool
    elif isinstance(x, int):
        if hl.tint32.min_value <= x <= hl.tint32.max_value:
            return tint32
        elif hl.tint64.min_value <= x <= hl.tint64.max_value:
            return tint64
        else:
            raise ValueError(
                "Hail has no integer data type large enough to store {}".
                format(x))
    elif isinstance(x, float):
        return tfloat64
    elif isinstance(x, str):
        return tstr
    elif isinstance(x, Locus):
        return tlocus(x.reference_genome)
    elif isinstance(x, Interval):
        return tinterval(x.point_type)
    elif isinstance(x, Call):
        return tcall
    elif isinstance(x, Struct) or isinstance(x, dict) and isinstance(
            partial_type, tstruct):
        # ``and`` binds tighter than ``or``: a plain dict is treated as a
        # struct only when the hint says so.
        partial_type = refine(partial_type, hl.tstruct())
        t = tstruct(**{k: _impute_type(x[k], partial_type.get(k)) for k in x})
        return t
    elif isinstance(x, tuple):
        partial_type = refine(partial_type, hl.ttuple())
        # The hint may be shorter than the tuple; positions without a hint
        # are imputed with ``None``.
        return ttuple(*[
            _impute_type(
                element,
                partial_type[index] if index < len(partial_type) else None)
            for index, element in enumerate(x)
        ])
    elif isinstance(x, list):
        partial_type = refine(partial_type, hl.tarray(None))
        if len(x) == 0:
            return partial_type
        ts = {
            _impute_type(element, partial_type.element_type)
            for element in x
        }
        unified_type = super_unify_types(*ts)
        if unified_type is None:
            raise ExpressionException(
                "Hail does not support heterogeneous arrays: "
                "found list with elements of types {} ".format(list(ts)))
        return tarray(unified_type)
    elif is_setlike(x):
        partial_type = refine(partial_type, hl.tset(None))
        if len(x) == 0:
            return partial_type
        ts = {
            _impute_type(element, partial_type.element_type)
            for element in x
        }
        unified_type = super_unify_types(*ts)
        if not unified_type:
            raise ExpressionException(
                "Hail does not support heterogeneous sets: "
                "found set with elements of types {} ".format(list(ts)))
        return tset(unified_type)
    elif isinstance(x, Mapping):
        # Remember whether the caller supplied a hint before it is refined.
        user_partial_type = partial_type
        partial_type = refine(partial_type, hl.tdict(None, None))
        if len(x) == 0:
            return partial_type
        kts = {
            _impute_type(element, partial_type.key_type)
            for element in x.keys()
        }
        vts = {
            _impute_type(element, partial_type.value_type)
            for element in x.values()
        }
        unified_key_type = super_unify_types(*kts)
        unified_value_type = super_unify_types(*vts)
        if not unified_key_type:
            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with keys {} of types {} ".format(
                    list(x.keys()), list(kts)))
        if not unified_value_type:
            # String-keyed mapping with mixed value types and no hint:
            # fall back to a struct type.
            if unified_key_type == hl.tstr and user_partial_type is None:
                return tstruct(**{k: _impute_type(x[k], None) for k in x})
            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with values of types {} ".format(list(vts)))
        return tdict(unified_key_type, unified_value_type)
    elif isinstance(x, np.generic):
        return from_numpy(x.dtype)
    elif isinstance(x, np.ndarray):
        element_type = from_numpy(x.dtype)
        return tndarray(element_type, x.ndim)
    elif x is None or pd.isna(x):
        return partial_type
    elif isinstance(
            x,
            (hl.expr.builders.CaseBuilder, hl.expr.builders.SwitchBuilder)):
        # The first literal ends in a space: adjacent literals are
        # concatenated verbatim, and without it the message read
        # "...to either'default' or 'or_missing'".
        raise ExpressionException(
            "'switch' and 'case' expressions must end with a call to either "
            "'default' or 'or_missing'")
    else:
        raise ExpressionException(
            "Hail cannot automatically impute type of {}: {}".format(
                type(x), x))
def test_ndarray_reshape():
    """Check ``reshape`` on Hail ndarrays against numpy, plus error cases.

    Valid reshapes (including ``-1`` wildcards, zero-size arrays, chained
    reshapes and a shape given as a Hail tuple expression) are compared
    pairwise with numpy through ``assert_ndarrays_eq``. A missing ndarray
    or a missing shape must evaluate to ``None``. Invalid shapes must raise
    ``FatalError`` carrying the expected message fragment.

    Error messages are read from ``exc.value``, the raised exception.
    ``exc`` itself is pytest's ``ExceptionInfo`` wrapper, and its string
    form is not the exception message.
    """
    np_single = np.array([8])
    single = hl.nd.array([8])

    np_zero_dim = np.array(4)
    zero_dim = hl.nd.array(4)

    np_a = np.array([1, 2, 3, 4, 5, 6])
    a = hl.nd.array(np_a)

    np_cube = np.array([0, 1, 2, 3, 4, 5, 6, 7]).reshape((2, 2, 2))
    cube = hl.nd.array([0, 1, 2, 3, 4, 5, 6, 7]).reshape((2, 2, 2))
    cube_to_rect = cube.reshape((2, 4))
    np_cube_to_rect = np_cube.reshape((2, 4))
    cube_t_to_rect = cube.transpose((1, 0, 2)).reshape((2, 4))
    np_cube_t_to_rect = np_cube.transpose((1, 0, 2)).reshape((2, 4))

    np_hypercube = np.arange(3 * 5 * 7 * 9).reshape((3, 5, 7, 9))
    hypercube = hl.nd.array(np_hypercube)

    np_shape_zero = np.array([])
    shape_zero = hl.nd.array(np_shape_zero)

    assert_ndarrays_eq(
        (single.reshape(()), np_single.reshape(())),
        (zero_dim.reshape(()), np_zero_dim.reshape(())),
        (zero_dim.reshape((1, )), np_zero_dim.reshape((1, ))),
        (a.reshape((6, )), np_a.reshape((6, ))),
        (a.reshape((2, 3)), np_a.reshape((2, 3))),
        (a.reshape((3, 2)), np_a.reshape((3, 2))),
        (a.reshape((3, -1)), np_a.reshape((3, -1))),
        (a.reshape((-1, 2)), np_a.reshape((-1, 2))),
        (cube_to_rect, np_cube_to_rect),
        (cube_t_to_rect, np_cube_t_to_rect),
        (hypercube.reshape((5, 7, 9, 3)).reshape((7, 9, 3, 5)),
         np_hypercube.reshape((7, 9, 3, 5))),
        (hypercube.reshape(hl.tuple([5, 7, 9, 3])),
         np_hypercube.reshape((5, 7, 9, 3))),
        (shape_zero.reshape((0, 5)), np_shape_zero.reshape((0, 5))),
        (shape_zero.reshape((-1, 5)), np_shape_zero.reshape((-1, 5))))

    # Missingness propagates from either the ndarray or the shape.
    assert hl.eval(hl.null(hl.tndarray(hl.tfloat, 2)).reshape((4, 5))) is None
    assert hl.eval(
        hl.nd.array(hl.range(20)).reshape(
            hl.null(hl.ttuple(hl.tint64, hl.tint64)))) is None

    with pytest.raises(FatalError) as exc:
        hl.eval(hl.literal(np_cube).reshape((-1, -1)))
    assert "more than one -1" in str(exc.value)

    with pytest.raises(FatalError) as exc:
        hl.eval(hl.literal(np_cube).reshape((20, )))
    assert "requested shape is incompatible with number of elements" in str(
        exc.value)

    with pytest.raises(FatalError) as exc:
        hl.eval(a.reshape((3, )))
    assert "requested shape is incompatible with number of elements" in str(
        exc.value)

    with pytest.raises(FatalError) as exc:
        hl.eval(a.reshape(()))
    assert "requested shape is incompatible with number of elements" in str(
        exc.value)

    with pytest.raises(FatalError) as exc:
        hl.eval(hl.literal(np_cube).reshape((0, 2, 2)))
    assert "requested shape is incompatible with number of elements" in str(
        exc.value)

    with pytest.raises(FatalError) as exc:
        hl.eval(hl.literal(np_cube).reshape((2, 2, -2)))
    assert "must contain only nonnegative numbers or -1" in str(exc.value)

    with pytest.raises(FatalError) as exc:
        hl.eval(shape_zero.reshape((0, -1)))
    assert "Can't reshape" in str(exc.value)
def value_irs(self):
    """Return a list of value IR nodes built from shared operands.

    The sibling ``test_parses`` methods in this file iterate over the
    result, stringify each node and hand it to an IR parser. The reference
    names used here ('c', 'a', 'aa', 'da', 'v', 's', 't', 'call', 'x') are
    the same names those tests put in their parsing environment.
    """
    # Shared operands reused across the nodes below.
    b = ir.TrueIR()
    c = ir.Ref('c', hl.tbool)
    i = ir.I32(5)
    j = ir.I32(7)
    st = ir.Str('Hail')
    a = ir.Ref('a', hl.tarray(hl.tint32))
    aa = ir.Ref('aa', hl.tarray(hl.tarray(hl.tint32)))
    da = ir.Ref('da', hl.tarray(hl.ttuple(hl.tint32, hl.tstr)))
    v = ir.Ref('v', hl.tint32)
    s = ir.Ref('s', hl.tstruct(x=hl.tint32, y=hl.tint64, z=hl.tfloat64))
    t = ir.Ref('t', hl.ttuple(hl.tint32, hl.tint64, hl.tfloat64))
    call = ir.Ref('call', hl.tcall)

    # Aggregator signatures passed to the agg/scan/init/seq ops below.
    # NOTE(review): the three list/None arguments look like constructor,
    # init-op and seq-op argument types -- confirm against ir.AggSignature.
    collect_sig = ir.AggSignature('Collect', [], None, [hl.tint32])

    call_stats_sig = ir.AggSignature('CallStats', [], [hl.tint32], [hl.tcall])

    hist_sig = ir.AggSignature('Histogram', [hl.tfloat64, hl.tfloat64, hl.tint32], None, [hl.tfloat64])

    take_by_sig = ir.AggSignature('TakeBy', [hl.tint32], None, [hl.tfloat64, hl.tfloat64])

    value_irs = [
        # Constants, references and basic operators.
        i, ir.I64(5), ir.F32(3.14), ir.F64(3.14), s, ir.TrueIR(), ir.FalseIR(), ir.Void(),
        ir.Cast(i, hl.tfloat64),
        ir.NA(hl.tint32),
        ir.IsNA(i),
        ir.If(b, i, j),
        ir.Let('v', i, v),
        ir.Ref('x', hl.tint32),
        ir.ApplyBinaryOp('+', i, j),
        ir.ApplyUnaryOp('-', i),
        ir.ApplyComparisonOp('EQ', i, j),
        # Array construction, conversion and traversal.
        ir.MakeArray([i, ir.NA(hl.tint32), ir.I32(-3)], hl.tarray(hl.tint32)),
        ir.ArrayRef(a, i),
        ir.ArrayLen(a),
        ir.ArrayRange(ir.I32(0), ir.I32(5), ir.I32(1)),
        ir.ArraySort(a, b, False),
        ir.ToSet(a),
        ir.ToDict(da),
        ir.ToArray(a),
        ir.LowerBoundOnOrderedCollection(a, i, True),
        ir.GroupByKey(da),
        ir.ArrayMap(a, 'v', v),
        ir.ArrayFilter(a, 'v', v),
        ir.ArrayFlatMap(aa, 'v', v),
        ir.ArrayFold(a, ir.I32(0), 'x', 'v', v),
        ir.ArrayScan(a, ir.I32(0), 'x', 'v', v),
        ir.ArrayFor(a, 'v', ir.Void()),
        # Aggregation and scan operations.
        ir.ApplyAggOp(ir.I32(0), [], None, collect_sig),
        ir.ApplyScanOp(ir.I32(0), [], None, collect_sig),
        ir.ApplyAggOp(ir.F64(-2.11), [ir.F64(-5.0), ir.F64(5.0), ir.I32(100)], None, hist_sig),
        ir.ApplyAggOp(call, [], [ir.I32(2)], call_stats_sig),
        ir.ApplyAggOp(ir.F64(-2.11), [ir.I32(10)], None, take_by_sig),
        ir.InitOp(ir.I32(0), [ir.I32(2)], call_stats_sig),
        ir.SeqOp(ir.I32(0), [i], collect_sig),
        ir.SeqOp(ir.I32(0), [ir.F64(-2.11), ir.I32(17)], take_by_sig),
        # Structs, tuples, strings and miscellaneous nodes.
        ir.Begin([ir.Void()]),
        ir.MakeStruct([('x', i)]),
        ir.SelectFields(s, ['x', 'z']),
        ir.InsertFields(s, [('x', i)]),
        ir.GetField(s, 'x'),
        ir.MakeTuple([i, b]),
        ir.GetTupleElement(t, 1),
        ir.StringSlice(st, ir.I32(1), ir.I32(2)),
        ir.StringLength(st),
        ir.In(2, hl.tfloat64),
        ir.Die('mumblefoo', hl.tfloat64),
        ir.Apply('&&', b, c),
        ir.Apply('toFloat64', i),
        ir.Apply('isDefined', s),
        ir.Uniroot('x', ir.F64(3.14), ir.F64(-5.0), ir.F64(5.0)),
        ir.Literal(hl.tarray(hl.tint32), [1, 2, None]),
    ]

    return value_irs