def super_unify_types(*ts):
    """Unify several (possibly partial) types into a single type.

    ``None`` arguments are ignored.  Returns ``None`` when nothing remains
    after dropping them, when the remaining types are not all instances of
    the first one's class, or when non-container types are not all equal.
    All-numeric inputs are delegated to ``unify_types_limited``.  Arrays,
    sets, dicts and structs are unified component-wise by recursion, so a
    component of the result may itself be ``None``.
    """
    known = [candidate for candidate in ts if candidate is not None]
    if not known:
        return None

    first = known[0]

    if all(is_numeric(candidate) for candidate in known):
        return unify_types_limited(*known)

    # Every remaining type must be the same kind of type as the first one.
    if not all(isinstance(candidate, type(first)) for candidate in known):
        return None

    def unify_attr(attr_name):
        # Recursively unify one named component across all candidate types.
        return super_unify_types(
            *[getattr(candidate, attr_name) for candidate in known])

    if isinstance(first, tarray):
        return tarray(unify_attr('element_type'))

    if isinstance(first, tset):
        return tset(unify_attr('element_type'))

    if isinstance(first, tdict):
        unified_key = unify_attr('key_type')
        unified_value = unify_attr('value_type')
        return tdict(unified_key, unified_value)

    if isinstance(first, tstruct):
        # Union of all field names; a field absent from a struct contributes
        # None, which the recursive call then ignores.
        unified_fields = {}
        for candidate in known:
            for field_name in candidate.fields:
                unified_fields[field_name] = super_unify_types(
                    *[other.get(field_name, None) for other in known])
        return tstruct(**unified_fields)

    return first if all(first == candidate for candidate in known) else None
def _get_other_cols(row: StructExpression) -> List[Column]:
    """Build the extra column expressions derived from optional row fields.

    Each of ``cm_position``, ``qual``, ``filters`` and ``info`` contributes
    columns only when the field is present in ``row`` and has the expected
    dtype.  The columns are returned in that order.
    """
    assert check_argument_types()

    extra_columns = []

    has_position = 'cm_position' in row and row.cm_position.dtype == tfloat64
    if has_position:
        position_col = fx.col("cm_position")
        extra_columns.append(position_col.alias("position"))

    has_qual = 'qual' in row and row.qual.dtype == tfloat64
    if has_qual:
        # A qual of -10 stands for a missing value.
        qual_col = fx.expr("if(qual = -10, null, qual)")
        extra_columns.append(qual_col.alias("qual"))

    # Empty filters mean PASS; null filters mean missing.
    has_filters = 'filters' in row and row.filters.dtype == tset(tstr)
    if has_filters:
        filters_col = fx.expr("if(size(filters) = 0, array('PASS'), filters)")
        extra_columns.append(filters_col.alias("filters"))

    # Expose every info.* column under the name INFO_*.
    has_info = 'info' in row and isinstance(row.info.dtype, tstruct)
    if has_info:
        extra_columns.extend(
            fx.col(f"`info.{field_name}`").alias(f"INFO_{field_name}")
            for field_name in row.info)

    assert check_return_type(extra_columns)
    return extra_columns
def _impute_type(x, partial_type):
    """Infer the Hail type of ``x``, using ``partial_type`` to fill in gaps.

    ``partial_type`` is either ``None`` or a possibly incomplete type whose
    unknown components are ``None``.  It is used wherever the value alone
    does not determine a type: empty lists, sets and mappings and missing
    values return it as-is, and its element / key / value / field types are
    handed down to the recursive calls for container contents.

    Returns:
        The imputed type.  The result may be ``None``, or contain ``None``
        components, when neither the value nor ``partial_type`` determines
        them.

    Raises:
        ValueError: ``x`` is an int outside the ``tint64`` range.
        ExpressionException: ``partial_type`` is incompatible with the kind
            of value found, the elements / keys / values of a container
            cannot be unified, ``x`` is an unfinished case/switch builder,
            or ``x`` is of a kind that is not handled.
    """
    from hail.genetics import Locus, Call
    from hail.utils import Interval, Struct

    def refine(t, refined):
        # Use the caller's partial type when given, after checking that it is
        # the same kind of type as the one implied by the value; otherwise
        # fall back to the blank template ``refined``.
        if t is None:
            return refined
        if not isinstance(t, type(refined)):
            raise ExpressionException(
                "Incompatible partial_type, {}, for value {}".format(
                    partial_type, x))
        return t

    if isinstance(x, Expression):
        return x.dtype
    elif isinstance(x, bool):
        # bool is tested before int because bool is a subclass of int.
        return tbool
    elif isinstance(x, int):
        if hl.tint32.min_value <= x <= hl.tint32.max_value:
            return tint32
        elif hl.tint64.min_value <= x <= hl.tint64.max_value:
            return tint64
        else:
            raise ValueError(
                "Hail has no integer data type large enough to store {}".
                format(x))
    elif isinstance(x, float):
        return tfloat64
    elif isinstance(x, str):
        return tstr
    elif isinstance(x, Locus):
        return tlocus(x.reference_genome)
    elif isinstance(x, Interval):
        return tinterval(x.point_type)
    elif isinstance(x, Call):
        return tcall
    elif isinstance(x, Struct) or (isinstance(x, dict)
                                   and isinstance(partial_type, tstruct)):
        # A plain dict is treated as a struct only when the caller asked for
        # a struct through partial_type.
        partial_type = refine(partial_type, hl.tstruct())
        t = tstruct(**{k: _impute_type(x[k], partial_type.get(k)) for k in x})
        return t
    elif isinstance(x, tuple):
        partial_type = refine(partial_type, hl.ttuple())
        # Positions beyond the end of the partial tuple type get no hint.
        return ttuple(*[
            _impute_type(
                element,
                partial_type[index] if index < len(partial_type) else None)
            for index, element in enumerate(x)
        ])
    elif isinstance(x, list):
        partial_type = refine(partial_type, hl.tarray(None))
        if len(x) == 0:
            return partial_type
        ts = {
            _impute_type(element, partial_type.element_type)
            for element in x
        }
        unified_type = super_unify_types(*ts)
        if unified_type is None:
            raise ExpressionException(
                "Hail does not support heterogeneous arrays: "
                "found list with elements of types {} ".format(list(ts)))
        return tarray(unified_type)
    elif is_setlike(x):
        partial_type = refine(partial_type, hl.tset(None))
        if len(x) == 0:
            return partial_type
        ts = {
            _impute_type(element, partial_type.element_type)
            for element in x
        }
        unified_type = super_unify_types(*ts)
        # Compare with None rather than relying on truthiness: type objects
        # can have length zero (len() is applied to a tuple type above), and
        # such a type is falsy even though unification succeeded.
        if unified_type is None:
            raise ExpressionException(
                "Hail does not support heterogeneous sets: "
                "found set with elements of types {} ".format(list(ts)))
        return tset(unified_type)
    elif isinstance(x, Mapping):
        # Remember whether the caller supplied a type before it is replaced
        # by the blank dict template.
        user_partial_type = partial_type
        partial_type = refine(partial_type, hl.tdict(None, None))
        if len(x) == 0:
            return partial_type
        kts = {
            _impute_type(element, partial_type.key_type)
            for element in x.keys()
        }
        vts = {
            _impute_type(element, partial_type.value_type)
            for element in x.values()
        }
        unified_key_type = super_unify_types(*kts)
        unified_value_type = super_unify_types(*vts)
        # As in the set branch, only None signals a failed unification.
        if unified_key_type is None:
            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with keys {} of types {} ".format(
                    list(x.keys()), list(kts)))
        if unified_value_type is None:
            # A str-keyed mapping with mixed value types is imputed as a
            # struct, but only when the caller gave no type of their own.
            if unified_key_type == hl.tstr and user_partial_type is None:
                return tstruct(**{k: _impute_type(x[k], None) for k in x})
            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with values of types {} ".format(list(vts)))
        return tdict(unified_key_type, unified_value_type)
    elif isinstance(x, np.generic):
        return from_numpy(x.dtype)
    elif isinstance(x, np.ndarray):
        element_type = from_numpy(x.dtype)
        return tndarray(element_type, x.ndim)
    elif x is None or pd.isna(x):
        # A missing value carries no type information of its own.
        return partial_type
    elif isinstance(
            x, (hl.expr.builders.CaseBuilder, hl.expr.builders.SwitchBuilder)):
        raise ExpressionException(
            "'switch' and 'case' expressions must end with a call to either "
            "'default' or 'or_missing'")
    else:
        raise ExpressionException(
            "Hail cannot automatically impute type of {}: {}".format(
                type(x), x))
def impute_type(x):
    """Infer the Hail type of a Python value from the value alone.

    Expressions report their own ``dtype``; Python scalars, loci, intervals,
    calls, structs, tuples, lists, sets, mappings and numpy scalars / arrays
    are mapped to the corresponding Hail type, recursing into containers.

    Raises:
        ValueError: ``x`` is an int outside the ``tint64`` range.
        ExpressionException: ``x`` is an empty list, set or mapping, its
            elements / keys / values cannot be unified to one type, ``x`` is
            ``None``, ``x`` is an unfinished case/switch builder, or ``x`` is
            of a kind that is not handled.
    """
    from hail.genetics import Locus, Call
    from hail.utils import Interval, Struct

    if isinstance(x, Expression):
        return x.dtype
    elif isinstance(x, bool):
        # bool is tested before int because bool is a subclass of int.
        return tbool
    elif isinstance(x, int):
        if hl.tint32.min_value <= x <= hl.tint32.max_value:
            return tint32
        elif hl.tint64.min_value <= x <= hl.tint64.max_value:
            return tint64
        else:
            raise ValueError(
                "Hail has no integer data type large enough to store {}".
                format(x))
    elif isinstance(x, float):
        return tfloat64
    elif isinstance(x, str):
        return tstr
    elif isinstance(x, Locus):
        return tlocus(x.reference_genome)
    elif isinstance(x, Interval):
        return tinterval(x.point_type)
    elif isinstance(x, Call):
        return tcall
    elif isinstance(x, Struct):
        return tstruct(**{k: impute_type(x[k]) for k in x})
    elif isinstance(x, tuple):
        return ttuple(*(impute_type(element) for element in x))
    elif isinstance(x, list):
        if len(x) == 0:
            raise ExpressionException(
                "Cannot impute type of empty list. Use 'hl.empty_array' to create an empty array."
            )
        ts = {impute_type(element) for element in x}
        unified_type = unify_types_limited(*ts)
        if unified_type is None:
            raise ExpressionException(
                "Hail does not support heterogeneous arrays: "
                "found list with elements of types {} ".format(list(ts)))
        return tarray(unified_type)
    elif isinstance(x, set):
        if len(x) == 0:
            raise ExpressionException(
                "Cannot impute type of empty set. Use 'hl.empty_set' to create an empty set."
            )
        ts = {impute_type(element) for element in x}
        unified_type = unify_types_limited(*ts)
        # Compare with None, as the list branch does: a successfully unified
        # type object may still be falsy (for instance a type with no
        # components, such as the one imputed for an empty tuple), and a
        # truthiness test would wrongly report it as heterogeneous.
        if unified_type is None:
            raise ExpressionException(
                "Hail does not support heterogeneous sets: "
                "found set with elements of types {} ".format(list(ts)))
        return tset(unified_type)
    elif isinstance(x, Mapping):
        if len(x) == 0:
            raise ExpressionException(
                "Cannot impute type of empty dict. Use 'hl.empty_dict' to create an empty dict."
            )
        kts = {impute_type(element) for element in x.keys()}
        vts = {impute_type(element) for element in x.values()}
        unified_key_type = unify_types_limited(*kts)
        unified_value_type = unify_types_limited(*vts)
        # Only None signals a failed unification (see the set branch).
        if unified_key_type is None:
            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with keys of types {} ".format(list(kts)))
        if unified_value_type is None:
            raise ExpressionException(
                "Hail does not support heterogeneous dicts: "
                "found dict with values of types {} ".format(list(vts)))
        return tdict(unified_key_type, unified_value_type)
    elif isinstance(x, np.generic):
        return from_numpy(x.dtype)
    elif isinstance(x, np.ndarray):
        element_type = from_numpy(x.dtype)
        return tndarray(element_type, x.ndim)
    elif x is None:
        raise ExpressionException("Hail cannot impute the type of 'None'")
    elif isinstance(
            x, (hl.expr.builders.CaseBuilder, hl.expr.builders.SwitchBuilder)):
        raise ExpressionException(
            "'switch' and 'case' expressions must end with a call to either "
            "'default' or 'or_missing'")
    else:
        raise ExpressionException(
            "Hail cannot automatically impute type of {}: {}".format(
                type(x), x))