def _get_common_dtype(lhs_dtype, rhs_dtype): if lhs_dtype == rhs_dtype: return lhs_dtype if is_float_dtype(lhs_dtype) or is_float_dtype(rhs_dtype): return get_dtype(float) assert is_integer_dtype(lhs_dtype) and is_integer_dtype(rhs_dtype) return get_dtype(int)
def _get_bin_op_res_type(self, op_name, lhs_dtype, rhs_dtype):
    """
    Determine the data type produced by a binary operation.

    Parameters
    ----------
    op_name : str
        Name of the binary operation.
    lhs_dtype : dtype
        Data type of the left operand.
    rhs_dtype : dtype
        Data type of the right operand.

    Returns
    -------
    dtype
        The common operand type for operations listed in
        ``preserve_dtype_math_ops``, float for operations listed in
        ``promote_to_float_math_ops``, bool for comparisons.

    Raises
    ------
    NotImplementedError
        If `op_name` falls into none of the categories above.
    """
    # Math operations that keep the operands' common type.
    if op_name in self.preserve_dtype_math_ops:
        return _get_common_dtype(lhs_dtype, rhs_dtype)
    # Math operations whose result is always float.
    if op_name in self.promote_to_float_math_ops:
        return get_dtype(float)
    if not is_cmp_op(op_name):
        raise NotImplementedError(f"unsupported binary operation {op_name}")
    return get_dtype(bool)
def _get_common_dtype(lhs_dtype, rhs_dtype): """ Get data type for a binary operation result. Parameters ---------- lhs_dtype : dtype The type of the first operand. rhs_dtype : dtype The type of the second operand. Returns ------- dtype The result data type. """ if lhs_dtype == rhs_dtype: return lhs_dtype if is_float_dtype(lhs_dtype) and ( is_float_dtype(rhs_dtype) or is_integer_dtype(rhs_dtype) ): return get_dtype(float) if is_float_dtype(rhs_dtype) and ( is_float_dtype(lhs_dtype) or is_integer_dtype(lhs_dtype) ): return get_dtype(float) if is_integer_dtype(lhs_dtype) and is_integer_dtype(rhs_dtype): return get_dtype(int) raise TypeError(f"Cannot perform operation on types: {lhs_dtype}, {rhs_dtype}")
def _agg_dtype(agg, dtype):
    """
    Compute the data type of an aggregate result.

    Parameters
    ----------
    agg : str
        Aggregate name.
    dtype : dtype
        Data type of the aggregated operand.

    Returns
    -------
    dtype
        The aggregate's result data type.

    Raises
    ------
    NotImplementedError
        If `agg` is not in any of the known aggregate groups.
    """
    if agg in _aggs_preserving_numeric_type:
        return dtype
    if agg in _aggs_with_int_result:
        return get_dtype(int)
    if agg not in _aggs_with_float_result:
        raise NotImplementedError(f"unsupported aggreagte {agg}")
    return get_dtype(float)
def __init__(self, val):
    """
    Store a literal value together with its data type.

    Parameters
    ----------
    val : int, float, bool, str, numpy signed integer or None
        The literal value. ``None`` is given a float data type.
    """
    assert val is None or isinstance(
        val,
        (int, float, bool, str, np.int8, np.int16, np.int32, np.int64),
    ), f"unsupported literal value {val} of type {type(val)}"
    self.val = val
    # A None literal has no Python type to derive a dtype from; float is used.
    self._dtype = get_dtype(float if val is None else type(val))
def _get_bin_op_res_type(self, op_name, lhs_dtype, rhs_dtype):
    """
    Return the result data type for a binary operation.

    Parameters
    ----------
    op_name : str
        A binary operation name.
    lhs_dtype : dtype
        The left operand's type.
    rhs_dtype : dtype
        The right operand's type.

    Returns
    -------
    dtype
        The result data type.

    Raises
    ------
    NotImplementedError
        If the operation is not a known math or comparison operation.
    """
    if op_name in self.preserve_dtype_math_ops:
        res_dtype = _get_common_dtype(lhs_dtype, rhs_dtype)
    elif op_name in self.promote_to_float_math_ops:
        res_dtype = get_dtype(float)
    elif is_cmp_op(op_name):
        res_dtype = get_dtype(bool)
    else:
        raise NotImplementedError(f"unsupported binary operation {op_name}")
    return res_dtype
def build_row_idx_filter_expr(row_idx, row_col):
    """
    Build an expression selecting rows by their rowid.

    Parameters
    ----------
    row_idx : int or list of int
        Numeric indices of the rows to select.
    row_col : InputRefExpr
        Reference to the rowid column.

    Returns
    -------
    BaseExpr
        A single equality check for a scalar `row_idx`, otherwise an "OR"
        over one equality check per index.
    """
    if is_list_like(row_idx):
        conditions = [row_col.eq(idx) for idx in row_idx]
        return OpExpr("OR", conditions, get_dtype(bool))
    return row_col.eq(row_idx)
def gen_reduce_expr(self):
    """
    Generate the final expression of a compound aggregate.

    The expression combines the count, sum and sum-of-squares columns as
    ``sqrt((sum(x * x) - sum(x) * sum(x) / n) / (n - 1))``.

    Returns
    -------
    BaseExpr
        A final compound aggregate expression.
    """
    n = self._builder._ref(self._arg.modin_frame, self._count_name)
    n._dtype = get_dtype(int)
    total = self._builder._ref(self._arg.modin_frame, self._sum_name)
    total._dtype = self._sum_dtype
    sq_total = self._builder._ref(self._arg.modin_frame, self._quad_sum_name)
    sq_total._dtype = self._sum_dtype

    # Both divisors are replaced by NULL when they would be zero.
    null = LiteralExpr(None)
    n_or_null = build_if_then_else(n.eq(LiteralExpr(0)), null, n, n._dtype)
    n_m_1_or_null = build_if_then_else(
        n.eq(LiteralExpr(1)), null, n.sub(LiteralExpr(1)), n._dtype
    )

    # sqrt((sum(x * x) - sum(x) * sum(x) / n) / (n - 1))
    sum_sq_over_n = total.mul(total).truediv(n_or_null)
    variance = sq_total.sub(sum_sq_over_n).truediv(n_m_1_or_null)
    return variance.pow(LiteralExpr(0.5))
def gen_reduce_expr(self):
    """
    Generate the final expression of a compound aggregate.

    The expression combines the count, sum, sum-of-squares and sum-of-cubes
    columns as::

        n * sqrt(n - 1) / (n - 2)
        * (sum(x ** 3) - 3 * mean * sum(x * x) + 2 * mean * mean * sum(x))
        / (sum(x * x) - mean * sum(x)) ** 1.5

    Returns
    -------
    BaseExpr
        The expression above, or NULL when the count is 2 or less.
    """
    n = self._builder._ref(self._arg.modin_frame, self._count_name)
    n._dtype = get_dtype(int)
    s1 = self._builder._ref(self._arg.modin_frame, self._sum_name)
    s1._dtype = self._sum_dtype
    s2 = self._builder._ref(self._arg.modin_frame, self._quad_sum_name)
    s2._dtype = self._sum_dtype
    s3 = self._builder._ref(self._arg.modin_frame, self._cube_sum_name)
    s3._dtype = self._sum_dtype

    mean = s1.truediv(n)

    # n * sqrt(n - 1) / (n - 2)
    sqrt_n_m_1 = n.sub(LiteralExpr(1)).pow(LiteralExpr(0.5))
    scale = n.mul(sqrt_n_m_1).truediv(n.sub(LiteralExpr(2)))

    # sum(x ** 3) - 3 * mean * sum(x * x) + 2 * mean * mean * sum(x)
    triple_term = mean.mul(s2).mul(LiteralExpr(3.0))
    double_term = mean.mul(mean).mul(s1).mul(LiteralExpr(2.0))
    numerator = s3.sub(triple_term).add(double_term)

    # (sum(x * x) - mean * sum(x)) ** 1.5
    denominator = s2.sub(mean.mul(s1)).pow(LiteralExpr(1.5))

    skew = scale.mul(numerator).truediv(denominator)

    # The result is NULL if n <= 2
    return build_if_then_else(
        n.le(LiteralExpr(2)), LiteralExpr(None), skew, skew._dtype
    )
def gen_reduce_expr(self):
    """
    Build the final expression of a compound aggregate.

    Returns
    -------
    BaseExpr
        ``sqrt((sum(x * x) - sum(x) * sum(x) / n) / (n - 1))`` built from the
        count, sum and sum-of-squares columns, with each divisor replaced by
        NULL when it would be zero.
    """
    cnt = self._builder._ref(self._arg.modin_frame, self._count_name)
    cnt._dtype = get_dtype(int)
    sum_x = self._builder._ref(self._arg.modin_frame, self._sum_name)
    sum_x._dtype = self._sum_dtype
    sum_xx = self._builder._ref(self._arg.modin_frame, self._quad_sum_name)
    sum_xx._dtype = self._sum_dtype

    null = LiteralExpr(None)
    # n, or NULL when n == 0
    divisor_n = build_if_then_else(cnt.eq(LiteralExpr(0)), null, cnt, cnt._dtype)
    # n - 1, or NULL when n == 1
    divisor_n_m_1 = build_if_then_else(
        cnt.eq(LiteralExpr(1)),
        null,
        cnt.sub(LiteralExpr(1)),
        cnt._dtype,
    )

    # sqrt((sum(x * x) - sum(x) * sum(x) / n) / (n - 1))
    centered = sum_xx.sub(sum_x.mul(sum_x).truediv(divisor_n))
    return centered.truediv(divisor_n_m_1).pow(LiteralExpr(0.5))
def bin_op(self, other, op_name):
    """
    Build a binary operation expression.

    Parameters
    ----------
    other : BaseExpr
        The second operand.
    op_name : str
        A binary operation name.

    Returns
    -------
    BaseExpr
        The resulting binary operation expression.

    Raises
    ------
    NotImplementedError
        If `op_name` is not present in ``binary_operations``.
    """
    if op_name not in self.binary_operations:
        raise NotImplementedError(f"unsupported binary operation {op_name}")

    if is_cmp_op(op_name):
        return self._cmp_op(other, op_name)

    # True division may require prior cast to float to avoid integer division
    both_integer = (
        op_name == "truediv"
        and is_integer_dtype(self._dtype)
        and is_integer_dtype(other._dtype)
    )
    if both_integer:
        other = other.cast(get_dtype(float))

    res_type = self._get_bin_op_res_type(op_name, self._dtype, other._dtype)
    result = OpExpr(self.binary_operations[op_name], [self, other], res_type)

    # Floor division may require additional FLOOR expr.
    if op_name != "floordiv" or is_integer_dtype(res_type):
        return result
    return result.floor()
def dtype(self):
    """
    Return the data type of the underlying block.

    Returns
    -------
    dtype
        The block's own dtype when no filling is needed; otherwise the dtype
        obtained by promoting the block's dtype for its ``fill_value``.

    Raises
    ------
    AssertionError
        If there is no block.
    """
    if self.block is None:
        raise AssertionError("Block is None, no dtype")

    if self.needs_filling:
        promoted = maybe_promote(self.block.dtype, self.block.fill_value)[0]
        return get_dtype(promoted)
    return self.block.dtype
def _process_join(self, op):
    """
    Translate a join operation into Calcite nodes.

    Pushes a join node, then a projection node over both frames and, when
    ``op.sort`` is True, a sort node ordered by the key columns.

    Parameters
    ----------
    op : object
        The join operation; its ``input``, ``on``, ``how``, ``suffixes`` and
        ``sort`` attributes are read here.
    """
    left, right = op.input[0], op.input[1]

    assert (
        op.on is not None
    ), "Merge with unspecified 'on' parameter is not supported in the engine"

    for col in op.on:
        assert (
            col in left._table_cols and col in right._table_cols
        ), f"Column '{col}'' is missing in one of merge operands"

    # Join, only equal-join supported
    key_cmps = [self._ref(left, c).eq(self._ref(right, c)) for c in op.on]
    condition = (
        OpExpr("AND", key_cmps, get_dtype(bool)) if len(key_cmps) > 1 else key_cmps[0]
    )
    self._push(
        CalciteJoinNode(
            left_id=self._input_node(0).id,
            right_id=self._input_node(1).id,
            how=op.how,
            condition=condition,
        )
    )

    # Projection for both frames
    fields = []
    exprs = []
    # Names present in both frames, excluding the key columns
    # (``-`` binds tighter than ``&``; the parentheses make that explicit).
    conflicting_cols = set(left.columns) & (set(right.columns) - set(op.on))

    # First goes 'on' column then all left columns (+suffix for conflicting
    # names) but 'on', then all right columns (+suffix for conflicting names)
    # but 'on'.
    on_idx = [-1] * len(op.on)
    for name in left.columns:
        if name in op.on:
            # Remember where this key column lands in the projection.
            on_idx[op.on.index(name)] = len(fields)
        fields.append(name + (op.suffixes[0] if name in conflicting_cols else ""))
        exprs.append(self._ref(left, name))

    for name in right.columns:
        if name in op.on:
            continue
        fields.append(name + (op.suffixes[1] if name in conflicting_cols else ""))
        exprs.append(self._ref(right, name))

    self._push(CalciteProjectionNode(fields, exprs))

    # TODO: current input translation system doesn't work here
    # because there is no frame to reference for index computation.
    # We should build calcite tree to keep references to input
    # nodes and keep scheme in calcite nodes. For now just use
    # known index on_idx.
    if op.sort is True:
        # Sort by key column
        collation = [CalciteCollation(CalciteInputIdxExpr(idx)) for idx in on_idx]
        self._push(CalciteSortNode(collation))
def floor(self):
    """
    Build a floor expression over this expression.

    Returns
    -------
    BaseExpr
        A "FLOOR" expression with an int data type.
    """
    int_dtype = get_dtype(int)
    return OpExpr("FLOOR", [self], int_dtype)
def is_not_null(self):
    """
    Build a NOT NULL check over this expression.

    Returns
    -------
    BaseExpr
        An "IS NOT NULL" expression with a bool data type.
    """
    return OpExpr("IS NOT NULL", [self], get_dtype(bool))
def _get_common_dtype(lhs_dtype, rhs_dtype): """ Get data type for a binary operation result. Parameters ---------- lhs_dtype : dtype The type of the first operand. rhs_dtype : dtype The type of the second operand. Returns ------- dtype The result data type. """ if lhs_dtype == rhs_dtype: return lhs_dtype if is_float_dtype(lhs_dtype) or is_float_dtype(rhs_dtype): return get_dtype(float) assert is_integer_dtype(lhs_dtype) and is_integer_dtype(rhs_dtype) return get_dtype(int)
def _agg_dtype(agg, dtype):
    """
    Compute aggregate data type.

    Parameters
    ----------
    agg : str
        Aggregate name.
    dtype : dtype
        Operand data type.

    Returns
    -------
    dtype
        The aggregate data type.

    Raises
    ------
    NotImplementedError
        If `agg` belongs to none of the known aggregate groups.
    """
    if agg in _aggs_preserving_numeric_type:
        return dtype

    # Aggregates with a fixed result type, checked in order.
    fixed_result_groups = (
        (_aggs_with_int_result, int),
        (_aggs_with_float_result, float),
    )
    for group, py_type in fixed_result_groups:
        if agg in group:
            return get_dtype(py_type)

    raise NotImplementedError(f"unsupported aggreagte {agg}")
def bin_op(self, other, op_name):
    """
    Build an expression for a binary operation with `other`.

    Parameters
    ----------
    other : BaseExpr
        The right-hand operand.
    op_name : str
        Name of the binary operation.

    Returns
    -------
    BaseExpr
        The resulting expression. Comparisons are delegated to ``_cmp_op``.

    Raises
    ------
    NotImplementedError
        If `op_name` is not present in ``binary_operations``.
    """
    if op_name not in self.binary_operations:
        raise NotImplementedError(f"unsupported binary operation {op_name}")
    if is_cmp_op(op_name):
        return self._cmp_op(other, op_name)

    # True division may require prior cast to float to avoid integer division
    if (
        op_name == "truediv"
        and is_integer_dtype(self._dtype)
        and is_integer_dtype(other._dtype)
    ):
        other = other.cast(get_dtype(float))

    res_type = self._get_bin_op_res_type(op_name, self._dtype, other._dtype)
    expr = OpExpr(self.binary_operations[op_name], [self, other], res_type)

    # Floor division may require additional FLOOR expr.
    wrap_in_floor = op_name == "floordiv" and not is_integer_dtype(res_type)
    return expr.floor() if wrap_in_floor else expr
def le(self, other):
    """
    Build a "less than or equal" comparison with `other`.

    Parameters
    ----------
    other : BaseExpr or scalar
        The operand to compare with; a non-expression value is wrapped in a
        ``LiteralExpr``.

    Returns
    -------
    BaseExpr
        The resulting comparison expression with a bool data type.
    """
    rhs = other if isinstance(other, BaseExpr) else LiteralExpr(other)
    return OpExpr("<=", [self, rhs], get_dtype(bool))
def _get_counts_nanvar(
    value_counts: Tuple[int],
    mask: Optional[np.ndarray],
    axis: Optional[int],
    ddof: int,
    dtype: Dtype = float,
) -> Tuple[Union[int, np.ndarray], Union[int, np.ndarray]]:
    """
    Get the count of non-null values along an axis, accounting
    for degrees of freedom.

    Parameters
    ----------
    value_counts : Tuple[int]
        Passed to ``_get_counts`` as the shape tuple of the values ndarray.
    mask : Optional[ndarray[bool]]
        locations in values that should be considered missing
    axis : Optional[int]
        axis to count along
    ddof : int
        degrees of freedom
    dtype : type, optional
        type to use for count

    Returns
    -------
    count : scalar or array
    d : scalar or array
        ``count - ddof``. Wherever ``count <= ddof`` both results are NaN.
    """
    dtype = get_dtype(dtype)
    count = _get_counts(value_counts, mask, axis, dtype=dtype)
    d = count - dtype.type(ddof)

    # always return NaN, never inf
    if not is_scalar(count):
        too_few: np.ndarray = count <= ddof
        if too_few.any():
            np.putmask(d, too_few, np.nan)
            np.putmask(count, too_few, np.nan)
    elif count <= ddof:
        count = np.nan
        d = np.nan
    return count, d
def _get_counts(
    values_shape: Tuple[int, ...],
    mask: Optional[np.ndarray],
    axis: Optional[int],
    dtype: Dtype = float,
) -> Union[int, float, np.ndarray]:
    """
    Get the count of non-null values along an axis.

    Parameters
    ----------
    values_shape : tuple of int
        shape tuple from values ndarray, used if mask is None
    mask : Optional[ndarray[bool]]
        locations in values that should be considered missing
    axis : Optional[int]
        axis to count along; None counts over all elements
    dtype : type, optional
        type to use for count

    Returns
    -------
    count : scalar or array
    """
    dtype = get_dtype(dtype)

    if axis is None:
        # Total count: every element, minus the masked ones if a mask exists.
        total = np.prod(values_shape) if mask is None else mask.size - mask.sum()
        return dtype.type(total)

    count = values_shape[axis] if mask is None else mask.shape[axis] - mask.sum(axis)
    if is_scalar(count):
        return dtype.type(count)
    try:
        return count.astype(dtype)
    except AttributeError:
        # ``count`` has no ``astype``; build an array of the requested dtype.
        return np.array(count, dtype=dtype)
def _cmp_op(self, other, op_name):
    """
    Build a comparison expression.

    Parameters
    ----------
    other : BaseExpr
        The value to compare with.
    op_name : str
        The comparison operation name.

    Returns
    -------
    BaseExpr
        The resulting comparison expression.

    Raises
    ------
    TypeError
        If the operands' dtype comparison classes differ and the operation
        is neither 'eq' nor 'ne'.
    """
    lhs_cls = self._get_dtype_cmp_class(self._dtype)
    rhs_cls = self._get_dtype_cmp_class(other._dtype)
    bool_dtype = get_dtype(bool)

    # In OmniSci comparison with NULL always results in NULL,
    # but in Pandas it is True for 'ne' comparison and False
    # for others.
    # Also Pandas allow 'eq' and 'ne' comparison for values
    # of incompatible types which doesn't work in OmniSci.
    if lhs_cls != rhs_cls:
        if op_name not in ("eq", "ne"):
            raise TypeError(
                f"Invalid comparison between {self._dtype} and {other._dtype}"
            )
        # Incompatible types: the outcome is a constant.
        return LiteralExpr(op_name == "ne")

    comparison = OpExpr(self.binary_operations[op_name], [self, other], bool_dtype)
    return build_if_then_else(
        self.is_null(), LiteralExpr(op_name == "ne"), comparison, bool_dtype
    )
def build_dt_expr(dt_operation, col_expr):
    """
    Build an expression extracting a datetime field.

    Parameters
    ----------
    dt_operation : str
        Datetime field to extract.
    col_expr : BaseExpr
        The expression to extract from.

    Returns
    -------
    BaseExpr
        A "PG_EXTRACT" expression with an int data type.
    """
    operands = [LiteralExpr(dt_operation), col_expr]
    return OpExpr("PG_EXTRACT", operands, get_dtype(int))
def _cmp_op(self, other, op_name):
    """
    Build a comparison expression between this expression and `other`.

    Parameters
    ----------
    other : BaseExpr
        A value to compare with.
    op_name : str
        The comparison operation name.

    Returns
    -------
    BaseExpr
        The resulting comparison expression.

    Raises
    ------
    TypeError
        If the operands' dtype comparison classes differ and the operation
        is neither 'eq' nor 'ne'.
    """
    lhs_dtype_class = self._get_dtype_cmp_class(self._dtype)
    rhs_dtype_class = self._get_dtype_cmp_class(other._dtype)
    res_dtype = get_dtype(bool)

    # In OmniSci comparison with NULL always results in NULL,
    # but in pandas it is True for 'ne' comparison and False
    # for others.
    # Also pandas allows 'eq' and 'ne' comparison for values
    # of incompatible types which doesn't work in OmniSci.
    if lhs_dtype_class == rhs_dtype_class:
        cmp_expr = OpExpr(self.binary_operations[op_name], [self, other], res_dtype)
        null_result = LiteralExpr(op_name == "ne")
        return build_if_then_else(self.is_null(), null_result, cmp_expr, res_dtype)

    if op_name == "eq" or op_name == "ne":
        return LiteralExpr(op_name == "ne")
    raise TypeError(f"Invalid comparison between {self._dtype} and {other._dtype}")
def build_row_idx_filter_expr(row_idx, row_col):
    """
    Build a Calcite expression to filter rows by rowid.

    Parameters
    ----------
    row_idx : scalar or list-like
        The row numeric indices to select.
    row_col : InputRefExpr
        Reference to the rowid column to filter by.

    Returns
    -------
    BaseExpr
        The filter condition: one equality check for a scalar `row_idx`,
        otherwise an "OR" over one equality check per index.
    """
    if not is_list_like(row_idx):
        return row_col.eq(row_idx)

    return OpExpr("OR", [row_col.eq(idx) for idx in row_idx], get_dtype(bool))
def test_get_dtype_fails(input_param, expected_error_message):
    """Check that ``com.get_dtype`` raises TypeError for an invalid input."""
    # python objects
    # 2020-02-02 npdev changed error message, so the newer wording is
    # accepted as an alternative in the match pattern.
    pattern = (
        expected_error_message
        + f"|Cannot interpret '{input_param}' as a data type"
    )
    with pytest.raises(TypeError, match=pattern):
        com.get_dtype(input_param)
def test_get_dtype(input_param, result):
    """Check that ``com.get_dtype`` returns the expected dtype."""
    actual = com.get_dtype(input_param)
    assert actual == result
def build_dt_expr(dt_operation, col_expr):
    """
    Build a datetime extraction expression.

    Parameters
    ----------
    dt_operation : str
        Datetime field to extract.
    col_expr : BaseExpr
        An expression to extract from.

    Returns
    -------
    BaseExpr
        The extract expression.
    """
    field = LiteralExpr(dt_operation)
    return OpExpr("PG_EXTRACT", [field, col_expr], get_dtype(int))
def le(self, other):
    """
    Build a less or equal comparison with `other`.

    Parameters
    ----------
    other : BaseExpr or scalar
        An operand to compare with.

    Returns
    -------
    BaseExpr
        The resulting comparison expression.
    """
    if isinstance(other, BaseExpr):
        operand = other
    else:
        # Scalars are wrapped so both operands are expressions.
        operand = LiteralExpr(other)
    return OpExpr("<=", [self, operand], get_dtype(bool))
def is_null(self):
    """
    Build a NULL check over this expression.

    Returns
    -------
    BaseExpr
        An "IS NULL" expression with a bool data type.
    """
    return OpExpr("IS NULL", [self], get_dtype(bool))