def case(self, *operands) -> SeriesOrScalar:
    """
    Pick `then` wherever `where` holds and `other` everywhere else.

    The operands are read positionally as ``(where, then, other)``.
    If more than three operands are passed, everything after the first
    two is evaluated by a recursive call and that result is used as
    `other`.
    """
    assert operands

    where, then = operands[0], operands[1]
    # Additional operands form a nested case that acts as the fallback.
    other = self.case(*operands[2:]) if len(operands) > 3 else operands[2]

    if is_frame(then):
        return then.where(where, other=other)
    if is_frame(other):
        return other.where(~where, other=then)
    if not is_frame(where):
        # Nothing is a frame: a plain conditional is enough.
        return then if where else other

    # Only the condition is a frame, `then` and `other` are scalars.
    # To still be able to use the frame's "where" function, build a
    # temporary frame that has the properties of `where` but is filled
    # with the value of `then`.
    placeholder = where.apply(lambda _: then, meta=(where.name, type(then)))
    return placeholder.where(where, other=other)
def not_(self, df: SeriesOrScalar) -> SeriesOrScalar:
    """
    Negate `df`, which may be a frame or a plain scalar.

    Frames are cast to the "boolean" dtype and inverted element-wise;
    scalars go through Python's `not`.
    """
    if not is_frame(df):
        return not df

    as_boolean = df.astype("boolean")
    return ~as_boolean
def null(
    self,
    df: SeriesOrScalar,
) -> SeriesOrScalar:
    """
    Returns true where `df` is null
    (where `df` can also be just a scalar).

    For frames the element-wise `isna()` result is returned.
    For scalars a plain `bool` is returned.
    """
    if is_frame(df):
        return df.isna()

    # pd.isna already recognizes None, NaN, NaT and pd.NA for scalars.
    # An additional np.isnan call is not needed and would raise a
    # TypeError for non-numeric scalars such as strings.
    return bool(pd.isna(df))
def position(self, search, s, start=None):
    """
    Return the 1-based position of `search` inside `s`.

    Attention: SQL starts counting at 1, so the 0-based result of
    `find` is shifted by one (a miss therefore yields 0). A missing or
    non-positive `start` means searching from the beginning.
    """
    haystack = s.str if is_frame(s) else s
    # Translate the 1-based SQL start into a 0-based offset.
    offset = 0 if start is None or start <= 0 else start - 1
    return haystack.find(search, offset) + 1
def not_(self, df: Union[dd.Series, Any]) -> Union[dd.Series, Any]:
    """
    Negate `df`, which may be a frame or a plain scalar.

    Frames are cast to the "boolean" dtype and inverted element-wise;
    scalars go through Python's `not`.
    """
    if not is_frame(df):
        return not df  # pragma: no cover

    as_boolean = df.astype("boolean")
    return ~as_boolean
def null(
    self,
    df: Union[dd.Series, Any],
) -> Union[dd.Series, Any]:
    """
    Returns true where `df` is null
    (where `df` can also be just a scalar).

    For frames the element-wise `isna()` result is returned.
    For scalars a plain `bool` is returned.
    """
    if is_frame(df):
        return df.isna()

    # pd.isna already recognizes None, NaN, NaT and pd.NA for scalars.
    # An additional np.isnan call is not needed and would raise a
    # TypeError for non-numeric scalars such as strings.
    return bool(pd.isna(df))
def true_(self, df: SeriesOrScalar) -> SeriesOrScalar:
    """
    Tell where `df` is true; `df` may be a frame or a plain scalar.

    Missing values count as false: frames get their nulls filled with
    False, and a null scalar yields False.
    """
    if is_frame(df):
        return df.fillna(False)

    # Any kind of "missing" short-circuits to False.
    if pd.isna(df) or df is None or np.isnan(df):
        return False
    return bool(df)
def trim(self, flags, search, s):
    """
    Strip the characters in `search` from `s`.

    `flags` selects the side: "LEADING" strips on the left, "TRAILING"
    on the right, and anything else strips both ends. For frames the
    `.str` accessor is used instead of the object itself.
    """
    target = s.str if is_frame(s) else s

    if flags == "LEADING":
        return target.lstrip(search)
    if flags == "TRAILING":
        return target.rstrip(search)
    return target.strip(search)
def false_(self, df: Union[dd.Series, Any]) -> Union[dd.Series, Any]:
    """
    Tell where `df` is false; `df` may be a frame or a plain scalar.

    Missing values count as "not false": frames get their nulls filled
    with True before being inverted, and a null scalar yields False.
    """
    if is_frame(df):
        return ~df.fillna(True)

    # Any kind of "missing" short-circuits to False.
    if pd.isna(df) or df is None or np.isnan(df):
        return False
    return not bool(df)
def cast(self, operand, rex=None) -> SeriesOrScalar:
    """
    Cast `operand` to the type carried by `rex`.

    Non-frame operands are handed back untouched. For frames, the type
    name read from `rex` is upper-cased, translated with
    `sql_to_python_type` and passed to `cast_column_to_type`; when that
    helper returns None the operand is returned as it is.
    """
    if not is_frame(operand):
        return operand

    sql_type_name = str(rex.getType()).upper()
    target_type = sql_to_python_type(sql_type_name)
    converted = cast_column_to_type(operand, target_type)

    return operand if converted is None else converted
def case(
    self,
    where: Union[dd.Series, Any],
    then: Union[dd.Series, Any],
    other: Union[dd.Series, Any],
) -> Union[dd.Series, Any]:
    """
    Pick `then` wherever `where` holds and `other` everywhere else.

    Each of the three arguments may be a frame or a plain scalar.
    """
    if is_frame(then):
        return then.where(where, other=other)
    if is_frame(other):
        return other.where(~where, other=then)
    if not is_frame(where):
        # Nothing is a frame: a plain conditional is enough.
        return then if where else other

    # Only the condition is a frame, `then` and `other` are scalars.
    # To still be able to use the frame's "where" function, build a
    # temporary frame that has the properties of `where` but is filled
    # with the value of `then`.
    placeholder = where.apply(lambda _: then, meta=(where.name, type(then)))
    return placeholder.where(where, other=other)
def substring(self, s, start, length=None):
    """
    Return the part of `s` beginning at `start` with at most `length`
    characters.

    Attention: SQL starts counting at 1. A non-positive `start` is
    treated as the beginning of the string. Without a `length` the
    substring runs to the end of `s`; a `length` of 0 gives an empty
    string.
    """
    # Translate the 1-based SQL start into a 0-based slice start.
    if start <= 0:
        start = 0
    else:
        start -= 1

    # Compare against None explicitly: a length of 0 is a valid request
    # for an empty substring and must not be mistaken for "no length".
    end = length + start if length is not None else None

    if is_frame(s):
        return s.str.slice(start, end)

    # Slicing with end=None runs to the end of the string.
    return s[start:end]
def overlay(self, s, replace, start, length=None):
    """
    Replace a slice of `s` with `replace`.

    Attention: SQL starts counting at 1. A non-positive `start` is
    treated as the beginning of the string. Without a `length`, the
    replaced slice is as long as `replace` itself.
    """
    # Translate the 1-based SQL start into a 0-based slice start.
    begin = 0 if start <= 0 else start - 1
    span = len(replace) if length is None else length
    stop = span + begin

    if is_frame(s):
        return s.str.slice_replace(begin, stop, replace)

    return s[:begin] + replace + s[stop:]
def datetime_sub(self, *operands, rex=None):
    """
    Subtract the operands from each other and convert the result to the
    interval unit named in the type of `rex`.

    The type string of `rex` must start with "INTERVAL"; its second
    word (lower-cased) is taken as the unit.
    """
    type_name = str(rex.getType())
    assert type_name.startswith("INTERVAL")
    unit = type_name.split()[1].lower()

    subtract = ReduceOperation(
        operation=operator.sub, unary_operation=lambda value: -value
    )
    difference = subtract(*operands)

    # Everything that is not year-like is expressed in milliseconds.
    if unit not in {"year", "quarter", "month"}:
        return difference.astype("timedelta64[ms]")

    # Special case for year-like units: per the original note, Calcite
    # converts an INTERVAL YEAR to months, so months are produced here.
    if is_frame(difference):
        return difference / np.timedelta64(1, "M")

    # Scalar result: numpy doesn't allow division by the month time
    # unit, so cast to months instead.
    months = difference.astype("timedelta64[M]")
    # Negative numpy timedeltas are off by one compared to SQL when
    # cast to months.
    if months < 0:
        return months + 1
    return months
def regex(
    self,
    test: SeriesOrScalar,
    regex: str,
    escape: str = None,
) -> SeriesOrScalar:
    """
    Tell whether the string `test` matches the SQL pattern `regex`
    (whose wildcards may be escaped with `escape`).

    SQL's pattern syntax is not directly a regular expression, so the
    pattern is translated character by character. The wildcards handled
    here are %, _ and [...] character ranges. Without an `escape`, the
    backslash is used.
    """
    escape_char = escape if escape else "\\"

    # The SQL pattern always has to cover the full string, hence the
    # anchors around the translated pieces.
    pieces = ["^"]
    pending_escape = False
    inside_range = False

    for symbol in regex:
        if pending_escape:
            # The previous character was the escape character:
            # emit this one prefixed with "\".
            pieces.append("\\" + symbol)
            pending_escape = False
        elif inside_range:
            # Character ranges [...] are kept exactly as written.
            if symbol == "]":
                inside_range = False
            pieces.append(symbol)
        elif symbol in self.replacement_chars:
            # Special in a regex but not in SQL, so it needs
            # additional escaping.
            pieces.append("\\" + symbol)
        elif symbol == "[":
            inside_range = True
            pieces.append(symbol)
        elif symbol == escape_char:
            # Nothing is emitted now; the "\" is written together
            # with the next character.
            pending_escape = True
        elif symbol == "%":
            # An unescaped "%" in SQL is a .*
            pieces.append(".*")
        elif symbol == "_":
            # An unescaped "_" in SQL is a .
            pieces.append(".")
        else:
            pieces.append(symbol)

    pieces.append("$")
    pattern = "".join(pieces)

    # Finally, apply the translated pattern.
    if is_frame(test):
        return test.str.match(pattern).astype("boolean")
    return bool(re.match(pattern, test))
def test_is_frame_for_none():
    """None must not be treated as a frame."""
    outcome = is_frame(None)
    assert not outcome
def test_is_frame_for_number():
    """Plain numbers (int and float) must not be treated as frames."""
    for number in (3, 3.5):
        assert not is_frame(number)
def apply(self, *operands):
    """
    Call the stored functions.

    The first operand decides which one: `tensor_f` if it is a frame,
    `scalar_f` otherwise. All operands are forwarded unchanged.
    """
    handler = self.tensor_f if is_frame(operands[0]) else self.scalar_f
    return handler(*operands)
def test_is_frame_for_frame():
    """A dask dataframe built from a pandas one must count as a frame."""
    pandas_df = pd.DataFrame({"a": [1]})
    dask_df = dd.from_pandas(pandas_df, npartitions=1)
    assert is_frame(dask_df)