raise Exception("A Agg UDF with same name already exists %s" % udf.name) self.scalar_udfs[udf.name] = udf def __getitem__(self, name): if name in self.scalar_udfs: return self.scalar_udfs[name] if name in self.agg_udfs: return self.agg_udfs[name] raise Exception("Could not find UDF named %s" % name) # # Prepopulate registry with simple functions # registry = UDFRegistry.registry() registry.add(ScalarUDF("lower", 1, lambda col: compute.utf8_lower(col.cast(string())))) registry.add(ScalarUDF("upper", 1, lambda col: compute.utf8_upper(col.cast(string())))) # # Prepopulate with incremental aggregation functions # registry.add(AggUDF("count", 1, lambda col: compute.count(col).cast(float64()))) registry.add(AggUDF("avg", 1, lambda col: compute.mean(col).cast(float64()))) registry.add(AggUDF("sum", 1, lambda col: compute.sum(col).cast(float64()))) # Welford's algorithm for online std std_init = lambda: [0, 0., 0] def std_update(s, v): s[0] += 1 d = v - s[1]
def _str_lower(self):
    """Return a new object of this instance's own type holding lowercased data.

    ``self._data`` is passed through ``pc.utf8_lower`` and the result is
    handed to the constructor of ``type(self)``, so the concrete class of
    the receiver is preserved in the return value.
    """
    # NOTE(review): `pc` is presumably `pyarrow.compute` — confirm against
    # the file's imports.
    wrapper_cls = type(self)
    lowered = pc.utf8_lower(self._data)
    return wrapper_cls(lowered)
def _expr_kernel(self, arguments: Any, table: ArrowTable) -> Any:
    """Apply ``pc.utf8_lower`` to the given arguments and return the result.

    Args:
        arguments: Iterable of values unpacked positionally into
            ``pc.utf8_lower``.
        table: Accepted as part of the signature; not read by this method.

    Returns:
        Whatever ``pc.utf8_lower`` returns for the unpacked arguments.
    """
    # NOTE(review): `pc` is presumably `pyarrow.compute` — confirm against
    # the file's imports.
    lowered = pc.utf8_lower(*arguments)
    return lowered