def registerFunction(self, name, f, returnType=StringType()):
    """Registers a lambda function as a UDF so it can be used in SQL statements.

    In addition to a name and the function itself, the return type can be
    optionally specified. When the return type is not given it defaults to
    a string and conversion will automatically be done. For any other return
    type, the produced object must match the specified type.

    >>> sqlCtx.registerFunction("stringLengthString", lambda x: len(x))
    >>> sqlCtx.sql("SELECT stringLengthString('test')").collect()
    [Row(c0=u'4')]

    >>> from pyspark.sql.types import IntegerType
    >>> sqlCtx.registerFunction("stringLengthInt", lambda x: len(x), IntegerType())
    >>> sqlCtx.sql("SELECT stringLengthInt('test')").collect()
    [Row(c0=4)]
    """
    func = lambda _, it: imap(lambda x: f(*x), it)
    ser = AutoBatchedSerializer(PickleSerializer())
    command = (func, None, ser, ser)
    pickled_cmd, bvars, env, includes = _prepare_for_python_RDD(self._sc, command, self)
    self._ssql_ctx.udf().registerPython(name,
                                        bytearray(pickled_cmd),
                                        env,
                                        includes,
                                        self._sc.pythonExec,
                                        bvars,
                                        self._sc._javaAccumulator,
                                        returnType.json())
def _wrap_function(sc, func, returnType):
    command = (func, returnType)
    pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(
        sc, command)
    return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes,
                                  sc.pythonExec, sc.pythonVer, broadcast_vars,
                                  sc._javaAccumulator)
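# Hedged context for the helper above: its PythonFunction result is handed,
# together with the parsed return type, to the JVM-side wrapper. A rough
# sketch of the call shape in a consuming _create_judf (names such as
# `spark`, `self._name`, and the fully qualified class are assumptions for
# illustration, not verbatim source):
#
#   wrapped = _wrap_function(sc, self.func, self.returnType)
#   jdt = spark._jsparkSession.parseDataType(self.returnType.json())
#   judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction(
#       self._name, wrapped, jdt)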
def registerFunction(self, name, f, returnType=StringType()):
    """Registers a lambda function as a UDF so it can be used in SQL statements.

    In addition to a name and the function itself, the return type can be
    optionally specified. When the return type is not given it defaults to
    a string and conversion will automatically be done. For any other return
    type, the produced object must match the specified type.

    :param name: name of the UDF
    :param f: python function
    :param returnType: a :class:`DataType` object

    >>> sqlContext.registerFunction("stringLengthString", lambda x: len(x))
    >>> sqlContext.sql("SELECT stringLengthString('test')").collect()
    [Row(c0=u'4')]

    >>> from pyspark.sql.types import IntegerType
    >>> sqlContext.registerFunction("stringLengthInt", lambda x: len(x), IntegerType())
    >>> sqlContext.sql("SELECT stringLengthInt('test')").collect()
    [Row(c0=4)]

    >>> from pyspark.sql.types import IntegerType
    >>> sqlContext.udf.register("stringLengthInt", lambda x: len(x), IntegerType())
    >>> sqlContext.sql("SELECT stringLengthInt('test')").collect()
    [Row(c0=4)]
    """
    func = lambda _, it: map(lambda x: f(*x), it)
    ser = AutoBatchedSerializer(PickleSerializer())
    command = (func, None, ser, ser)
    pickled_cmd, bvars, env, includes = _prepare_for_python_RDD(
        self._sc, command, self)
    self._ssql_ctx.udf().registerPython(name,
                                        bytearray(pickled_cmd),
                                        env,
                                        includes,
                                        self._sc.pythonExec,
                                        self._sc.pythonVer,
                                        bvars,
                                        self._sc._javaAccumulator,
                                        returnType.json())
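# Side demo (illustrative, plain Python; not part of the Spark source): the
# pickled `func` above is invoked by the worker as func(split_index, iterator),
# which is why the lambda ignores its first argument. A minimal stand-in for
# that calling convention:
def _demo_eval(f, rows):
    func = lambda _, it: map(lambda x: f(*x), it)
    return list(func(0, rows))

print(_demo_eval(lambda x: len(x), [("test",), ("spark",)]))  # [4, 5]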
def _create_judf(self):
    f = self.func  # put it in closure `func`
    func = lambda _, it: imap(lambda x: f(*x), it)
    ser = AutoBatchedSerializer(PickleSerializer())
    command = (func, None, ser, ser)
    sc = SparkContext._active_spark_context
    pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command, self)
    ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc())
    jdt = ssql_ctx.parseDataType(self.returnType.json())
    judf = sc._jvm.UserDefinedPythonFunction(f.__name__, bytearray(pickled_command), env,
                                             includes, sc.pythonExec, broadcast_vars,
                                             sc._javaAccumulator, jdt)
    return judf
def _wrap_function(
    sc: SparkContext, func: Callable[..., Any], returnType: "DataTypeOrString"
) -> JavaObject:
    command = (func, returnType)
    pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(
        sc, command)
    return sc._jvm.PythonFunction(  # type: ignore[attr-defined]
        bytearray(pickled_command),
        env,
        includes,
        sc.pythonExec,  # type: ignore[attr-defined]
        sc.pythonVer,  # type: ignore[attr-defined]
        broadcast_vars,
        sc._javaAccumulator  # type: ignore[attr-defined]
    )
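# Hedged context for the annotations above: they depend on imports along these
# lines; "DataTypeOrString" is a string forward reference resolved only by
# type checkers, since pyspark.sql._typing is a stub-only module.
from typing import Any, Callable, TYPE_CHECKING

from py4j.java_gateway import JavaObject
from pyspark import SparkContext

if TYPE_CHECKING:
    from pyspark.sql._typing import DataTypeOrString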
def _create_judf(self):
    f = self.func  # put it in closure `func`
    func = lambda _, it: map(lambda x: f(*x), it)
    ser = AutoBatchedSerializer(PickleSerializer())
    command = (func, None, ser, ser)
    sc = SparkContext._active_spark_context
    pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command, self)
    ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc())
    jdt = ssql_ctx.parseDataType(self.returnType.json())
    fname = f.__name__ if hasattr(f, '__name__') else f.__class__.__name__
    judf = sc._jvm.UserDefinedPythonFunction(fname, bytearray(pickled_command), env, includes,
                                             sc.pythonExec, sc.pythonVer, broadcast_vars,
                                             sc._javaAccumulator, jdt)
    return judf
def _create_judf(self, name):
    f, returnType = self.func, self.returnType  # put them in closure `func`
    func = lambda _, it: map(lambda x: returnType.toInternal(f(*x)), it)
    ser = AutoBatchedSerializer(PickleSerializer())
    command = (func, None, ser, ser)
    sc = SparkContext._active_spark_context
    pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command, self)
    ssql_ctx = sc._jvm.SQLContext(sc._jsc.sc())
    jdt = ssql_ctx.parseDataType(self.returnType.json())
    if name is None:
        name = f.__name__ if hasattr(f, '__name__') else f.__class__.__name__
    judf = sc._jvm.UserDefinedPythonFunction(name, bytearray(pickled_command), env, includes,
                                             sc.pythonExec, sc.pythonVer, broadcast_vars,
                                             sc._javaAccumulator, jdt)
    return judf
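# Hedged illustration of the returnType.toInternal(...) call in the closure
# above: results are converted to Spark's internal representation before
# serialization. For example, DateType represents a date as days since the
# Unix epoch (assumes a pyspark installation of roughly this vintage):
import datetime
from pyspark.sql.types import DateType

assert DateType().toInternal(datetime.date(1970, 1, 2)) == 1  # one day after epoch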
def _wrap_function(
    sc: SparkContext, func: Callable[..., Any], returnType: "DataTypeOrString"
) -> JavaObject:
    command = (func, returnType)
    pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(
        sc, command)
    assert sc._jvm is not None
    return sc._jvm.SimplePythonFunction(
        bytearray(pickled_command),
        env,
        includes,
        sc.pythonExec,
        sc.pythonVer,
        broadcast_vars,
        sc._javaAccumulator,
    )
def _wrap_function(self, sc, func, returnType):
    if sc.profiler_collector:
        # Attach a fresh profiler so this UDF's execution can be profiled;
        # key each new profiler past any already registered for other UDFs.
        profiler = sc.profiler_collector.new_profiler(sc)
        from pyspark.accumulators import _udf_dic
        if len(_udf_dic.keys()) == 0:
            key = 99999
        else:
            key = max(_udf_dic.keys()) + 1
        sc.profiler_collector.add_profiler(key, profiler)
        # Remember which accumulator and UDF name this profiler belongs to.
        _udf_dic[key] = (SparkContext._next_accum_id - 1, self._name)
    else:
        profiler = None
    command = (func, returnType, profiler)
    pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(
        sc, command)
    return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes,
                                  sc.pythonExec, sc.pythonVer, broadcast_vars,
                                  sc._javaAccumulator)
def _wrap_function(sc, func, returnType):
    command = (func, returnType)
    pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
    return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes, sc.pythonExec,
                                  sc.pythonVer, broadcast_vars, sc._javaAccumulator)
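# A minimal end-to-end sketch tying the pieces together (assumes a local
# pyspark installation; the session setup, UDF name `slen`, and alias `n`
# are illustrative choices, not from the snippets above):
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.master("local[1]").getOrCreate()
spark.udf.register("slen", lambda s: len(s), IntegerType())
spark.sql("SELECT slen('test') AS n").show()  # shows a single row with n = 4
spark.stop()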