def read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index):
    num_arg = read_int(infile)
    arg_offsets = [read_int(infile) for i in range(num_arg)]
    row_func = None
    for i in range(read_int(infile)):
        f, return_type = read_command(pickleSer, infile)
        if row_func is None:
            row_func = f
        else:
            row_func = chain(row_func, f)

    # make sure StopIterations raised in the user code are not ignored
    # when they are processed in a for loop; raise them as RuntimeErrors instead
    func = fail_on_stopiteration(row_func)

    # the last returnType in the chain is the return type of the UDF
    if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF:
        return arg_offsets, wrap_scalar_pandas_udf(func, return_type)
    elif eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        argspec = _get_argspec(row_func)  # signature was lost when wrapping it
        return arg_offsets, wrap_grouped_map_pandas_udf(func, return_type, argspec, runner_conf)
    elif eval_type == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF:
        return arg_offsets, wrap_grouped_agg_pandas_udf(func, return_type)
    elif eval_type == PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF:
        return arg_offsets, wrap_window_agg_pandas_udf(func, return_type, runner_conf, udf_index)
    elif eval_type == PythonEvalType.SQL_BATCHED_UDF:
        return arg_offsets, wrap_udf(func, return_type)
    else:
        raise ValueError("Unknown eval type: {}".format(eval_type))

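# The chain() call above composes multiple UDFs that were applied one inside
# the other. A minimal sketch of that worker-side helper, assuming it simply
# feeds the output of one function into the next (hypothetical reconstruction,
# not necessarily the exact source):
def chain(f, g):
    """Chain two functions together so that g consumes f's output."""
    return lambda *a: g(f(*a))
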
def _create_judf(self):
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext

    func = fail_on_stopiteration(self.func)
    wrapped_func = _wrap_function(sc, func, self.returnType)
    jdt = spark._jsparkSession.parseDataType(self.returnType.json())
    judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction(
        self._name, wrapped_func, jdt, self.evalType, self.deterministic)
    return judf

def _create_judf(self):
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext

    func = fail_on_stopiteration(self.func)

    # for pandas UDFs the worker needs to know if the function takes
    # one or two arguments, but the signature is lost when wrapping with
    # fail_on_stopiteration, so we store it here
    if self.evalType in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                         PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                         PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF):
        func._argspec = _get_argspec(self.func)

    wrapped_func = _wrap_function(sc, func, self.returnType)
    jdt = spark._jsparkSession.parseDataType(self.returnType.json())
    judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction(
        self._name, wrapped_func, jdt, self.evalType, self.deterministic)
    return judf

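# _get_argspec has to capture the user function's signature before the
# fail_on_stopiteration wrapper hides it. A minimal sketch, assuming it just
# dispatches on the Python version (hypothetical reconstruction):
import inspect
import sys

def _get_argspec(f):
    """Return the argspec of f, using the Python-3 aware API when available."""
    if sys.version_info[0] < 3:
        return inspect.getargspec(f)
    return inspect.getfullargspec(f)
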
def read_single_udf(pickleSer, infile, eval_type):
    num_arg = read_int(infile)
    arg_offsets = [read_int(infile) for i in range(num_arg)]
    row_func = None
    for i in range(read_int(infile)):
        f, return_type = read_command(pickleSer, infile)
        if row_func is None:
            row_func = f
        else:
            row_func = chain(row_func, f)

    # make sure StopIterations raised in the user code are not ignored
    # when they are processed in a for loop; raise them as RuntimeErrors instead
    func = fail_on_stopiteration(row_func)

    # the last returnType in the chain is the return type of the UDF
    if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF:
        return arg_offsets, wrap_scalar_pandas_udf(func, return_type)
    elif eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        return arg_offsets, wrap_grouped_map_pandas_udf(func, return_type)
    else:
        return arg_offsets, wrap_udf(func, return_type)

def __init__(self, createCombiner, mergeValue, mergeCombiners):
    # guard each user-supplied aggregation function: a StopIteration raised
    # inside them should fail the task, not silently end the merge loop
    self.createCombiner = fail_on_stopiteration(createCombiner)
    self.mergeValue = fail_on_stopiteration(mergeValue)
    self.mergeCombiners = fail_on_stopiteration(mergeCombiners)

def _fail_on_stopiteration(fn):
    # noinspection PyPackageRequirements
    from pyspark import util
    return util.fail_on_stopiteration(fn)

def _func(_, iterator):
    # apply the guarded predicate lazily across the partition iterator
    return filter(fail_on_stopiteration(_fn), iterator)

def _func(_, iterator):
    # chain here is itertools.chain (not the worker-side chain() above):
    # flatten the iterables produced by the guarded function
    return chain.from_iterable(map(fail_on_stopiteration(_fn), iterator))

def _func(_, iterator):
    # map the guarded function over the partition iterator
    return map(util.fail_on_stopiteration(_fn), iterator)

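# Every snippet above defends against the same hazard: a StopIteration that
# escapes user code into map/filter/reduce machinery would silently end the
# surrounding loop and drop data. A minimal sketch of the guard, assuming the
# behavior described in the comments above (hypothetical reconstruction of
# pyspark.util.fail_on_stopiteration):
def fail_on_stopiteration(f):
    """
    Wrap f so that a StopIteration raised inside the user's code is re-raised
    as a RuntimeError, failing the task instead of truncating its output.
    """
    def wrapper(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except StopIteration as exc:
            raise RuntimeError(
                "Caught StopIteration thrown from user's code; failing the task", exc)
    return wrapper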