def read_udfs(pickleSer, infile, eval_type):
    runner_conf = {}

    if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                     PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                     PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF):

        # Load conf used for pandas_udf evaluation
        num_conf = read_int(infile)
        for i in range(num_conf):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            runner_conf[k] = v

        # NOTE: if timezone is set here, that implies respectSessionTimeZone is True
        timezone = runner_conf.get("spark.sql.session.timeZone", None)
        safecheck = runner_conf.get("spark.sql.execution.pandas.convertToArrowArraySafely",
                                    "false").lower() == 'true'
        # Used by SQL_GROUPED_MAP_PANDAS_UDF and SQL_SCALAR_PANDAS_UDF when returning StructType
        assign_cols_by_name = runner_conf.get(
            "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true")\
            .lower() == "true"

        if eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
            ser = CogroupUDFSerializer(timezone, safecheck, assign_cols_by_name)
        else:
            # Scalar Pandas UDF handles struct type arguments as pandas DataFrames instead of
            # pandas Series. See SPARK-27240.
            df_for_struct = (eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF or
                             eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF or
                             eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF)
            ser = ArrowStreamPandasUDFSerializer(timezone, safecheck, assign_cols_by_name,
                                                 df_for_struct)
    else:
        ser = BatchedSerializer(PickleSerializer(), 100)

    num_udfs = read_int(infile)

    is_scalar_iter = eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF
    is_map_iter = eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF

    if is_scalar_iter or is_map_iter:
        if is_scalar_iter:
            assert num_udfs == 1, "One SCALAR_ITER UDF expected here."
        if is_map_iter:
            assert num_udfs == 1, "One MAP_ITER UDF expected here."

        arg_offsets, udf = read_single_udf(
            pickleSer, infile, eval_type, runner_conf, udf_index=0)

        def func(_, iterator):
            num_input_rows = [0]  # TODO(SPARK-29909): Use nonlocal after we drop Python 2.

            def map_batch(batch):
                udf_args = [batch[offset] for offset in arg_offsets]
                num_input_rows[0] += len(udf_args[0])
                if len(udf_args) == 1:
                    return udf_args[0]
                else:
                    return tuple(udf_args)

            iterator = map(map_batch, iterator)
            result_iter = udf(iterator)

            num_output_rows = 0
            for result_batch, result_type in result_iter:
                num_output_rows += len(result_batch)
                # This assert is for Scalar Iterator UDF to fail fast.
                # The length of the entire input can only be explicitly known
                # by consuming the input iterator in user side. Therefore,
                # it's very unlikely the output length is higher than
                # input length.
                assert is_map_iter or num_output_rows <= num_input_rows[0], \
                    "Pandas SCALAR_ITER UDF outputted more rows than input rows."
                yield (result_batch, result_type)

            if is_scalar_iter:
                try:
                    next(iterator)
                except StopIteration:
                    pass
                else:
                    raise RuntimeError("pandas iterator UDF should exhaust the input "
                                       "iterator.")

                if num_output_rows != num_input_rows[0]:
                    raise RuntimeError(
                        "The length of output in Scalar iterator pandas UDF should be "
                        "the same with the input's; however, the length of output was %d and the "
                        "length of input was %d." % (num_output_rows, num_input_rows[0]))

        # profiling is not supported for UDF
        return func, None, ser, ser

    def extract_key_value_indexes(grouped_arg_offsets):
        """
        Helper function to extract the key and value indexes from arg_offsets for the grouped
        and cogrouped pandas udfs. See BasePandasGroupExec.resolveArgOffsets for equivalent
        scala code.

        :param grouped_arg_offsets: List containing the key and value indexes of columns of the
            DataFrames to be passed to the udf. It consists of n repeating groups where n is the
            number of DataFrames. Each group has the following format:
                group[0]: length of group
                group[1]: length of key indexes
                group[2.. group[1] +2]: key attributes
                group[group[1] +3 group[0]]: value attributes
        """
        parsed = []
        idx = 0
        while idx < len(grouped_arg_offsets):
            offsets_len = grouped_arg_offsets[idx]
            idx += 1
            offsets = grouped_arg_offsets[idx:idx + offsets_len]
            split_index = offsets[0] + 1
            offset_keys = offsets[1:split_index]
            offset_values = offsets[split_index:]
            parsed.append([offset_keys, offset_values])
            idx += offsets_len
        return parsed

    if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        # We assume there is only one UDF here because grouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1

        # See FlatMapGroupsInPandasExec for how arg_offsets are used to
        # distinguish between grouping attributes and data attributes
        arg_offsets, f = read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=0)
        parsed_offsets = extract_key_value_indexes(arg_offsets)

        # Create function like this:
        #   mapper a: f([a[0]], [a[0], a[1]])
        def mapper(a):
            keys = [a[o] for o in parsed_offsets[0][0]]
            vals = [a[o] for o in parsed_offsets[0][1]]
            return f(keys, vals)
    elif eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
        # We assume there is only one UDF here because cogrouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1
        arg_offsets, f = read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=0)
        parsed_offsets = extract_key_value_indexes(arg_offsets)

        def mapper(a):
            df1_keys = [a[0][o] for o in parsed_offsets[0][0]]
            df1_vals = [a[0][o] for o in parsed_offsets[0][1]]
            df2_keys = [a[1][o] for o in parsed_offsets[1][0]]
            df2_vals = [a[1][o] for o in parsed_offsets[1][1]]
            return f(df1_keys, df1_vals, df2_keys, df2_vals)
    else:
        udfs = []
        for i in range(num_udfs):
            udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))

        def mapper(a):
            result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
            # In the special case of a single UDF this will return a single result rather
            # than a tuple of results; this is the format that the JVM side expects.
            if len(result) == 1:
                return result[0]
            else:
                return result

    func = lambda _, it: map(mapper, it)

    # profiling is not supported for UDF
    return func, None, ser, ser
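# To make the offset layout concrete, here is a small standalone sketch of the same parsing
# logic as extract_key_value_indexes above, run on hand-picked offsets (the values are
# illustrative, not taken from a real query plan).
def _extract_key_value_indexes_demo(grouped_arg_offsets):
    # Same parsing logic as the helper above, repeated here so the example runs standalone.
    parsed = []
    idx = 0
    while idx < len(grouped_arg_offsets):
        offsets_len = grouped_arg_offsets[idx]
        idx += 1
        offsets = grouped_arg_offsets[idx:idx + offsets_len]
        split_index = offsets[0] + 1
        parsed.append([offsets[1:split_index], offsets[split_index:]])
        idx += offsets_len
    return parsed

# One DataFrame: group length 4, one key column at offset 0, value columns at offsets 0 and 1.
assert _extract_key_value_indexes_demo([4, 1, 0, 0, 1]) == [[[0], [0, 1]]]

# Two DataFrames (cogrouped case): two groups of the same shape.
assert _extract_key_value_indexes_demo([4, 1, 0, 0, 1, 4, 1, 0, 0, 1]) == \
    [[[0], [0, 1]], [[0], [0, 1]]]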
def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
             environment=None, batchSize=1024, serializer=PickleSerializer(), conf=None,
             gateway=None):
    """
    Create a new SparkContext. At least the master and app name should be set,
    either through the named parameters here or through C{conf}.

    @param master: Cluster URL to connect to
           (e.g. mesos://host:port, spark://host:port, local[4]).
    @param appName: A name for your job, to display on the cluster web UI.
    @param sparkHome: Location where Spark is installed on cluster nodes.
    @param pyFiles: Collection of .zip or .py files to send to the cluster
           and add to PYTHONPATH. These can be paths on the local file
           system or HDFS, HTTP, HTTPS, or FTP URLs.
    @param environment: A dictionary of environment variables to set on
           worker nodes.
    @param batchSize: The number of Python objects represented as a single
           Java object. Set 1 to disable batching or -1 to use an
           unlimited batch size.
    @param serializer: The serializer for RDDs.
    @param conf: A L{SparkConf} object setting Spark properties.
    @param gateway: Use an existing gateway and JVM, otherwise a new JVM
           will be instantiated.

    >>> from pyspark.context import SparkContext
    >>> sc = SparkContext('local', 'test')

    >>> sc2 = SparkContext('local', 'test2') # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
        ...
    ValueError:...
    """
    if rdd._extract_concise_traceback() is not None:
        self._callsite = rdd._extract_concise_traceback()
    else:
        tempNamedTuple = namedtuple("Callsite", "function file linenum")
        self._callsite = tempNamedTuple(function=None, file=None, linenum=None)
    SparkContext._ensure_initialized(self, gateway=gateway)

    self.environment = environment or {}
    self._conf = conf or SparkConf(_jvm=self._jvm)
    self._batchSize = batchSize  # -1 represents an unlimited batch size
    self._unbatched_serializer = serializer
    if batchSize == 1:
        self.serializer = self._unbatched_serializer
    else:
        self.serializer = BatchedSerializer(self._unbatched_serializer, batchSize)

    # Set any parameters passed directly to us on the conf
    if master:
        self._conf.setMaster(master)
    if appName:
        self._conf.setAppName(appName)
    if sparkHome:
        self._conf.setSparkHome(sparkHome)
    if environment:
        for key, value in environment.iteritems():
            self._conf.setExecutorEnv(key, value)

    # Check that we have at least the required parameters
    if not self._conf.contains("spark.master"):
        raise Exception("A master URL must be set in your configuration")
    if not self._conf.contains("spark.app.name"):
        raise Exception("An application name must be set in your configuration")

    # Read back our properties from the conf in case we loaded some of them from
    # the classpath or an external config file
    self.master = self._conf.get("spark.master")
    self.appName = self._conf.get("spark.app.name")
    self.sparkHome = self._conf.get("spark.home", None)
    for (k, v) in self._conf.getAll():
        if k.startswith("spark.executorEnv."):
            varName = k[len("spark.executorEnv."):]
            self.environment[varName] = v

    # Create the Java SparkContext through Py4J
    self._jsc = self._initialize_context(self._conf._jconf)

    # Create a single Accumulator in Java that we'll send all our updates through;
    # they will be passed back to us through a TCP server
    self._accumulatorServer = accumulators._start_update_server()
    (host, port) = self._accumulatorServer.server_address
    self._javaAccumulator = self._jsc.accumulator(
        self._jvm.java.util.ArrayList(),
        self._jvm.PythonAccumulatorParam(host, port))

    self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')

    # Broadcast's __reduce__ method stores Broadcast instances here.
    # This allows other code to determine which Broadcast instances have
    # been pickled, so it can determine which Java broadcast objects to
    # send.
    self._pickled_broadcast_vars = set()

    SparkFiles._sc = self
    root_dir = SparkFiles.getRootDirectory()
    sys.path.append(root_dir)

    # Deploy any code dependencies specified in the constructor
    self._python_includes = list()
    for path in (pyFiles or []):
        self.addPyFile(path)

    # Deploy code dependencies set by spark-submit; these will already have been added
    # with SparkContext.addFile, so we just need to add them to the PYTHONPATH
    for path in self._conf.get("spark.submit.pyFiles", "").split(","):
        if path != "":
            (dirname, filename) = os.path.split(path)
            self._python_includes.append(filename)
            sys.path.append(path)
            if not dirname in sys.path:
                sys.path.append(dirname)

    # Create a temporary directory inside spark.local.dir:
    local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf())
    self._temp_dir = \
        self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir).getAbsolutePath()
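# A minimal usage sketch of this constructor (hypothetical local run; the master string,
# app name, and batch size are placeholders chosen for illustration).
from pyspark import SparkContext
from pyspark.serializers import PickleSerializer

# With batchSize != 1, the PickleSerializer is wrapped in a BatchedSerializer (see above).
sc = SparkContext('local[2]', 'pickle-demo',
                  serializer=PickleSerializer(), batchSize=512)
print(sc.serializer)  # expected: a BatchedSerializer wrapping PickleSerializer
sc.stop()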
def make_serializer():
    return PickleSerializer()
    except Exception:
        print("Something went wrong with the archive entry")
        print(list_error)
    return html_pages_array


record_attribute = 'WARC-Record-ID'
in_file = 'hdfs:///user/bbkruit/sample.warc.gz'
stanford = '/home/wdps1813/scratch/wdps1813/wdps/stanford-ner-2017-06-09/'

conf = SparkConf().setAppName("Entity Recognition")
sc = SparkContext(conf=conf,
                  serializer=PickleSerializer(),  # Default serializer
                  # Unlimited batch size -> BatchedSerializer instead of AutoBatchedSerializer
                  batchSize=1024)
st = StanfordNERTagger(stanford + '/classifiers/english.all.3class.distsim.crf.ser.gz',
                       stanford + '/stanford-ner.jar',
                       encoding='utf-8')

rdd_whole_warc_file = rdd = sc.newAPIHadoopFile(
    in_file,
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text",
    conf={"textinputformat.record.delimiter": "WARC/1.0"})

# Clean HTML pages
rdd_html_cleaned = rdd_whole_warc_file.flatMap(lambda x: decode(x, record_attribute))
def _rdd(self):
    if self._lazy_rdd is None:
        jrdd = self._jdf.javaToPython()
        self._lazy_rdd = RDD(jrdd, self._sc, BatchedSerializer(PickleSerializer()))
    return self._lazy_rdd
def test_als_ratings_id_long_error(self):
    ser = PickleSerializer()
    r = Rating(1205640308657491975, 50233468418, 1.0)
    # rating user id exceeds max int value, should fail when pickled
    self.assertRaises(Py4JJavaError,
                      self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads,
                      bytearray(ser.dumps(r)))
else:
    from itertools import imap as map, ifilter as filter


def wrap_function(func, profiler=None):
    def pickle_command(command):
        # the serialized command will be compressed by broadcast
        ser = CloudPickleSerializer()
        pickled_command = ser.dumps(command)
        return pickled_command

    ser = AutoBatchedSerializer(PickleSerializer())
    command = (func, profiler, NoOpSerializer(), ser)
    pickled_command = pickle_command(command)
    return bytearray(pickled_command)


def write_binary_file(path, func):
    with open(path, "wb") as f:
        f.write(wrap_function(func))


ser = AutoBatchedSerializer(PickleSerializer())


def udf(func):
    import mlsql
    p = mlsql.params()
    func_path = p["systemParam"]["funcPath"]
    write_binary_file(func_path, func)
def read_udfs(pickleSer, infile, eval_type):
    runner_conf = {}

    if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                     PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF):

        # Load conf used for pandas_udf evaluation
        num_conf = read_int(infile)
        for i in range(num_conf):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            runner_conf[k] = v

        # NOTE: if timezone is set here, that implies respectSessionTimeZone is True
        timezone = runner_conf.get("spark.sql.session.timeZone", None)
        safecheck = runner_conf.get("spark.sql.execution.pandas.arrowSafeTypeConversion",
                                    "false").lower() == 'true'
        # Used by SQL_GROUPED_MAP_PANDAS_UDF and SQL_SCALAR_PANDAS_UDF when returning StructType
        assign_cols_by_name = runner_conf.get(
            "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true")\
            .lower() == "true"

        # Scalar Pandas UDF handles struct type arguments as pandas DataFrames instead of
        # pandas Series. See SPARK-27240.
        df_for_struct = eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF
        ser = ArrowStreamPandasUDFSerializer(timezone, safecheck, assign_cols_by_name,
                                             df_for_struct)
    else:
        ser = BatchedSerializer(PickleSerializer(), 100)

    num_udfs = read_int(infile)
    udfs = {}
    call_udf = []
    mapper_str = ""
    if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        # Create function like this:
        #   lambda a: f([a[0]], [a[0], a[1]])
        # We assume there is only one UDF here because grouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1

        # See FlatMapGroupsInPandasExec for how arg_offsets are used to
        # distinguish between grouping attributes and data attributes
        arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=0)
        udfs['f'] = udf
        split_offset = arg_offsets[0] + 1
        arg0 = ["a[%d]" % o for o in arg_offsets[1:split_offset]]
        arg1 = ["a[%d]" % o for o in arg_offsets[split_offset:]]
        mapper_str = "lambda a: f([%s], [%s])" % (", ".join(arg0), ", ".join(arg1))
    else:
        # Create function like this:
        #   lambda a: (f0(a[0]), f1(a[1], a[2]), f2(a[3]))
        # In the special case of a single UDF this will return a single result rather
        # than a tuple of results; this is the format that the JVM side expects.
        for i in range(num_udfs):
            arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type, runner_conf,
                                               udf_index=i)
            udfs['f%d' % i] = udf
            args = ["a[%d]" % o for o in arg_offsets]
            call_udf.append("f%d(%s)" % (i, ", ".join(args)))
        mapper_str = "lambda a: (%s)" % (", ".join(call_udf))

    mapper = eval(mapper_str, udfs)
    func = lambda _, it: map(mapper, it)

    # profiling is not supported for UDF
    return func, None, ser, ser
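# A tiny standalone illustration of the eval-based mapper construction above, with dummy
# lambdas standing in for the deserialized UDFs (the offsets are chosen for illustration).
udfs_demo = {'f0': lambda x: x + 1, 'f1': lambda x, y: x * y}
mapper_str_demo = "lambda a: (f0(a[0]), f1(a[1], a[2]))"
mapper_demo = eval(mapper_str_demo, udfs_demo)

# Each input "row" a is indexed by the offsets baked into the generated lambda.
assert mapper_demo([3, 4, 5]) == (4, 20)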
import gc
import time
from collections import defaultdict
from itertools import permutations

import numpy as np
from joblib import dump, load
from tqdm import tqdm

import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.serializers import PickleSerializer
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, LongType, StringType, IntegerType
import pyspark.sql.functions as F

# conf = SparkConf(serializer=PickleSerializer())
spark = SparkSession.builder.appName("user cf on spark").master("local[8]") \
    .config('serializer', PickleSerializer()).getOrCreate()
sc = spark.sparkContext

schema = StructType([StructField('userId', IntegerType(), True),
                     StructField('movieId', IntegerType(), True),
                     StructField('rating', LongType(), True),
                     StructField('timestamp', IntegerType(), True)])
ratings = spark.read.csv(r'D:\Users\hao.guo\比赛代码提炼\推荐系统\movielen\ml-20m\ratings.csv',
                         header=True)
ratings = ratings.withColumn('rating', ratings['rating'].cast('int'))
ratings_rdd = ratings.select(['userId', 'movieId', 'rating']).rdd
# ratings_rdd = ratings_rdd.sample(withReplacement=False, fraction=0.1, seed=2020)
train_rdd, test_rdd = ratings_rdd.randomSplit([0.7, 0.3], seed=2020)
train_rdd = train_rdd.cache()
def __init__(self, memory_limit, serializer=None):
    self.memory_limit = memory_limit
    self.local_dirs = _get_local_dirs("sort")
    self.serializer = serializer or BatchedSerializer(PickleSerializer(), 1024)
def read_udfs(pickleSer, infile, eval_type):
    runner_conf = {}

    if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                     PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                     PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF):

        # Load conf used for pandas_udf evaluation
        num_conf = read_int(infile)
        for i in range(num_conf):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            runner_conf[k] = v

        # NOTE: if timezone is set here, that implies respectSessionTimeZone is True
        timezone = runner_conf.get("spark.sql.session.timeZone", None)
        safecheck = runner_conf.get("spark.sql.execution.pandas.arrowSafeTypeConversion",
                                    "false").lower() == 'true'
        # Used by SQL_GROUPED_MAP_PANDAS_UDF and SQL_SCALAR_PANDAS_UDF when returning StructType
        assign_cols_by_name = runner_conf.get(
            "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true")\
            .lower() == "true"

        # Scalar Pandas UDF handles struct type arguments as pandas DataFrames instead of
        # pandas Series. See SPARK-27240.
        df_for_struct = (eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF or
                         eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF or
                         eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF)
        ser = ArrowStreamPandasUDFSerializer(timezone, safecheck, assign_cols_by_name,
                                             df_for_struct)
    else:
        ser = BatchedSerializer(PickleSerializer(), 100)

    num_udfs = read_int(infile)

    is_scalar_iter = eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF
    is_map_iter = eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF

    if is_scalar_iter or is_map_iter:
        assert num_udfs == 1, "One SCALAR_ITER UDF expected here."

        arg_offsets, udf = read_single_udf(
            pickleSer, infile, eval_type, runner_conf, udf_index=0)

        def func(_, iterator):
            num_input_rows = [0]

            def map_batch(batch):
                udf_args = [batch[offset] for offset in arg_offsets]
                num_input_rows[0] += len(udf_args[0])
                if len(udf_args) == 1:
                    return udf_args[0]
                else:
                    return tuple(udf_args)

            iterator = map(map_batch, iterator)
            result_iter = udf(iterator)

            num_output_rows = 0
            for result_batch, result_type in result_iter:
                num_output_rows += len(result_batch)
                assert is_map_iter or num_output_rows <= num_input_rows[0], \
                    "Pandas SCALAR_ITER UDF outputted more rows than input rows."
                yield (result_batch, result_type)

            if is_scalar_iter:
                try:
                    next(iterator)
                except StopIteration:
                    pass
                else:
                    raise RuntimeError("SQL_SCALAR_PANDAS_ITER_UDF should exhaust the input "
                                       "iterator.")

            if is_scalar_iter and num_output_rows != num_input_rows[0]:
                raise RuntimeError("The number of output rows of pandas iterator UDF should be "
                                   "the same with input rows. The input rows number is %d but the "
                                   "output rows number is %d." %
                                   (num_input_rows[0], num_output_rows))

        # profiling is not supported for UDF
        return func, None, ser, ser

    udfs = {}
    call_udf = []
    mapper_str = ""
    if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        # Create function like this:
        #   lambda a: f([a[0]], [a[0], a[1]])
        # We assume there is only one UDF here because grouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1

        # See FlatMapGroupsInPandasExec for how arg_offsets are used to
        # distinguish between grouping attributes and data attributes
        arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=0)
        udfs['f'] = udf
        split_offset = arg_offsets[0] + 1
        arg0 = ["a[%d]" % o for o in arg_offsets[1:split_offset]]
        arg1 = ["a[%d]" % o for o in arg_offsets[split_offset:]]
        mapper_str = "lambda a: f([%s], [%s])" % (", ".join(arg0), ", ".join(arg1))
    else:
        # Create function like this:
        #   lambda a: (f0(a[0]), f1(a[1], a[2]), f2(a[3]))
        # In the special case of a single UDF this will return a single result rather
        # than a tuple of results; this is the format that the JVM side expects.
        for i in range(num_udfs):
            arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type, runner_conf,
                                               udf_index=i)
            udfs['f%d' % i] = udf
            args = ["a[%d]" % o for o in arg_offsets]
            call_udf.append("f%d(%s)" % (i, ", ".join(args)))
        mapper_str = "lambda a: (%s)" % (", ".join(call_udf))

    mapper = eval(mapper_str, udfs)
    func = lambda _, it: map(mapper, it)

    # profiling is not supported for UDF
    return func, None, ser, ser
def __init__(self, serializer=PickleSerializer()):
    FramedSerializer.__init__(self)
    assert isinstance(serializer, FramedSerializer), "serializer must be a FramedSerializer"
    self.serializer = serializer
def func(sc, *a, **kw):
    jrdd = f(sc, *a, **kw)
    return RDD(sc._jvm.SerDe.javaToPython(jrdd), sc,
               BatchedSerializer(PickleSerializer(), 1024))
def _compressed_serializer(self, serializer=None):
    # always use PickleSerializer to simplify implementation
    ser = PickleSerializer()
    return AutoBatchedSerializer(CompressedSerializer(ser))
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.random import RandomRDDs
from pyspark.mllib.stat import Statistics
from pyspark.serializers import PickleSerializer
from pyspark.sql import SQLContext
from pyspark.tests import ReusedPySparkTestCase as PySparkTestCase

_have_scipy = False
try:
    import scipy.sparse
    _have_scipy = True
except:
    # No SciPy, but that's okay, we'll skip those tests
    pass

ser = PickleSerializer()


def _squared_distance(a, b):
    if isinstance(a, Vector):
        return a.squared_distance(b)
    else:
        return b.squared_distance(a)


class VectorTests(PySparkTestCase):

    def _test_serialize(self, v):
        self.assertEqual(v, ser.loads(ser.dumps(v)))
        jvec = self.sc._jvm.SerDe.loads(bytearray(ser.dumps(v)))
        nv = ser.loads(str(self.sc._jvm.SerDe.dumps(jvec)))
        self.assertEqual(v, nv)
def createDirectStream(ssc, topics, kafkaParams, fromOffsets=None,
                       keyDecoder=utf8_decoder, valueDecoder=utf8_decoder,
                       messageHandler=None):
    """
    .. note:: Experimental

    Create an input stream that directly pulls messages from a Kafka Broker and specific
    offset.

    This is not a receiver based Kafka input stream, it directly pulls the message from Kafka
    in each batch duration and processed without storing.

    This does not use Zookeeper to store offsets. The consumed offsets are tracked
    by the stream itself. For interoperability with Kafka monitoring tools that depend on
    Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
    You can access the offsets used in each batch from the generated RDDs (see

    To recover from driver failures, you have to enable checkpointing in the StreamingContext.
    The information on consumed offset can be recovered from the checkpoint.
    See the programming guide for details (constraints, etc.).

    :param ssc: StreamingContext object.
    :param topics: list of topic_name to consume.
    :param kafkaParams: Additional params for Kafka.
    :param fromOffsets: Per-topic/partition Kafka offsets defining the (inclusive) starting
                        point of the stream.
    :param keyDecoder: A function used to decode key (default is utf8_decoder).
    :param valueDecoder: A function used to decode value (default is utf8_decoder).
    :param messageHandler: A function used to convert KafkaMessageAndMetadata. You can assess
                           meta using messageHandler (default is None).
    :return: A DStream object
    """
    if fromOffsets is None:
        fromOffsets = dict()
    if not isinstance(topics, list):
        raise TypeError("topics should be list")
    if not isinstance(kafkaParams, dict):
        raise TypeError("kafkaParams should be dict")

    def funcWithoutMessageHandler(k_v):
        return (keyDecoder(k_v[0]), valueDecoder(k_v[1]))

    def funcWithMessageHandler(m):
        m._set_key_decoder(keyDecoder)
        m._set_value_decoder(valueDecoder)
        return messageHandler(m)

    helper = KafkaUtils._get_helper(ssc._sc)

    jfromOffsets = dict([(k._jTopicAndPartition(helper), v)
                         for (k, v) in fromOffsets.items()])
    if messageHandler is None:
        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        func = funcWithoutMessageHandler
        jstream = helper.createDirectStreamWithoutMessageHandler(
            ssc._jssc, kafkaParams, set(topics), jfromOffsets)
    else:
        ser = AutoBatchedSerializer(PickleSerializer())
        func = funcWithMessageHandler
        jstream = helper.createDirectStreamWithMessageHandler(
            ssc._jssc, kafkaParams, set(topics), jfromOffsets)

    stream = DStream(jstream, ssc, ser).map(func)
    return KafkaDStream(stream._jdstream, ssc, stream._jrdd_deserializer)
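# A hedged usage sketch of this API (Spark 2.x pyspark.streaming.kafka); the topic name,
# broker address, and batch interval below are placeholders.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition

sc = SparkContext(appName="direct-kafka-demo")
ssc = StreamingContext(sc, 5)  # 5-second batches

# Start reading topic "events", partition 0, from offset 0.
fromOffsets = {TopicAndPartition("events", 0): 0}
stream = KafkaUtils.createDirectStream(
    ssc, ["events"],
    kafkaParams={"metadata.broker.list": "localhost:9092"},
    fromOffsets=fromOffsets)

stream.map(lambda kv: kv[1]).pprint()  # print the message values of each batch
ssc.start()
ssc.awaitTermination()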
def readNonPartitionTable(self, project, table, numPartitions, cols=[], bytesCols=[],
                          batchSize=1):
    jcols = self._to_java_array(cols)
    jbytesCols = self._to_java_array(bytesCols)
    jrdd = self._api.readTable(project, table, jcols, jbytesCols, batchSize, numPartitions)
    return RDD(jrdd, self._sc, PickleSerializer())
def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
             environment=None, batchSize=0, serializer=PickleSerializer(), conf=None,
             gateway=None, jsc=None, profiler_cls=BasicProfiler):
    """
    Create a new SparkContext. At least the master and app name should be set,
    either through the named parameters here or through C{conf}.

    :param master: Cluster URL to connect to
           (e.g. mesos://host:port, spark://host:port, local[4]).
    :param appName: A name for your job, to display on the cluster web UI.
    :param sparkHome: Location where Spark is installed on cluster nodes.
    :param pyFiles: Collection of .zip or .py files to send to the cluster
           and add to PYTHONPATH. These can be paths on the local file
           system or HDFS, HTTP, HTTPS, or FTP URLs.
    :param environment: A dictionary of environment variables to set on
           worker nodes.
    :param batchSize: The number of Python objects represented as a single
           Java object. Set 1 to disable batching, 0 to automatically choose
           the batch size based on object sizes, or -1 to use an unlimited
           batch size.
    :param serializer: The serializer for RDDs.
    :param conf: A L{SparkConf} object setting Spark properties.
    :param gateway: Use an existing gateway and JVM, otherwise a new JVM
           will be instantiated.
    :param jsc: The JavaSparkContext instance (optional).
    :param profiler_cls: A class of custom Profiler used to do profiling
           (default is pyspark.profiler.BasicProfiler).

    >>> from pyspark.context import SparkContext
    >>> sc = SparkContext('local', 'test')

    >>> sc2 = SparkContext('local', 'test2') # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
        ...
    ValueError:...
    """
    self._callsite = first_spark_call() or CallSite(None, None, None)  # record the call site
    # Check whether a SparkContext has already been initialized; raises an error
    # if one is already running.
    SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
    try:
        self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
                      conf, jsc, profiler_cls)
    except:
        # If an error occurs, clean up in order to allow future SparkContext creation:
        self.stop()
        raise