def emptyRDD(self):
    """
    Build an RDD that contains no partitions and no elements.

    The empty JVM RDD is requested from ``self._jsc`` and wrapped in a
    Python :class:`RDD` bound to this context, using a
    :class:`NoOpSerializer` as its deserializer.
    """
    empty_jrdd = self._jsc.emptyRDD()
    passthrough = NoOpSerializer()
    return RDD(empty_jrdd, self, passthrough)
def run_spark_job(queue: Queue,
                  _agg_function: AggregationFunction,
                  _agg_window_millis: int,
                  _spark_opts: dict = {},
                  _environment: dict = {}):
    """
    Run a Spark Streaming job that aggregates Pub/Sub messages and
    enqueues the aggregated results.

    For every batch the Pub/Sub stream is mapped through
    ``utf8_decoder``, read as JSON into a DataFrame, grouped by
    ``application`` and a time window over the ``ts`` column, and
    aggregated with the expression selected by ``_agg_function``. Each
    resulting row is wrapped in an ``InputMessage`` and the output of
    ``message.to_json()`` is put on ``queue``.

    Besides its parameters, this function reads names that are not
    defined in this block and must come from the enclosing scope:
    ``self`` (``logger``, ``agg_window_millis`` and ``str(self)`` as the
    Spark application name), ``project_id``, ``subscription``,
    ``BASE_PATH`` and ``InputMessage``.

    The streaming context and the Spark session are stopped when the
    function exits, whether it returns normally or raises.

    :param queue: Object whose ``put`` method receives the serialized
        messages.
    :param _agg_function: ``AggregationFunction`` member selecting the
        aggregation. Values not listed in the table below fall back to
        the plain ``value`` expression.
    :param _agg_window_millis: Batch interval in milliseconds; it is
        divided by 1000 before being handed to ``StreamingContext``.
    :param _spark_opts: Key/value pairs forwarded to
        ``SparkSession.builder.config``. If it contains ``"timeout"``,
        that value is passed to ``awaitTerminationOrTimeout`` instead of
        waiting indefinitely. Only read here, never mutated.
    :param _environment: Entries merged into ``os.environ`` before Spark
        is initialised. Only read here, never mutated.
    """
    os.environ.update(_environment)

    # Tracked outside the try block so that ``finally`` can release
    # whatever was actually created.
    spark = None
    ssc = None
    try:
        try:
            import findspark
            findspark.init()
        except Exception as ex:
            # Best effort: pyspark may already be importable without
            # findspark, so only report the failure.
            self.logger.warn("Cannot import Spark pyspark with"
                             " findspark. Message: {}".format(str(ex)))

        from pyspark.sql import SparkSession
        from pyspark.streaming import StreamingContext
        from pyspark.sql.functions import expr, window
        from pyspark.serializers import NoOpSerializer
        from pyspark.streaming import DStream
        from pyspark.streaming.kafka import utf8_decoder

        spark_builder = SparkSession.builder
        for key, value in _spark_opts.items():
            spark_builder = spark_builder.config(key, value)
        spark_builder = spark_builder \
            .appName(str(self)) \
            .config("spark.jars.packages",
                    "org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.1,"
                    "org.apache.bahir:spark-streaming-pubsub_2.11:2.2.1") \
            .config("spark.jars",
                    BASE_PATH + "/lib/streaming-pubsub-serializer_2.11-0.1.jar")

        spark = spark_builder.getOrCreate()
        spark.sparkContext.setLogLevel("WARN")
        # Use the parameter (milliseconds -> seconds) as batch duration.
        ssc = StreamingContext(spark.sparkContext,
                               (_agg_window_millis / 1000))

        # SQL expression per aggregation function; compared with ``==``
        # in order, exactly like the if/elif chain it replaces.
        agg_sql_by_function = (
            (AggregationFunction.AVG, "avg(value)"),
            (AggregationFunction.SUM, "sum(value)"),
            (AggregationFunction.COUNT, "count(value)"),
            (AggregationFunction.P50, "percentile(value, 0.5)"),
            (AggregationFunction.P75, "percentile(value, 0.75)"),
            (AggregationFunction.P95, "percentile(value, 0.95)"),
            (AggregationFunction.P99, "percentile(value, 0.99)"),
        )
        agg_sql = next((sql for function, sql in agg_sql_by_function
                        if _agg_function == function), "value")
        agg = expr(agg_sql)

        # JVM-side helpers from the Pub/Sub streaming package.
        pubsub_pkg = ssc._jvm.org.apache.spark.streaming.pubsub
        deserializer = pubsub_pkg.SparkPubsubMessageSerializer()
        pubsub_utils = pubsub_pkg.PubsubUtils
        credentials = pubsub_pkg.SparkGCPCredentials
        storage_level = ssc._jvm.org.apache.spark.storage.StorageLevel

        _pubsub_stream = pubsub_utils \
            .createStream(ssc._jssc, project_id, subscription,
                          credentials.Builder().build(),
                          storage_level.DISK_ONLY())
        _pubsub_stream_des = _pubsub_stream.transform(deserializer)
        ser = NoOpSerializer()
        pubsub_stream = DStream(_pubsub_stream_des, ssc,
                                ser).map(utf8_decoder)

        def aggregate_rdd(_queue, _agg, df, ts):
            """Aggregate one batch DataFrame and enqueue each result row."""
            secs = int(self.agg_window_millis / 1000)
            win = window("ts", "{} seconds".format(secs))
            if not df.first():
                self.logger.warn("Empty RDD")
                return

            aggs = df \
                .groupBy("application", win) \
                .agg(_agg.alias("value")) \
                .collect()
            for row in aggs:
                message = InputMessage(row["application"],
                                       value=row["value"],
                                       ts=ts)
                self.logger.debug("Enqueue: {}".format(message.to_json()))
                try:
                    _queue.put(message.to_json())
                except AssertionError as ex:
                    self.logger.warn(str(ex))

        # Register the per-batch aggregation callback on the stream.
        pubsub_stream \
            .foreachRDD(lambda ts, rdd:
                        aggregate_rdd(queue, agg, spark.read.json(rdd), ts))

        # Run until terminated, or until the optional timeout elapses.
        ssc.start()
        if "timeout" in _spark_opts:
            ssc.awaitTerminationOrTimeout(_spark_opts["timeout"])
        else:
            ssc.awaitTermination()
    finally:
        # Release Spark resources on every exit path; the nested
        # ``finally`` makes sure the session is stopped even if stopping
        # the streaming context fails.
        try:
            if ssc is not None:
                ssc.stop()
        finally:
            if spark is not None:
                spark.stop()
def test_null_in_rdd(self):
    """
    Check that an RDD produced by ``PythonUtils.generateRDDWithNull``
    collects to ``a``, ``None``, ``b`` both as text (with
    ``UTF8Deserializer``) and as raw bytes (with ``NoOpSerializer``).
    """
    java_rdd = self.sc._jvm.PythonUtils.generateRDDWithNull(self.sc._jsc)
    cases = (
        (UTF8Deserializer, [u"a", None, u"b"]),
        (NoOpSerializer, [b"a", None, b"b"]),
    )
    for deserializer_cls, expected in cases:
        wrapped = RDD(java_rdd, self.sc, deserializer_cls())
        self.assertEqual(expected, wrapped.collect())
def createDirectStream(ssc, topics, kafkaParams, fromOffsets=None,
                       keyDecoder=utf8_decoder, valueDecoder=utf8_decoder,
                       messageHandler=None):
    """
    .. note:: Experimental

    Create an input stream that directly pulls messages from a Kafka Broker
    and specific offset.

    This is not a receiver based Kafka input stream, it directly pulls the
    message from Kafka in each batch duration and processed without storing.

    This does not use Zookeeper to store offsets. The consumed offsets are
    tracked by the stream itself. For interoperability with Kafka monitoring
    tools that depend on Zookeeper, you have to update Kafka/Zookeeper
    yourself from the streaming application. You can access the offsets used
    in each batch from the generated RDDs.

    To recover from driver failures, you have to enable checkpointing in the
    StreamingContext. The information on consumed offset can be recovered
    from the checkpoint. See the programming guide for details (constraints,
    etc.).

    :param ssc: StreamingContext object.
    :param topics: list of topic_name to consume.
    :param kafkaParams: Additional params for Kafka.
    :param fromOffsets: Per-topic/partition Kafka offsets defining the
                        (inclusive) starting point of the stream.
    :param keyDecoder: A function used to decode key (default is
                       utf8_decoder).
    :param valueDecoder: A function used to decode value (default is
                         utf8_decoder).
    :param messageHandler: A function used to convert
                           KafkaMessageAndMetadata. You can access meta
                           using messageHandler (default is None).
    :return: A DStream object
    """
    offsets = dict() if fromOffsets is None else fromOffsets
    if not isinstance(topics, list):
        raise TypeError("topics should be list")
    if not isinstance(kafkaParams, dict):
        raise TypeError("kafkaParams should be dict")

    def decode_pair(pair):
        # Raw (key, value) pair -> decoded (key, value) pair.
        return (keyDecoder(pair[0]), valueDecoder(pair[1]))

    def decode_with_handler(message):
        # Install the decoders on the message, then let the user-supplied
        # handler produce the final record.
        message._set_key_decoder(keyDecoder)
        message._set_value_decoder(valueDecoder)
        return messageHandler(message)

    helper = KafkaUtils._get_helper(ssc._sc)
    jfromOffsets = {
        topic_partition._jTopicAndPartition(helper): offset
        for topic_partition, offset in offsets.items()
    }

    if messageHandler is not None:
        ser = AutoBatchedSerializer(PickleSerializer())
        func = decode_with_handler
        jstream = helper.createDirectStreamWithMessageHandler(
            ssc._jssc, kafkaParams, set(topics), jfromOffsets)
    else:
        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        func = decode_pair
        jstream = helper.createDirectStreamWithoutMessageHandler(
            ssc._jssc, kafkaParams, set(topics), jfromOffsets)

    decoded = DStream(jstream, ssc, ser).map(func)
    return KafkaDStream(decoded._jdstream, ssc, decoded._jrdd_deserializer)
def createStream(
    ssc: StreamingContext,
    kinesisAppName: str,
    streamName: str,
    endpointUrl: str,
    regionName: str,
    initialPositionInStream: str,
    checkpointInterval: int,
    storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_2,
    awsAccessKeyId: Optional[str] = None,
    awsSecretKey: Optional[str] = None,
    decoder: Union[Callable[[Optional[bytes]], T],
                   Callable[[Optional[bytes]], Optional[str]]] = utf8_decoder,
    stsAssumeRoleArn: Optional[str] = None,
    stsSessionName: Optional[str] = None,
    stsExternalId: Optional[str] = None,
) -> Union["DStream[Union[T, Optional[str]]]", "DStream[T]"]:
    """
    Create an input stream that pulls messages from a Kinesis stream.
    This uses the Kinesis Client Library (KCL) to pull messages from
    Kinesis.

    Parameters
    ----------
    ssc : :class:`StreamingContext`
        StreamingContext object
    kinesisAppName : str
        Kinesis application name used by the Kinesis Client Library (KCL)
        to update DynamoDB
    streamName : str
        Kinesis stream name
    endpointUrl : str
        Url of Kinesis service
        (e.g., https://kinesis.us-east-1.amazonaws.com)
    regionName : str
        Name of region used by the Kinesis Client Library (KCL) to update
        DynamoDB (lease coordination and checkpointing) and CloudWatch
        (metrics)
    initialPositionInStream : int
        In the absence of Kinesis checkpoint info, this is the worker's
        initial starting position in the stream. The values are either the
        beginning of the stream per Kinesis' limit of 24 hours
        (InitialPositionInStream.TRIM_HORIZON) or the tip of the stream
        (InitialPositionInStream.LATEST).
        NOTE(review): the signature annotates this parameter as ``str``
        -- confirm which type is intended.
    checkpointInterval : int
        Checkpoint interval(in seconds) for Kinesis checkpointing. See the
        Kinesis Spark Streaming documentation for more details on the
        different types of checkpoints.
    storageLevel : :class:`pyspark.StorageLevel`, optional
        Storage level to use for storing the received objects (default is
        StorageLevel.MEMORY_AND_DISK_2)
    awsAccessKeyId : str, optional
        AWS AccessKeyId (default is None. If None, will use
        DefaultAWSCredentialsProviderChain)
    awsSecretKey : str, optional
        AWS SecretKey (default is None. If None, will use
        DefaultAWSCredentialsProviderChain)
    decoder : function, optional
        A function used to decode value (default is utf8_decoder)
    stsAssumeRoleArn : str, optional
        ARN of IAM role to assume when using STS sessions to read from the
        Kinesis stream (default is None).
    stsSessionName : str, optional
        Name to uniquely identify STS sessions used to read from Kinesis
        stream, if STS is being used (default is None).
    stsExternalId : str, optional
        External ID that can be used to validate against the assumed IAM
        role's trust policy, if STS is being used (default is None).

    Returns
    -------
    A DStream object

    Notes
    -----
    The given AWS credentials will get saved in DStream checkpoints if
    checkpointing is enabled. Make sure that your checkpoint directory is
    secure.
    """
    java_storage_level = ssc._sc._getJavaStorageLevel(  # type: ignore[attr-defined]
        storageLevel
    )
    java_checkpoint_interval = ssc._jduration(  # type: ignore[attr-defined]
        checkpointInterval
    )

    try:
        helper = (
            ssc._jvm.org.apache.spark.streaming.kinesis.  # type: ignore[attr-defined]
            KinesisUtilsPythonHelper()
        )
    except TypeError as e:
        # This exact message is what the call raises when the helper
        # resolves to a JavaPackage rather than a class; print the
        # missing-jar hint before re-raising.
        if str(e) == "'JavaPackage' object is not callable":
            _print_missing_jar(
                "Streaming's Kinesis",
                "streaming-kinesis-asl",
                "streaming-kinesis-asl-assembly",
                ssc.sparkContext.version,
            )
        raise

    helper_args = (
        ssc._jssc,  # type: ignore[attr-defined]
        kinesisAppName,
        streamName,
        endpointUrl,
        regionName,
        initialPositionInStream,
        java_checkpoint_interval,
        java_storage_level,
        awsAccessKeyId,
        awsSecretKey,
        stsAssumeRoleArn,
        stsSessionName,
        stsExternalId,
    )
    jstream = helper.createStream(*helper_args)

    raw_stream: DStream = DStream(jstream, ssc, NoOpSerializer())
    return raw_stream.map(lambda v: decoder(v))
def _toPythonDStream(ssc, jstream, bodyDecoder):
    """
    Wrap a JVM stream in a Python :class:`DStream` whose elements are
    read as pairs, with both members passed through a
    :class:`NoOpSerializer`.

    ``bodyDecoder`` is accepted but not used by the code in this block.
    """
    pair_serializer = PairDeserializer(NoOpSerializer(), NoOpSerializer())
    return DStream(jstream, ssc, pair_serializer)
class KafkaUtils(object):
    """Factory helpers for creating Kafka-backed DStreams."""

    @staticmethod
    def createStream(ssc, zkQuorum, groupId, topics, kafkaParams=None,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2,
                     keyDecoder=utf8_decoder, valueDecoder=utf8_decoder):
        """
        Create an input stream that pulls messages from a Kafka Broker.

        :param ssc:  StreamingContext object
        :param zkQuorum:  Zookeeper quorum (hostname:port,hostname:port,..).
        :param groupId:  The group id for this consumer.
        :param topics:  Dict of (topic_name -> numPartitions) to consume.
                        Each partition is consumed in its own thread.
        :param kafkaParams: Additional params for Kafka (default is None,
                            meaning no additional params). The mapping is
                            copied, so the caller's object is not modified.
        :param storageLevel:  RDD storage level.
        :param keyDecoder:  A function used to decode key (default is
                            utf8_decoder)
        :param valueDecoder:  A function used to decode value (default is
                              utf8_decoder)
        :return: A DStream object
        :raises TypeError: if ``topics`` is not a dict.
        """
        if not isinstance(topics, dict):
            raise TypeError("topics should be dict")

        # Work on a private copy: updating the argument in place would
        # modify the caller's dict and, with a shared ``{}`` default, leak
        # settings from one call into the next.
        params = dict(kafkaParams) if kafkaParams is not None else {}
        params.update({
            "zookeeper.connect": zkQuorum,
            "group.id": groupId,
            "zookeeper.connection.timeout.ms": "10000",
        })

        gateway_client = ssc.sparkContext._gateway._gateway_client
        jtopics = MapConverter().convert(topics, gateway_client)
        jparam = MapConverter().convert(params, gateway_client)
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)

        try:
            # Use KafkaUtilsPythonHelper to access Scala's KafkaUtils
            # (see SPARK-6027)
            helperClass = ssc._jvm.java.lang.Thread.currentThread() \
                .getContextClassLoader() \
                .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
            helper = helperClass.newInstance()
            jstream = helper.createStream(ssc._jssc, jparam, jtopics, jlevel)
        except Py4JJavaError as e:
            # TODO: use --jar once it also work on driver
            if 'ClassNotFoundException' in str(e.java_exception):
                print("""
________________________________________________________________________________________________

  Spark Streaming's Kafka libraries not found in class path. Try one of the following.

  1. Include the Kafka library and its dependencies with in the
     spark-submit command as

     $ bin/spark-submit --packages org.apache.spark:spark-streaming-kafka:%s ...

  2. Download the JAR of the artifact from Maven Central http://search.maven.org/,
     Group Id = org.apache.spark, Artifact Id = spark-streaming-kafka-assembly, Version = %s.
     Then, include the jar in the spark-submit command as

     $ bin/spark-submit --jars <spark-streaming-kafka-assembly.jar> ...

________________________________________________________________________________________________

""" % (ssc.sparkContext.version, ssc.sparkContext.version))
            # Bare ``raise`` re-raises with the original traceback intact.
            raise

        def decode_pair(k_v):
            # Tuple parameters (``lambda (k, v): ...``) are not valid in
            # Python 3, so the pair is indexed instead.
            return (keyDecoder(k_v[0]), valueDecoder(k_v[1]))

        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        stream = DStream(jstream, ssc, ser)
        return stream.map(decode_pair)
def createStream(ssc, kinesisAppName, streamName, endpointUrl, regionName,
                 initialPositionInStream, checkpointInterval,
                 storageLevel=StorageLevel.MEMORY_AND_DISK_2,
                 awsAccessKeyId=None, awsSecretKey=None, decoder=utf8_decoder,
                 stsAssumeRoleArn=None, stsSessionName=None,
                 stsExternalId=None):
    """
    Create an input stream that pulls messages from a Kinesis stream. This
    uses the Kinesis Client Library (KCL) to pull messages from Kinesis.

    .. note:: The given AWS credentials will get saved in DStream
        checkpoints if checkpointing is enabled. Make sure that your
        checkpoint directory is secure.

    :param ssc:  StreamingContext object
    :param kinesisAppName:  Kinesis application name used by the Kinesis
            Client Library (KCL) to update DynamoDB
    :param streamName:  Kinesis stream name
    :param endpointUrl:  Url of Kinesis service
            (e.g., https://kinesis.us-east-1.amazonaws.com)
    :param regionName:  Name of region used by the Kinesis Client Library
            (KCL) to update DynamoDB (lease coordination and checkpointing)
            and CloudWatch (metrics)
    :param initialPositionInStream:  In the absence of Kinesis checkpoint
            info, this is the worker's initial starting position in the
            stream. The values are either the beginning of the stream per
            Kinesis' limit of 24 hours
            (InitialPositionInStream.TRIM_HORIZON) or the tip of the
            stream (InitialPositionInStream.LATEST).
    :param checkpointInterval:  Checkpoint interval for Kinesis
            checkpointing. See the Kinesis Spark Streaming documentation
            for more details on the different types of checkpoints.
    :param storageLevel:  Storage level to use for storing the received
            objects (default is StorageLevel.MEMORY_AND_DISK_2)
    :param awsAccessKeyId:  AWS AccessKeyId (default is None. If None, will
            use DefaultAWSCredentialsProviderChain)
    :param awsSecretKey:  AWS SecretKey (default is None. If None, will use
            DefaultAWSCredentialsProviderChain)
    :param decoder:  A function used to decode value (default is
            utf8_decoder)
    :param stsAssumeRoleArn: ARN of IAM role to assume when using STS
            sessions to read from the Kinesis stream (default is None).
    :param stsSessionName: Name to uniquely identify STS sessions used to
            read from Kinesis stream, if STS is being used (default is
            None).
    :param stsExternalId: External ID that can be used to validate against
            the assumed IAM role's trust policy, if STS is being used
            (default is None).
    :return: A DStream object
    """
    java_storage_level = ssc._sc._getJavaStorageLevel(storageLevel)
    java_checkpoint_interval = ssc._jduration(checkpointInterval)

    try:
        # Use KinesisUtilsPythonHelper to access Scala's KinesisUtils
        helper = \
            ssc._jvm.org.apache.spark.streaming.kinesis.KinesisUtilsPythonHelper()
    except TypeError as e:
        # This exact message is what the call raises when the helper
        # resolves to a JavaPackage rather than a class; print the
        # missing-jar hint before re-raising.
        if str(e) == "'JavaPackage' object is not callable":
            _print_missing_jar(
                "Streaming's Kinesis",
                "streaming-kinesis-asl",
                "streaming-kinesis-asl-assembly",
                ssc.sparkContext.version)
        raise

    helper_args = (
        ssc._jssc, kinesisAppName, streamName, endpointUrl, regionName,
        initialPositionInStream, java_checkpoint_interval,
        java_storage_level, awsAccessKeyId, awsSecretKey,
        stsAssumeRoleArn, stsSessionName, stsExternalId,
    )
    jstream = helper.createStream(*helper_args)

    raw_stream = DStream(jstream, ssc, NoOpSerializer())
    return raw_stream.map(lambda v: decoder(v))
def createRDD(sc, kafkaParams, offsetRanges, leaders=None,
              keyDecoder=utf8_decoder, valueDecoder=utf8_decoder,
              messageHandler=None):
    """
    .. note:: Experimental

    Create a RDD from Kafka using offset ranges for each topic and
    partition.

    :param sc:  SparkContext object
    :param kafkaParams: Additional params for Kafka
    :param offsetRanges:  list of offsetRange to specify
        topic:partition:[start, end) to consume
    :param leaders: Kafka brokers for each TopicAndPartition in
        offsetRanges.  May be an empty map, in which case leaders will be
        looked up on the driver.
    :param keyDecoder:  A function used to decode key (default is
                        utf8_decoder)
    :param valueDecoder:  A function used to decode value (default is
                          utf8_decoder)
    :param messageHandler: A function used to convert
                           KafkaMessageAndMetadata. You can access meta
                           using messageHandler (default is None).
    :return: A RDD object
    """
    broker_by_partition = dict() if leaders is None else leaders
    if not isinstance(kafkaParams, dict):
        raise TypeError("kafkaParams should be dict")
    if not isinstance(offsetRanges, list):
        raise TypeError("offsetRanges should be list")

    def decode_pair(pair):
        # Raw (key, value) pair -> decoded (key, value) pair.
        return (keyDecoder(pair[0]), valueDecoder(pair[1]))

    def decode_with_handler(message):
        # Install the decoders on the message, then let the user-supplied
        # handler produce the final record.
        message._set_key_decoder(keyDecoder)
        message._set_value_decoder(valueDecoder)
        return messageHandler(message)

    try:
        class_loader = \
            sc._jvm.java.lang.Thread.currentThread().getContextClassLoader()
        helper = class_loader.loadClass(
            "org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper"
        ).newInstance()

        joffsetRanges = [
            offset_range._jOffsetRange(helper) for offset_range in offsetRanges
        ]
        jleaders = {
            topic_partition._jTopicAndPartition(helper): broker._jBroker(helper)
            for topic_partition, broker in broker_by_partition.items()
        }

        if messageHandler is not None:
            jrdd = helper.createRDDWithMessageHandler(
                sc._jsc, kafkaParams, joffsetRanges, jleaders)
            rdd = RDD(jrdd, sc).map(decode_with_handler)
        else:
            jrdd = helper.createRDDWithoutMessageHandler(
                sc._jsc, kafkaParams, joffsetRanges, jleaders)
            ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
            rdd = RDD(jrdd, sc, ser).map(decode_pair)
    except Py4JJavaError as e:
        # A missing helper class gets an explanatory message printed
        # before the error is propagated.
        if 'ClassNotFoundException' in str(e.java_exception):
            KafkaUtils._printErrorMsg(sc)
        raise e

    return KafkaRDD(rdd._jrdd, sc, rdd._jrdd_deserializer)