def wholeTextFiles(self, path, minPartitions=None, use_unicode=True): """ Read a directory of text files from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI. Each file is read as a single record and returned in a key-value pair, where the key is the path of each file, the value is the content of each file. If use_unicode is False, the strings will be kept as `str` (encoding as `utf-8`), which is faster and smaller than unicode. (Added in Spark 1.2) For example, if you have the following files:: hdfs://a-hdfs-path/part-00000 hdfs://a-hdfs-path/part-00001 ... hdfs://a-hdfs-path/part-nnnnn Do C{rdd = sparkContext.wholeTextFiles("hdfs://a-hdfs-path")}, then C{rdd} contains:: (a-hdfs-path/part-00000, its content) (a-hdfs-path/part-00001, its content) ... (a-hdfs-path/part-nnnnn, its content) .. note:: Small files are preferred, as each file will be loaded fully in memory. >>> dirPath = os.path.join(tempdir, "files") >>> os.mkdir(dirPath) >>> with open(os.path.join(dirPath, "1.txt"), "w") as file1: ... _ = file1.write("1") >>> with open(os.path.join(dirPath, "2.txt"), "w") as file2: ... _ = file2.write("2") >>> textFiles = sc.wholeTextFiles(dirPath) >>> sorted(textFiles.collect()) [(u'.../1.txt', u'1'), (u'.../2.txt', u'2')] """ minPartitions = minPartitions or self.defaultMinPartitions return RDD(self._jsc.wholeTextFiles(path, minPartitions), self, PairDeserializer(UTF8Deserializer(use_unicode), UTF8Deserializer(use_unicode)))
def createRDD(sc, kafkaParams, offsetRanges, leaders={}, keyDecoder=utf8_decoder, valueDecoder=utf8_decoder): """ .. note:: Experimental Create a RDD from Kafka using offset ranges for each topic and partition. :param sc: SparkContext object :param kafkaParams: Additional params for Kafka :param offsetRanges: list of offsetRange to specify topic:partition:[start, end) to consume :param leaders: Kafka brokers for each TopicAndPartition in offsetRanges. May be an empty map, in which case leaders will be looked up on the driver. :param keyDecoder: A function used to decode key (default is utf8_decoder) :param valueDecoder: A function used to decode value (default is utf8_decoder) :return: A RDD object """ if not isinstance(kafkaParams, dict): raise TypeError("kafkaParams should be dict") if not isinstance(offsetRanges, list): raise TypeError("offsetRanges should be list") try: helperClass = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \ .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper") helper = helperClass.newInstance() joffsetRanges = [o._jOffsetRange(helper) for o in offsetRanges] jleaders = dict([(k._jTopicAndPartition(helper), v._jBroker(helper)) for (k, v) in leaders.items()]) jrdd = helper.createRDD(sc._jsc, kafkaParams, joffsetRanges, jleaders) except Py4JJavaError as e: if 'ClassNotFoundException' in str(e.java_exception): KafkaUtils._printErrorMsg(sc) raise e ser = PairDeserializer(NoOpSerializer(), NoOpSerializer()) rdd = RDD(jrdd, sc, ser) return rdd.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
def createStream(ssc, zkQuorum, groupId, topics, kafkaParams=None, storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2, keyDecoder=utf8_decoder, valueDecoder=utf8_decoder): """ Create an input stream that pulls messages from a Kafka Broker. :param ssc: StreamingContext object :param zkQuorum: Zookeeper quorum (hostname:port,hostname:port,..). :param groupId: The group id for this consumer. :param topics: Dict of (topic_name -> numPartitions) to consume. Each partition is consumed in its own thread. :param kafkaParams: Additional params for Kafka :param storageLevel: RDD storage level. :param keyDecoder: A function used to decode key (default is utf8_decoder) :param valueDecoder: A function used to decode value (default is utf8_decoder) :return: A DStream object """ if kafkaParams is None: kafkaParams = dict() kafkaParams.update({ "zookeeper.connect": zkQuorum, "group.id": groupId, "zookeeper.connection.timeout.ms": "10000", }) if not isinstance(topics, dict): raise TypeError("topics should be dict") jlevel = ssc._sc._getJavaStorageLevel(storageLevel) try: # Use KafkaUtilsPythonHelper to access Scala's KafkaUtils (see SPARK-6027) helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()\ .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper") helper = helperClass.newInstance() jstream = helper.createStream(ssc._jssc, kafkaParams, topics, jlevel) except Py4JJavaError as e: # TODO: use --jar once it also work on driver if 'ClassNotFoundException' in str(e.java_exception): KafkaUtils._printErrorMsg(ssc.sparkContext) raise e ser = PairDeserializer(NoOpSerializer(), NoOpSerializer()) stream = DStream(jstream, ssc, ser) return stream.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
def createPairedStream(ssc, brokerUrl, topics, storageLevel=StorageLevel.MEMORY_AND_DISK_2): """ Create an input stream that pulls messages from a Mqtt Broker. :param ssc: StreamingContext object :param brokerUrl: Url of remote mqtt publisher :param topics: topic names to subscribe to :param storageLevel: RDD storage level. :return: A DStream object """ jlevel = ssc._sc._getJavaStorageLevel(storageLevel) helper = MQTTUtils._get_helper(ssc._sc) topics_array = MQTTUtils._list_to_java_string_array(ssc._sc, topics) jstream = helper.createPairedStream(ssc._jssc, brokerUrl, topics_array, jlevel) return DStream( jstream, ssc, PairDeserializer(UTF8Deserializer(), UTF8Deserializer()))
def createStream(ssc, zkQuorum, groupId, topics, kafkaParams=None, storageLevel=StorageLevel.MEMORY_AND_DISK_2, keyDecoder=utf8_decoder, valueDecoder=utf8_decoder): """ Create an input stream that pulls messages from a Kafka Broker. :param ssc: StreamingContext object :param zkQuorum: Zookeeper quorum (hostname:port,hostname:port,..). :param groupId: The group id for this consumer. :param topics: Dict of (topic_name -> numPartitions) to consume. Each partition is consumed in its own thread. :param kafkaParams: Additional params for Kafka :param storageLevel: RDD storage level. :param keyDecoder: A function used to decode key (default is utf8_decoder) :param valueDecoder: A function used to decode value (default is utf8_decoder) :return: A DStream object .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. See SPARK-21893. """ warnings.warn( "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. " "See SPARK-21893.", DeprecationWarning) if kafkaParams is None: kafkaParams = dict() kafkaParams.update({ "zookeeper.connect": zkQuorum, "group.id": groupId, "zookeeper.connection.timeout.ms": "10000", }) if not isinstance(topics, dict): raise TypeError("topics should be dict") jlevel = ssc._sc._getJavaStorageLevel(storageLevel) helper = KafkaUtils._get_helper(ssc._sc) jstream = helper.createStream(ssc._jssc, kafkaParams, topics, jlevel) ser = PairDeserializer(NoOpSerializer(), NoOpSerializer()) stream = DStream(jstream, ssc, ser) return stream.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
def wholeTextFiles(self, path): """ Read a directory of text files from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI. Each file is read as a single record and returned in a key-value pair, where the key is the path of each file, the value is the content of each file. For example, if you have the following files:: hdfs://a-hdfs-path/part-00000 hdfs://a-hdfs-path/part-00001 ... hdfs://a-hdfs-path/part-nnnnn Do C{rdd = sparkContext.wholeTextFiles("hdfs://a-hdfs-path")}, then C{rdd} contains:: (a-hdfs-path/part-00000, its content) (a-hdfs-path/part-00001, its content) ... (a-hdfs-path/part-nnnnn, its content) NOTE: Small files are preferred, as each file will be loaded fully in memory. >>> dirPath = os.path.join(tempdir, "files") >>> os.mkdir(dirPath) >>> with open(os.path.join(dirPath, "1.txt"), "w") as file1: ... file1.write("1") >>> with open(os.path.join(dirPath, "2.txt"), "w") as file2: ... file2.write("2") >>> textFiles = sc.wholeTextFiles(dirPath) >>> sorted(textFiles.collect()) [(u'.../1.txt', u'1'), (u'.../2.txt', u'2')] """ return RDD(self._jsc.wholeTextFiles(path), self, PairDeserializer(UTF8Deserializer(), UTF8Deserializer()))
def _toPythonDStream(ssc, jstream, bodyDecoder): ser = PairDeserializer(NoOpSerializer(), NoOpSerializer()) stream = DStream(jstream, ssc, ser) return stream
from pyspark.streaming import StreamingContext sc = SparkContext("local[2]", appName="jms py") ssc = StreamingContext(sc, 5) helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader( ).loadClass("com.redhat.spark.streaming.jms.JMSUtilsPythonHelper") helper = helperClass.newInstance() jbrokerURL = "amqp://127.0.0.1:5672" jqueuename = "default" jlevel = ssc._sc._getJavaStorageLevel(StorageLevel.MEMORY_AND_DISK_SER_2) jstream = helper.createStream(ssc._jssc, jbrokerURL, jqueuename, jlevel) ser = PairDeserializer(NoOpSerializer(), NoOpSerializer()) stream = DStream(jstream, ssc, ser) utf8_decoder = lambda s: s and s.decode('utf-8') keyDecoder = utf8_decoder valueDecoder = utf8_decoder a = stream.map(lambda (k, v): (keyDecoder(k), valueDecoder(v))) def process(rdd): print rdd.count() def protect(func): def _protect(rdd): if rdd.take(1): func(rdd)
def createDirectStream(ssc, topics, kafkaParams, fromOffsets=None, keyDecoder=utf8_decoder, valueDecoder=utf8_decoder, messageHandler=None): """ .. note:: Experimental Create an input stream that directly pulls messages from a Kafka Broker and specific offset. This is not a receiver based Kafka input stream, it directly pulls the message from Kafka in each batch duration and processed without storing. This does not use Zookeeper to store offsets. The consumed offsets are tracked by the stream itself. For interoperability with Kafka monitoring tools that depend on Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application. You can access the offsets used in each batch from the generated RDDs (see To recover from driver failures, you have to enable checkpointing in the StreamingContext. The information on consumed offset can be recovered from the checkpoint. See the programming guide for details (constraints, etc.). :param ssc: StreamingContext object. :param topics: list of topic_name to consume. :param kafkaParams: Additional params for Kafka. :param fromOffsets: Per-topic/partition Kafka offsets defining the (inclusive) starting point of the stream. :param keyDecoder: A function used to decode key (default is utf8_decoder). :param valueDecoder: A function used to decode value (default is utf8_decoder). :param messageHandler: A function used to convert KafkaMessageAndMetadata. You can assess meta using messageHandler (default is None). :return: A DStream object """ if fromOffsets is None: fromOffsets = dict() if not isinstance(topics, list): raise TypeError("topics should be list") if not isinstance(kafkaParams, dict): raise TypeError("kafkaParams should be dict") def funcWithoutMessageHandler(k_v): return (keyDecoder(k_v[0]), valueDecoder(k_v[1])) def funcWithMessageHandler(m): m._set_key_decoder(keyDecoder) m._set_value_decoder(valueDecoder) return messageHandler(m) helper = KafkaUtils._get_helper(ssc._sc) jfromOffsets = dict([(k._jTopicAndPartition(helper), v) for (k, v) in fromOffsets.items()]) if messageHandler is None: ser = PairDeserializer(NoOpSerializer(), NoOpSerializer()) func = funcWithoutMessageHandler jstream = helper.createDirectStreamWithoutMessageHandler( ssc._jssc, kafkaParams, set(topics), jfromOffsets) else: ser = AutoBatchedSerializer(PickleSerializer()) func = funcWithMessageHandler jstream = helper.createDirectStreamWithMessageHandler( ssc._jssc, kafkaParams, set(topics), jfromOffsets) stream = DStream(jstream, ssc, ser).map(func) return KafkaDStream(stream._jdstream, ssc, stream._jrdd_deserializer)
class KafkaUtils(object): @staticmethod def createStream(ssc, zkQuorum, groupId, topics, kafkaParams={}, storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2, keyDecoder=utf8_decoder, valueDecoder=utf8_decoder): """ Create an input stream that pulls messages from a Kafka Broker. :param ssc: StreamingContext object :param zkQuorum: Zookeeper quorum (hostname:port,hostname:port,..). :param groupId: The group id for this consumer. :param topics: Dict of (topic_name -> numPartitions) to consume. Each partition is consumed in its own thread. :param kafkaParams: Additional params for Kafka :param storageLevel: RDD storage level. :param keyDecoder: A function used to decode key (default is utf8_decoder) :param valueDecoder: A function used to decode value (default is utf8_decoder) :return: A DStream object """ kafkaParams.update({ "zookeeper.connect": zkQuorum, "group.id": groupId, "zookeeper.connection.timeout.ms": "10000", }) if not isinstance(topics, dict): raise TypeError("topics should be dict") jtopics = MapConverter().convert( topics, ssc.sparkContext._gateway._gateway_client) jparam = MapConverter().convert( kafkaParams, ssc.sparkContext._gateway._gateway_client) jlevel = ssc._sc._getJavaStorageLevel(storageLevel) try: # Use KafkaUtilsPythonHelper to access Scala's KafkaUtils (see SPARK-6027) helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()\ .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper") helper = helperClass.newInstance() jstream = helper.createStream(ssc._jssc, jparam, jtopics, jlevel) except Py4JJavaError, e: # TODO: use --jar once it also work on driver if 'ClassNotFoundException' in str(e.java_exception): print """ ________________________________________________________________________________________________ Spark Streaming's Kafka libraries not found in class path. Try one of the following. 1. Include the Kafka library and its dependencies with in the spark-submit command as $ bin/spark-submit --packages org.apache.spark:spark-streaming-kafka:%s ... 2. Download the JAR of the artifact from Maven Central http://search.maven.org/, Group Id = org.apache.spark, Artifact Id = spark-streaming-kafka-assembly, Version = %s. Then, include the jar in the spark-submit command as $ bin/spark-submit --jars <spark-streaming-kafka-assembly.jar> ... ________________________________________________________________________________________________ """ % (ssc.sparkContext.version, ssc.sparkContext.version) raise e ser = PairDeserializer(NoOpSerializer(), NoOpSerializer()) stream = DStream(jstream, ssc, ser) return stream.map(lambda (k, v): (keyDecoder(k), valueDecoder(v)))