def test_stop_only_streaming_context(self):
    self.sc = SparkContext(master=self.master, appName=self.appName)
    self.ssc = StreamingContext(sparkContext=self.sc, duration=self.batachDuration)
    self._addInputStream(self.ssc)
    self.ssc.start()
    self.ssc.stop(False)
    self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5)
def test_stop_multiple_times(self):
    self.ssc = StreamingContext(master=self.master, appName=self.appName,
                                duration=self.batachDuration)
    self._addInputStream(self.ssc)
    self.ssc.start()
    self.ssc.stop()
    self.ssc.stop()
def _writeAndVerify(self, ports):
    # Set up the streaming context and input streams
    ssc = StreamingContext(self.sc, self.duration)
    try:
        addresses = [("localhost", port) for port in ports]
        dstream = FlumeUtils.createPollingStream(
            ssc,
            addresses,
            maxBatchSize=self._utils.eventsPerBatch(),
            parallelism=5)
        outputBuffer = []

        def get_output(_, rdd):
            for e in rdd.collect():
                outputBuffer.append(e)

        dstream.foreachRDD(get_output)
        ssc.start()
        self._utils.sendDatAndEnsureAllDataHasBeenReceived()
        self.wait_for(outputBuffer, self._utils.getTotalEvents())
        outputHeaders = [event[0] for event in outputBuffer]
        outputBodies = [event[1] for event in outputBuffer]
        self._utils.assertOutput(outputHeaders, outputBodies)
    finally:
        ssc.stop(False)
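# Several snippets above and below rely on a wait_for helper from the test base
# class, which is not shown here. A minimal sketch, assuming it simply polls a
# shared result list until enough items have arrived or a timeout expires:
import time


def wait_for(self, result, n, timeout=10):
    # Poll the collected output until it holds at least n items or we time out.
    deadline = time.time() + timeout
    while len(result) < n and time.time() < deadline:
        time.sleep(0.01)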
def test_from_conf_with_settings(self):
    conf = SparkConf()
    conf.set("spark.cleaner.ttl", "10")
    conf.setMaster(self.master)
    conf.setAppName(self.appName)
    self.ssc = StreamingContext(conf=conf, duration=self.batachDuration)
    self.assertEqual(int(self.ssc.sparkContext._conf.get("spark.cleaner.ttl")), 10)
def main(argv):
    # Default port and output file, overridden by command-line arguments if given.
    port = int(argv[0]) if len(argv) > 0 else 9999
    out_file = argv[1] if len(argv) > 1 else "myout2"

    # Truncate (or create) the output file before streaming starts.
    open(out_file, "w").close()

    conf = SparkConf().setMaster("local[*]") \
        .setAppName("Flajolet-Martin") \
        .set("spark.executor.memory", "4g") \
        .set("spark.driver.memory", "4g")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("OFF")
    ssc = StreamingContext(sc, BATCH_DURATION)

    stream = ssc.socketTextStream("localhost", port) \
        .window(WINDOW_LENGTH, SLIDING_INTERVAL) \
        .map(lambda x: json.loads(x))

    hashParams = hashFuncs()

    # Write the CSV header once; per-batch rows are appended in Flajolet_Martin.
    with open(out_file, "a") as fout:
        csv.writer(fout).writerow(["Time", "Ground Truth", "Estimation"])

    stream.map(lambda x: x["city"]).filter(lambda x: x != "") \
        .foreachRDD(lambda rdd: Flajolet_Martin(rdd, hashParams, out_file))

    ssc.start()
    ssc.awaitTermination()
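# The hashFuncs() and Flajolet_Martin() helpers used above are not shown here.
# What follows is only a sketch of what they might look like: a family of linear
# hash functions plus the usual average-then-median combination of 2^R estimates.
# NUM_HASHES, GROUP_SIZE, and PRIME are illustrative constants, not taken from
# the original code.
import csv
import random
import statistics
from datetime import datetime

NUM_HASHES = 16   # assumed number of hash functions
GROUP_SIZE = 4    # assumed group size for average-then-median combining
PRIME = 10007     # assumed modulus of the hash family


def hashFuncs():
    # One (a, b) pair per hash function: h(x) = (a * x + b) % PRIME.
    return [(random.randint(1, PRIME - 1), random.randint(0, PRIME - 1))
            for _ in range(NUM_HASHES)]


def trailing_zeros(n):
    # Number of trailing zero bits of n (defined as 0 for n == 0).
    if n == 0:
        return 0
    count = 0
    while n % 2 == 0:
        n //= 2
        count += 1
    return count


def Flajolet_Martin(rdd, hashParams, out_file):
    cities = rdd.distinct().collect()
    ground_truth = len(cities)

    # Track, per hash function, the maximum number of trailing zeros seen.
    max_zeros = [0] * len(hashParams)
    for city in cities:
        x = hash(city)
        for i, (a, b) in enumerate(hashParams):
            max_zeros[i] = max(max_zeros[i], trailing_zeros((a * x + b) % PRIME))

    # Each hash function estimates 2^R distinct items; average within groups,
    # then take the median of the group averages to reduce variance.
    estimates = [2 ** r for r in max_zeros]
    groups = [estimates[i:i + GROUP_SIZE]
              for i in range(0, len(estimates), GROUP_SIZE)]
    estimation = statistics.median(sum(g) / len(g) for g in groups)

    with open(out_file, "a") as fout:
        csv.writer(fout).writerow([datetime.now(), ground_truth, round(estimation)])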
def test_from_no_conf_constructor(self):
    self.ssc = StreamingContext(master=self.master, appName=self.appName,
                                duration=self.batachDuration)
    # The master could also be read via ssc.sparkContext.master;
    # the assertions below stay close to the Scala version of this test.
    self.assertEqual(self.ssc.sparkContext._conf.get("spark.master"), self.master)
    self.assertEqual(self.ssc.sparkContext._conf.get("spark.app.name"), self.appName)
def test_slice(self):
    """Basic operation test for DStream.slice."""
    import datetime as dt

    self.ssc = StreamingContext(self.sc, 1.0)
    self.ssc.remember(4.0)
    input = [[1], [2], [3], [4]]
    stream = self.ssc.queueStream(
        [self.sc.parallelize(d, 1) for d in input])

    time_vals = []

    def get_times(t, rdd):
        if rdd and len(time_vals) < len(input):
            time_vals.append(t)

    stream.foreachRDD(get_times)

    self.ssc.start()
    self.wait_for(time_vals, 4)
    begin_time = time_vals[0]

    def get_sliced(begin_delta, end_delta):
        begin = begin_time + dt.timedelta(seconds=begin_delta)
        end = begin_time + dt.timedelta(seconds=end_delta)
        rdds = stream.slice(begin, end)
        result_list = [rdd.collect() for rdd in rdds]
        return [r for result in result_list for r in result]

    self.assertEqual(set([1]), set(get_sliced(0, 0)))
    self.assertEqual(set([2, 3]), set(get_sliced(1, 2)))
    self.assertEqual(set([2, 3, 4]), set(get_sliced(1, 4)))
    self.assertEqual(set([1, 2, 3, 4]), set(get_sliced(0, 4)))
def setUp(self):
    super(DeleteFromCassandraStreamingTest, self).setUp()
    self.ssc = StreamingContext(self.sc, self.interval)
    self.rdds = [
        self.sc.parallelize(range(0, self.size)).map(lambda i: {
            'key': i,
            'int': i,
            'text': i
        })
    ]
    data = self.rdds[0]
    data.saveToCassandra(self.keyspace, self.table)

    # verify the RDD length and actual content
    data = self.rdd()
    self.assertEqual(len(data.collect()), self.size)

    # verify we actually have data for `text` and `int`
    row = data.select('text', 'int').where('key=?', '0').first()
    self.assertEqual(row.text, u'0')
    self.assertEqual(row.int, 0)

    # stream used in the tests
    self.stream = self.ssc.queueStream(self.rdds)
def setUp(self):
    class_name = self.__class__.__name__
    conf = SparkConf().set("spark.default.parallelism", 1)
    self.sc = SparkContext(appName=class_name, conf=conf)
    self.sc.setCheckpointDir("/tmp")
    # TODO: decrease duration to speed up tests
    self.ssc = StreamingContext(self.sc, self.duration)
def setup():
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 0.5)
    dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
    wc = dstream.updateStateByKey(updater)
    wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
    wc.checkpoint(0.5)
    return ssc
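# The updater function passed to updateStateByKey above is not defined in this
# snippet. A minimal sketch of the usual running-count form (the signature is
# dictated by updateStateByKey; the body is an assumption):
def updater(new_values, last_sum):
    # Add this batch's counts to the running total, treating a missing
    # previous state as zero.
    return sum(new_values) + (last_sum or 0)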
def _create_spark_context(self, spark_config, stream, stream_duration):
    if stream is True:
        # Reuse an existing SparkContext if one is already running.
        sc = SparkContext.getOrCreate(pyspark.SparkConf().setAll(spark_config.items()))
        self.streaming_context = StreamingContext(sc, stream_duration)
        self.spark = SparkSession(self.streaming_context.sparkContext)
    else:
        self.spark = SparkSession.builder \
            .config(conf=pyspark.SparkConf().setAll(spark_config.items())) \
            .enableHiveSupport().getOrCreate()
def setup():
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 0.5)

    # A function that cannot be serialized (it captures the SparkContext)
    def process(time, rdd):
        sc.parallelize(range(1, 10))

    ssc.textFileStream(inputd).foreachRDD(process)
    return ssc
def test_text_file_stream(self):
    d = tempfile.mkdtemp()
    self.ssc = StreamingContext(self.sc, self.duration)
    dstream2 = self.ssc.textFileStream(d).map(int)
    result = self._collect(dstream2, 2, block=False)
    self.ssc.start()
    for name in ('a', 'b'):
        time.sleep(1)
        with open(os.path.join(d, name), "w") as f:
            f.writelines(["%d\n" % i for i in range(10)])
    self.wait_for(result, 2)
    self.assertEqual([list(range(10)), list(range(10))], result)
def test_binary_records_stream(self):
    d = tempfile.mkdtemp()
    self.ssc = StreamingContext(self.sc, self.duration)
    dstream = self.ssc.binaryRecordsStream(d, 10).map(
        lambda v: struct.unpack("10b", bytes(v)))
    result = self._collect(dstream, 2, block=False)
    self.ssc.start()
    for name in ('a', 'b'):
        time.sleep(1)
        with open(os.path.join(d, name), "wb") as f:
            f.write(bytearray(range(10)))
    self.wait_for(result, 2)
    self.assertEqual([list(range(10)), list(range(10))],
                     [list(v[0]) for v in result])
def createSSC():
    # Create the StreamingContext
    conf = SparkConf()
    sc = SparkContext(master="local[*]", appName="CheckpointSample", conf=conf)
    ssc = StreamingContext(sc, 3)

    # Create the DStream
    ids1 = ssc.socketTextStream("127.0.0.1", 9000)
    ids2 = ids1.flatMap(lambda v: v.split(" ")).map(lambda v: (v, 1))

    # updateStateByKey
    ids2.updateStateByKey(updateFunc).pprint()

    # Enable checkpointing
    ssc.checkpoint("./checkPoints/checkPointSample/Python")

    return ssc
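# createSSC() is written as a setup function, so it is presumably meant to be
# handed to StreamingContext.getOrCreate, which recovers the context from the
# checkpoint directory on restart. A possible driver, assuming the same
# checkpoint path and an updateFunc defined elsewhere:
if __name__ == "__main__":
    ssc = StreamingContext.getOrCreate("./checkPoints/checkPointSample/Python", createSSC)
    ssc.start()
    ssc.awaitTermination()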
def test_get_active(self):
    self.assertEqual(StreamingContext.getActive(), None)

    # Verify that getActive() returns the active context
    self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
    self.ssc.start()
    self.assertEqual(StreamingContext.getActive(), self.ssc)

    # Verify that getActive() returns None after the context is stopped
    self.ssc.stop(False)
    self.assertEqual(StreamingContext.getActive(), None)

    # Verify that if the underlying Java context is stopped, getActive() returns None
    self.ssc = StreamingContext(self.sc, self.duration)
    self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
    self.ssc.start()
    self.assertEqual(StreamingContext.getActive(), self.ssc)
    self.ssc._jssc.stop(False)
    self.assertEqual(StreamingContext.getActive(), None)
def test_get_active_or_create(self):
    # Test StreamingContext.getActiveOrCreate() without checkpoint data
    # See CheckpointTests for tests with checkpoint data
    self.ssc = None
    self.assertEqual(StreamingContext.getActive(), None)

    def setupFunc():
        ssc = StreamingContext(self.sc, self.duration)
        ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.setupCalled = True
        return ssc

    # Verify that getActiveOrCreate() (w/o checkpoint) calls setupFunc when no context is active
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
    self.assertTrue(self.setupCalled)

    # Verify that getActiveOrCreate() returns the active context and does not call setupFunc
    self.ssc.start()
    self.setupCalled = False
    self.assertEqual(StreamingContext.getActiveOrCreate(None, setupFunc), self.ssc)
    self.assertFalse(self.setupCalled)

    # Verify that getActiveOrCreate() calls setupFunc after the active context is stopped
    self.ssc.stop(False)
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
    self.assertTrue(self.setupCalled)

    # Verify that if the underlying Java context is stopped, getActiveOrCreate() calls setupFunc again
    self.ssc = StreamingContext(self.sc, self.duration)
    self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
    self.ssc.start()
    self.assertEqual(StreamingContext.getActive(), self.ssc)
    self.ssc._jssc.stop(False)
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
    self.assertTrue(self.setupCalled)
def setUp(self):
    self.ssc = StreamingContext(self.sc, self.duration)
def setUp(self):
    class_name = self.__class__.__name__
    self.ssc = StreamingContext(appName=class_name, duration=Seconds(1))
from pyspark.sql import SparkSession
from pyspark.streaming.context import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

spark = SparkSession.builder.appName("test").master("local[*]").getOrCreate()
ssc = StreamingContext(spark.sparkContext, 10)

lines = KinesisUtils.createStream(
    ssc, "test", "test_s", "https://kinesis.eu-north-1.amazonaws.com",
    "eu-north-1", InitialPositionInStream.LATEST,
    awsAccessKeyId="AKIAJ5V6NEAI3YNTWGDA",
    awsSecretKey="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",
    checkpointInterval=2)
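# The snippet above only builds the Kinesis DStream; to actually consume the
# stream it would typically be followed by something like this (an assumed
# continuation, not part of the original):
lines.pprint()
ssc.start()
ssc.awaitTermination()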
import sys

from pyspark.streaming.context import StreamingContext
from pyspark.streaming.duration import *

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print >> sys.stderr, "Usage: wordcount <hostname> <port>"
        exit(-1)
    ssc = StreamingContext(appName="PythonStreamingNetworkWordCount",
                           duration=Seconds(1))

    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a + b)
    counts.pyprint()

    ssc.start()
    ssc.awaitTermination()
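# The snippet above uses the early experimental Python streaming API (the
# Seconds duration wrapper, pyprint, and Python 2 print syntax). With the
# StreamingContext(sc, batchDuration) constructor used elsewhere in this
# section, the same word count would look roughly like this (a sketch, not
# part of the original):
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: wordcount <hostname> <port>", file=sys.stderr)
        sys.exit(-1)
    sc = SparkContext(appName="PythonStreamingNetworkWordCount")
    ssc = StreamingContext(sc, 1)  # 1-second batches

    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()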
def test_from_no_conf_plus_spark_home_plus_env(self):
    self.ssc = StreamingContext(master=self.master, appName=self.appName,
                                sparkHome=self.sparkHome, environment=self.envPair,
                                duration=self.batachDuration)
    self.assertEqual(self.ssc.sparkContext._conf.get("spark.executorEnv.key"),
                     self.envPair["key"])
def test_from_existing_spark_context(self):
    self.sc = SparkContext(master=self.master, appName=self.appName)
    self.ssc = StreamingContext(sparkContext=self.sc, duration=self.batachDuration)
# Section 6.2.3
from pyspark import SparkContext, SparkConf
from pyspark.streaming.context import StreamingContext

conf = SparkConf()
sc = SparkContext(master="local[*]", appName="QueueSample", conf=conf)
ssc = StreamingContext(sc, 3)

rdd1 = sc.parallelize(["a", "b", "c"])
rdd2 = sc.parallelize(["c", "d", "e"])
queue = [rdd1, rdd2]

ds = ssc.queueStream(queue)
ds.pprint()

ssc.start()
ssc.awaitTermination()
from pyspark import SparkContext
from pyspark.streaming.context import StreamingContext

sc = SparkContext()
# batchDuration of 1: all data that arrives within a 1-second window goes
# into a single batch.
ssc = StreamingContext(sc, 1)

# A stream is a sequence of RDDs.
rddData = [sc.parallelize([i]) for i in range(1000)]
inStream = ssc.queueStream(rddData)

filter1 = inStream.filter(lambda x: x % 2 == 0)
filter2 = filter1.filter(lambda x: x % 3 == 0)
collected = filter2
collected.pprint()

ssc.start()  # triggers the stream to run until ssc.stop() is called
# > 0
# >
# >
# >
# >
# >
# > 6
# >
# >
# >
# >
# > ... any multiples of 6 ...
ssc.stop()
def test_existing_spark_context_with_settings(self):
    conf = SparkConf()
    conf.set("spark.cleaner.ttl", "10")
    self.sc = SparkContext(master=self.master, appName=self.appName, conf=conf)
    self.ssc = StreamingContext(sparkContext=self.sc, duration=self.batachDuration)
    self.assertEqual(int(self.ssc.sparkContext._conf.get("spark.cleaner.ttl")), 10)
def setupFunc():
    ssc = StreamingContext(self.sc, self.duration)
    ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
    self.setupCalled = True
    return ssc
def test_from_no_conf_plus_spark_home(self):
    self.ssc = StreamingContext(master=self.master, appName=self.appName,
                                sparkHome=self.sparkHome, duration=self.batachDuration)
    self.assertEqual(self.ssc.sparkContext._conf.get("spark.home"), self.sparkHome)
def __init__(self, sc, spark):
    self.sc = sc
    self.spark = spark
    self.ssc = StreamingContext(self.sc, 1)
def setUpClass(cls):
    super(StreamingTest, cls).setUpClass()
    cls.ssc = StreamingContext(cls.sc, cls.interval)