def test_schema_dstream(self):
    """Stream (int, text) tuples into a column table via a schema DStream.

    Three queued RDDs covering the ranges 1-9, 10-19 and 20-29 are wrapped
    in a schema DStream, and every resulting DataFrame is appended to the
    ``streamingExample`` table. After one second the current row count is
    printed with ``show()``; nothing is asserted about that count.

    The streaming context created here is local to the test (it is not
    stored on ``self``), so it is stopped explicitly before returning.
    """
    def rddList(start, end):
        # RDD of (i, "Text<i>") pairs for i in [start, end).
        return self.sc.parallelize(range(start, end)).map(
            lambda i: (i, "Text" + str(i)))

    def saveFunction(df):
        # Append each micro-batch DataFrame to the column table.
        df.write.format("column").mode("append").saveAsTable(
            "streamingExample")

    schema = StructType([
        StructField("loc", IntegerType()),
        StructField("text", StringType()),
    ])

    snsc = SnappyStreamingContext(self.sc, 1)
    dstream = snsc.queueStream(
        [rddList(1, 10), rddList(10, 20), rddList(20, 30)])

    # Recreate the target table so earlier runs do not affect this one.
    # NOTE(review): the second dropTable argument is presumably "if exists"
    # -- confirm against the SnappyContext API.
    snsc._snappycontext.dropTable("streamingExample", True)
    snsc._snappycontext.createTable("streamingExample", "column", schema)

    schemadstream = snsc.createSchemaDStream(dstream, schema)
    schemadstream.foreachDataFrame(saveFunction)

    snsc.start()
    try:
        time.sleep(1)
        snsc.sql("select count(*) from streamingExample").show()
    finally:
        # Stop only the streaming context; self.sc is left running for the
        # rest of the test case (same stop(False) call the other tests use).
        snsc.stop(False)
def setup():
    """Build a streaming context that keeps a running per-line state.

    Lines read from ``inputd`` are keyed with a count of 1, folded through
    ``updater`` with ``updateStateByKey``, formatted as ``"<key>,<value>"``
    and written as text files under the ``outputd + "test"`` prefix. The
    stateful stream is checkpointed with an interval of 0.5.

    Sets ``self.setupCalled`` so the caller can tell this factory ran.

    Returns:
        The newly built (not yet started) SnappyStreamingContext.
    """
    spark_conf = SparkConf().set("spark.default.parallelism", 1)
    spark_ctx = SparkContext(conf=spark_conf)
    streaming_ctx = SnappyStreamingContext(spark_ctx, 0.5)

    keyed_lines = streaming_ctx.textFileStream(inputd).map(
        lambda line: (line, 1))
    running_state = keyed_lines.updateStateByKey(updater)

    formatted = running_state.map(lambda pair: "%s,%d" % pair)
    formatted.saveAsTextFiles(outputd + "test")
    running_state.checkpoint(.5)

    self.setupCalled = True
    return streaming_ctx
def setup():
    """Build a streaming context whose per-batch callback cannot be serialized.

    Returns:
        A new (not yet started) SnappyStreamingContext reading text files
        from ``inputd`` with ``process`` registered via ``foreachRDD``.
    """
    spark_conf = SparkConf().set("spark.default.parallelism", 1)
    spark_ctx = SparkContext(conf=spark_conf)
    streaming_ctx = SnappyStreamingContext(spark_ctx, 0.5)

    # A function that cannot be serialized
    # (presumably because it closes over the SparkContext -- TODO confirm).
    def process(batch_time, rdd):
        spark_ctx.parallelize(range(1, 10))

    file_lines = streaming_ctx.textFileStream(inputd)
    file_lines.foreachRDD(process)
    return streaming_ctx
def test_text_file_stream(self):
    """Check that textFileStream picks up files created after start().

    Two files, each holding the integers 0-9 one per line, are written into
    a fresh temporary directory one second apart. The stream parses every
    line with ``int``, and the two collected batches must each equal
    ``list(range(10))``.
    """
    import shutil

    d = tempfile.mkdtemp()
    # Remove the scratch directory once the test is over. unittest runs
    # cleanups after tearDown, so the directory outlives the running stream
    # (assuming tearDown is where self.ssc gets stopped -- not visible here).
    self.addCleanup(shutil.rmtree, d, True)

    self.ssc = SnappyStreamingContext(self.sc, self.duration)
    dstream2 = self.ssc.textFileStream(d).map(int)
    result = self._collect(dstream2, 2, block=False)
    self.ssc.start()

    for name in ('a', 'b'):
        # Sleep before each write, as the original test does, so the two
        # files are created one second apart after the stream has started.
        time.sleep(1)
        with open(os.path.join(d, name), "w") as f:
            f.writelines(["%d\n" % i for i in range(10)])

    self.wait_for(result, 2)
    self.assertEqual([list(range(10)), list(range(10))], result)
def test_binary_records_stream(self):
    """Check that binaryRecordsStream reads fixed-length 10-byte records.

    Two files, each containing the bytes 0-9, are written into a fresh
    temporary directory one second apart. Every record is unpacked with the
    ``"10b"`` struct format, and the first record of each of the two
    collected batches must equal ``list(range(10))``.
    """
    import shutil

    d = tempfile.mkdtemp()
    # Remove the scratch directory once the test is over. unittest runs
    # cleanups after tearDown, so the directory outlives the running stream
    # (assuming tearDown is where self.ssc gets stopped -- not visible here).
    self.addCleanup(shutil.rmtree, d, True)

    self.ssc = SnappyStreamingContext(self.sc, self.duration)
    dstream = self.ssc.binaryRecordsStream(d, 10).map(
        lambda v: struct.unpack("10b", bytes(v)))
    result = self._collect(dstream, 2, block=False)
    self.ssc.start()

    for name in ('a', 'b'):
        # Sleep before each write, as the original test does, so the two
        # files are created one second apart after the stream has started.
        time.sleep(1)
        with open(os.path.join(d, name), "wb") as f:
            f.write(bytearray(range(10)))

    self.wait_for(result, 2)
    # Each batch holds one unpacked tuple; compare its contents as a list.
    self.assertEqual([list(range(10)), list(range(10))],
                     [list(v[0]) for v in result])
def test_get_active(self):
    """Exercise SnappyStreamingContext.getActive() across start and stop.

    getActive() must return None before any context is started, the started
    context while it runs, and None again after the context is stopped,
    whether through the Python wrapper or directly on the Java context.
    """
    def start_counting_stream(ssc):
        # Register a trivial output operation, then start the context.
        ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        ssc.start()

    self.assertEqual(SnappyStreamingContext.getActive(), None)

    # A started context is reported as the active one.
    start_counting_stream(self.ssc)
    self.assertEqual(SnappyStreamingContext.getActive(), self.ssc)

    # Stopping through the Python API clears the active context.
    self.ssc.stop(False)
    self.assertEqual(SnappyStreamingContext.getActive(), None)

    # Stopping the underlying Java context directly must also make
    # getActive() return None.
    self.ssc = SnappyStreamingContext(self.sc, self.duration)
    start_counting_stream(self.ssc)
    self.assertEqual(SnappyStreamingContext.getActive(), self.ssc)
    self.ssc._jssc.stop(False)
    self.assertEqual(SnappyStreamingContext.getActive(), None)
def test_get_active_or_create(self):
    """Exercise SnappyStreamingContext.getActiveOrCreate() without checkpoint data.

    The setup function must be called when no context is active, skipped
    while a started context exists, and called again once that context has
    been stopped, either through the Python wrapper or directly on the Java
    context. (See CheckpointTests for tests with checkpoint data.)
    """
    def setupFunc():
        created = SnappyStreamingContext(self.sc, self.duration)
        created.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.setupCalled = True
        return created

    def active_or_create():
        # Clear the flag first so each check observes only this call.
        self.setupCalled = False
        return SnappyStreamingContext.getActiveOrCreate(None, setupFunc)

    self.ssc = None
    self.assertEqual(SnappyStreamingContext.getActive(), None)

    # No context is active: setupFunc must be called.
    self.ssc = active_or_create()
    self.assertTrue(self.setupCalled)

    # A context is active: it is returned and setupFunc is not called.
    self.ssc.start()
    self.assertEqual(active_or_create(), self.ssc)
    self.assertFalse(self.setupCalled)

    # The active context was stopped: setupFunc is called again.
    self.ssc.stop(False)
    self.ssc = active_or_create()
    self.assertTrue(self.setupCalled)

    # The Java context was stopped directly: getActiveOrCreate() must
    # again fall back to setupFunc.
    self.ssc = SnappyStreamingContext(self.sc, self.duration)
    self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
    self.ssc.start()
    self.assertEqual(SnappyStreamingContext.getActive(), self.ssc)
    self.ssc._jssc.stop(False)
    self.ssc = active_or_create()
    self.assertTrue(self.setupCalled)
def setUp(self):
    """Create the SnappyStreamingContext that is stored as ``self.ssc``.

    The context is built on ``self.sc`` with ``self.duration`` as its second
    constructor argument (presumably the batch duration -- confirm against
    the SnappyStreamingContext API).
    """
    streaming_ctx = SnappyStreamingContext(self.sc, self.duration)
    self.ssc = streaming_ctx
def setupFunc():
    """Create a streaming context with one trivial output operation.

    Sets ``self.setupCalled`` so the caller can tell this factory ran.

    Returns:
        A new (not yet started) SnappyStreamingContext built on ``self.sc``.
    """
    created = SnappyStreamingContext(self.sc, self.duration)
    single_batch = created.queueStream([[1]])
    single_batch.foreachRDD(lambda rdd: rdd.count())
    self.setupCalled = True
    return created