def test_get_active_or_create(self):
    # Test StreamingContext.getActiveOrCreate() without checkpoint data
    # See CheckpointTests for tests with checkpoint data
    self.ssc = None
    self.assertEqual(StreamingContext.getActive(), None)

    def setupFunc():
        ssc = StreamingContext(self.sc, self.duration)
        ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.setupCalled = True
        return ssc

    # Verify that getActiveOrCreate() (w/o checkpoint) calls setupFunc when no context is active
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
    self.assertTrue(self.setupCalled)

    # Verify that getActiveOrCreate() returns active context and does not call the setupFunc
    self.ssc.start()
    self.setupCalled = False
    self.assertEqual(StreamingContext.getActiveOrCreate(None, setupFunc), self.ssc)
    self.assertFalse(self.setupCalled)

    # Verify that getActiveOrCreate() calls setupFunc after active context is stopped
    self.ssc.stop(False)
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
    self.assertTrue(self.setupCalled)

    # Verify that if the Java context is stopped, then getActive() returns None
    self.ssc = StreamingContext(self.sc, self.duration)
    self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
    self.ssc.start()
    self.assertEqual(StreamingContext.getActive(), self.ssc)
    self.ssc._jssc.stop(False)
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
    self.assertTrue(self.setupCalled)
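
Outside the test harness, the behaviour pinned down above reduces to a small pattern. A minimal sketch, assuming an existing SparkContext named sc (build_context is a placeholder name, not part of the test):

from pyspark.streaming import StreamingContext

def build_context():
    # Called only when no StreamingContext is currently active.
    ssc = StreamingContext(sc, 1)  # assumes an existing SparkContext `sc`
    ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
    return ssc

# With a None checkpoint path there is no recovery step: getActiveOrCreate()
# returns the active context if one is running, otherwise it calls build_context().
ssc = StreamingContext.getActiveOrCreate(None, build_context)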
def my_main(source_dir, monitoring_dir, checkpoint_dir, result_dir,
            max_micro_batches, time_step_interval, verbose, percentage_f,
            window_duration, sliding_duration, race_conditions_extra_delay):
    # 1. We set up the Spark Streaming Context.
    # This sets up the computation that will be done when the system receives data.
    ssc = StreamingContext.getActiveOrCreate(checkpoint_dir,
                                             lambda: create_ssc(monitoring_dir,
                                                                result_dir,
                                                                max_micro_batches,
                                                                time_step_interval,
                                                                percentage_f,
                                                                window_duration,
                                                                sliding_duration))

    # 2. We start the Spark Streaming Context in the background to start receiving data.
    # Spark Streaming will start scheduling Spark jobs in a separate thread.
    # Very important: please note that a Streaming Context can be started only once.
    # Moreover, it must be started only once we have fully specified what we want it to do
    # when it receives data (i.e., the full set of transformations and output operations
    # we want it to perform).
    ssc.start()

    # 3. As the jobs run in a separate thread, to keep our application (this thread) from exiting,
    # we need to call awaitTermination to wait for the streaming computation to finish.
    ssc.awaitTerminationOrTimeout(time_step_interval)

    if race_conditions_extra_delay:
        time.sleep((sliding_duration - 1) * time_step_interval)

    # 4. We simulate the streaming arrival of files (i.e., one by one) from source_dir to monitoring_dir.
    streaming_simulation(source_dir, monitoring_dir, time_step_interval, verbose)

    # 5. Once we have transferred all files and processed them, we are done.
    # Thus, we stop the Spark Streaming Context.
    ssc.stop(stopSparkContext=False)

    # 6. Extra safety stop: it acts directly on the Java Virtual Machine,
    # in case the Spark Streaming Context was not fully stopped.
    # This is crucial to avoid leaving a Spark application running in the background.
    # For example, Databricks, in its paid version, charges per cluster node (virtual machine)
    # and per hour of computation. If we unintentionally leave a Spark application running,
    # we can end up with an unexpectedly high bill.
    if not sc._jvm.StreamingContext.getActive().isEmpty():
        sc._jvm.StreamingContext.getActive().get().stop(False)
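
The call above depends on a create_ssc factory and a streaming_simulation helper defined elsewhere. As a minimal sketch of what the factory might look like (everything below is an assumption about its shape, not the author's actual code), it creates the StreamingContext from the existing SparkContext, declares a file-based input DStream with a windowed computation and an output operation, and returns it:

# Hypothetical sketch of the create_ssc factory assumed by my_main above.
# The actual transformations depend on the exercise; only the overall shape matters.
from pyspark.streaming import StreamingContext

def create_ssc(monitoring_dir, result_dir, max_micro_batches, time_step_interval,
               percentage_f, window_duration, sliding_duration):
    # One micro-batch every time_step_interval seconds, reusing the existing SparkContext `sc`.
    ssc = StreamingContext(sc, time_step_interval)

    # Input DStream: new text files appearing in monitoring_dir.
    input_dstream = ssc.textFileStream(monitoring_dir)

    # Placeholder computation over a sliding window (durations assumed to be given
    # in micro-batches, hence the multiplication by time_step_interval).
    windowed = input_dstream.window(window_duration * time_step_interval,
                                    sliding_duration * time_step_interval)
    windowed.count().saveAsTextFiles(result_dir + "my_result")

    # max_micro_batches and percentage_f would drive the real computation (omitted here).
    # The real factory would also call ssc.checkpoint(...) so that getActiveOrCreate
    # can later recover the context from checkpoint data.
    return ssc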
def my_main(source_dir, monitoring_dir, checkpoint_dir, result_dir,
            max_micro_batches, time_step_interval, verbose, percentage_f):
    ssc = StreamingContext.getActiveOrCreate(
        checkpoint_dir,
        lambda: create_ssc(monitoring_dir, result_dir, max_micro_batches,
                           time_step_interval, percentage_f))

    ssc.start()
    ssc.awaitTerminationOrTimeout(time_step_interval)

    streaming_simulation(source_dir, monitoring_dir, time_step_interval, verbose)

    ssc.stop(stopSparkContext=False)

    if not sc._jvm.StreamingContext.getActive().isEmpty():
        sc._jvm.StreamingContext.getActive().get().stop(False)
def test_get_or_create_and_get_active_or_create(self):
    inputd = tempfile.mkdtemp()
    outputd = tempfile.mkdtemp() + "/"

    def updater(vs, s):
        return sum(vs, s or 0)

    def setup():
        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(conf=conf)
        ssc = StreamingContext(sc, 2)
        dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
        wc = dstream.updateStateByKey(updater)
        wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
        wc.checkpoint(2)
        self.setupCalled = True
        return ssc

    # Verify that getOrCreate() calls setup() in absence of checkpoint files
    self.cpd = tempfile.mkdtemp("test_streaming_cps")
    self.setupCalled = False
    self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
    self.assertTrue(self.setupCalled)

    self.ssc.start()

    def check_output(n):
        while not os.listdir(outputd):
            if self.ssc.awaitTerminationOrTimeout(0.5):
                raise Exception("ssc stopped")
        time.sleep(1)  # make sure mtime is larger than the previous one
        with open(os.path.join(inputd, str(n)), 'w') as f:
            f.writelines(["%d\n" % i for i in range(10)])

        while True:
            if self.ssc.awaitTerminationOrTimeout(0.5):
                raise Exception("ssc stopped")

            p = os.path.join(outputd, max(os.listdir(outputd)))
            if '_SUCCESS' not in os.listdir(p):
                # not finished
                continue
            ordd = self.ssc.sparkContext.textFile(p).map(lambda line: line.split(","))
            d = ordd.values().map(int).collect()
            if not d:
                continue
            self.assertEqual(10, len(d))
            s = set(d)
            self.assertEqual(1, len(s))
            m = s.pop()
            if n > m:
                continue
            self.assertEqual(n, m)
            break

    check_output(1)
    check_output(2)

    # Verify that getOrCreate() recovers from checkpoint files
    self.ssc.stop(True, True)
    time.sleep(1)
    self.setupCalled = False
    self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)
    self.ssc.start()
    check_output(3)

    # Verify that getOrCreate() uses existing SparkContext
    self.ssc.stop(True, True)
    time.sleep(1)
    self.sc = SparkContext(conf=SparkConf())
    self.setupCalled = False
    self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)
    self.assertTrue(self.ssc.sparkContext == self.sc)

    # Verify that getActiveOrCreate() recovers from checkpoint files
    self.ssc.stop(True, True)
    time.sleep(1)
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)
    self.ssc.start()
    check_output(4)

    # Verify that getActiveOrCreate() returns active context
    self.setupCalled = False
    self.assertEqual(StreamingContext.getActiveOrCreate(self.cpd, setup), self.ssc)
    self.assertFalse(self.setupCalled)

    # Verify that getActiveOrCreate() uses existing SparkContext
    self.ssc.stop(True, True)
    time.sleep(1)
    self.sc = SparkContext(conf=SparkConf())
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)
    self.assertTrue(self.ssc.sparkContext == self.sc)

    # Verify that getActiveOrCreate() calls setup() in absence of checkpoint files
    self.ssc.stop(True, True)
    shutil.rmtree(self.cpd)  # delete checkpoint directory
    time.sleep(1)
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup)
    self.assertTrue(self.setupCalled)

    # Stop everything
    self.ssc.stop(True, True)
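
The test above exercises the checkpoint-recovery contract of both factory methods. Stripped of the assertions, it amounts to the following sketch (checkpoint_dir and create_context are placeholder names; sc is assumed to be an existing SparkContext):

from pyspark.streaming import StreamingContext

checkpoint_dir = "/tmp/streaming_checkpoint"  # placeholder path

def create_context():
    # Only invoked when the context cannot be obtained any other way.
    ssc = StreamingContext(sc, 2)      # assumes an existing SparkContext `sc`
    ssc.checkpoint(checkpoint_dir)     # enable checkpointing so recovery works later
    # ... declare DStreams, transformations and output operations here ...
    return ssc

# getOrCreate: recover from checkpoint_dir if possible, otherwise call create_context().
ssc = StreamingContext.getOrCreate(checkpoint_dir, create_context)

# getActiveOrCreate: first return the currently active context if one is running,
# then fall back to checkpoint recovery, and only then call create_context().
ssc = StreamingContext.getActiveOrCreate(checkpoint_dir, create_context)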
        #paxRecordsTable.foreachRDD(processTable)   #save to permanent table periodically
        kinesisStream.foreachRDD(processKinesisPax)
    except Exception as e:
        LogToKinesis("mainLoop", "EXCEPTION", str(e))

    ssc.checkpoint(CHECKPOINTDIR)
    return ssc

# COMMAND ----------

# # Start streaming
# try:
#     ssc = StreamingContext.getActiveOrCreate(CHECKPOINTDIR, creatingfunc)
#     ssc.start()
#     ssc.awaitTerminationOrTimeout(2*batchIntervalSeconds)
# except Exception as e:
#     LogToKinesis("MAIN", "EXCEPTION", str(e))

# COMMAND ----------

#dbutils.fs.ls(CHECKPOINTDIR)

# COMMAND ----------

ssc = StreamingContext.getActiveOrCreate(CHECKPOINTDIR, creatingfunc)
ssc.start()
ssc.awaitTerminationOrTimeout(2 * batchIntervalSeconds)

# COMMAND ----------
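
The example above starts in the middle of creatingfunc: the dangling except implies an enclosing try. A possible skeleton of the complete factory, reconstructed from the fragment (the stream setup and the batch-interval line are assumptions, not the notebook's actual code):

def creatingfunc():
    # Build a fresh StreamingContext; sc and batchIntervalSeconds are assumed
    # to be defined in earlier notebook cells.
    ssc = StreamingContext(sc, batchIntervalSeconds)
    try:
        # ... create kinesisStream and its per-batch processing here ...
        #paxRecordsTable.foreachRDD(processTable)   #save to permanent table periodically
        kinesisStream.foreachRDD(processKinesisPax)
    except Exception as e:
        LogToKinesis("mainLoop", "EXCEPTION", str(e))

    ssc.checkpoint(CHECKPOINTDIR)
    return ssc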