import csv
import json

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

# BATCH_DURATION, WINDOW_LENGTH, SLIDING_INTERVAL, hashFuncs and
# Flajolet_Martin are defined elsewhere in this module.


def main(argv):
    # Parse the socket port and output file path from the command line.
    port = int(argv[0])
    out_file = argv[1]

    conf = SparkConf().setMaster("local[*]") \
        .setAppName("Flajolet-Martin") \
        .set("spark.executor.memory", "4g") \
        .set("spark.driver.memory", "4g")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("OFF")

    ssc = StreamingContext(sc, BATCH_DURATION)
    stream = ssc.socketTextStream("localhost", port) \
        .window(WINDOW_LENGTH, SLIDING_INTERVAL) \
        .map(lambda x: json.loads(x))

    hashParams = hashFuncs()

    # Truncate the output file and write the CSV header; the `with` block
    # closes the file, so no explicit close() is needed.
    with open(out_file, "w") as fout:
        output = csv.writer(fout)
        output.writerow(["Time", "Ground Truth", "Estimation"])

    stream.map(lambda x: x["city"]).filter(lambda x: x != "") \
        .foreachRDD(lambda rdd: Flajolet_Martin(rdd, hashParams, out_file))

    ssc.start()
    ssc.awaitTermination()
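# --- Hypothetical helpers ---------------------------------------------------
# hashFuncs() and Flajolet_Martin() are referenced above but defined elsewhere
# in the project. The sketch below is one plausible shape for them, assuming
# hash functions of the form h(x) = (a*x + b) mod p and a median-of-group-means
# combination of trailing-zero estimates; NUM_HASHES, GROUP_SIZE, PRIME and the
# grouping strategy are illustrative assumptions, not the original code.
import binascii
import random
import statistics
from datetime import datetime

NUM_HASHES = 16
GROUP_SIZE = 4
PRIME = 2 ** 31 - 1


def hashFuncs():
    # One random (a, b) pair per hash function.
    return [(random.randint(1, PRIME - 1), random.randint(0, PRIME - 1))
            for _ in range(NUM_HASHES)]


def trailing_zeros(n):
    # Number of trailing zero bits in n (0 for n == 0 by convention here).
    if n == 0:
        return 0
    count = 0
    while n % 2 == 0:
        n //= 2
        count += 1
    return count


def Flajolet_Martin(rdd, hashParams, out_file):
    cities = rdd.distinct().collect()
    ground_truth = len(cities)
    estimates = []
    for a, b in hashParams:
        max_tz = 0
        for city in cities:
            x = int(binascii.hexlify(city.encode("utf8")), 16)
            max_tz = max(max_tz, trailing_zeros((a * x + b) % PRIME))
        estimates.append(2 ** max_tz)
    # Median of group means makes the estimate robust to outlier hashes.
    groups = [statistics.mean(estimates[i:i + GROUP_SIZE])
              for i in range(0, NUM_HASHES, GROUP_SIZE)]
    estimation = statistics.median(groups)
    with open(out_file, "a") as fout:
        csv.writer(fout).writerow(
            [datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
             ground_truth, round(estimation)])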
# Section 6.2.4, Example 6-12
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

## When running inside the pyspark shell, do not create a SparkContext yourself!
# ./pyspark --packages org.apache.spark:spark-streaming-kafka-0-8-assembly_2.11:2.0.2

conf = SparkConf()
sc = SparkContext(master="local[*]", appName="KafkaSample", conf=conf)
ssc = StreamingContext(sc, 3)

# Receiver-based stream: connects through ZooKeeper with a consumer group,
# reading the "test" topic with 3 receiver threads.
ds1 = KafkaUtils.createStream(ssc, "localhost:2181", "test-consumer-group1", {"test": 3})
# Direct (receiver-less) stream: connects straight to the Kafka brokers.
ds2 = KafkaUtils.createDirectStream(ssc, ["test"], {"metadata.broker.list": "localhost:9092"})

ds1.pprint()
ds2.pprint()

ssc.start()
ssc.awaitTermination()
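# Both DStreams yield (key, value) message pairs, so a typical next step is to
# work on the message value. If placed before ssc.start() above, a minimal
# word-count sketch over the direct stream would look like this (illustrative
# only; not part of the book example):
counts = ds2.map(lambda kv: kv[1]) \
    .flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b)
counts.pprint()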
# Section 6.2.3
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

conf = SparkConf()
sc = SparkContext(master="local[*]", appName="QueueSample", conf=conf)
ssc = StreamingContext(sc, 3)

rdd1 = sc.parallelize(["a", "b", "c"])
rdd2 = sc.parallelize(["c", "d", "e"])
queue = [rdd1, rdd2]

ds = ssc.queueStream(queue)
ds.pprint()

ssc.start()
ssc.awaitTermination()
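# queueStream pops one RDD from the queue per batch interval by default
# (oneAtATime=True), so the stream above emits rdd1 in the first batch and
# rdd2 in the second. Defined before ssc.start(), a variant that instead feeds
# every queued RDD into each batch would be (a sketch using the standard
# oneAtATime flag):
ds_all = ssc.queueStream(queue, oneAtATime=False)
ds_all.pprint()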
from pyspark.streaming import StreamingContext

# SimpleTypesTestBase (defined alongside these tests) is assumed to set up the
# SparkContext, keyspace and table; importing pyspark_cassandra is what
# attaches saveToCassandra/deleteFromCassandra to RDDs and DStreams.


class DeleteFromCassandraStreamingTest(SimpleTypesTestBase):
    size = 10
    interval = .1

    def setUp(self):
        super(DeleteFromCassandraStreamingTest, self).setUp()
        self.ssc = StreamingContext(self.sc, self.interval)
        self.rdds = [
            self.sc.parallelize(range(0, self.size)).map(
                lambda i: {'key': i, 'int': i, 'text': i})
        ]
        data = self.rdds[0]
        data.saveToCassandra(self.keyspace, self.table)

        # verify the RDD length and actual content
        data = self.rdd()
        self.assertEqual(len(data.collect()), self.size)

        # verify we actually have data for `text` and `int`
        row = data.select('text', 'int').where('key=?', '0').first()
        self.assertEqual(row.text, u'0')
        self.assertEqual(row.int, 0)

        # stream we will use in tests.
        self.stream = self.ssc.queueStream(self.rdds)

    def test_delete_single_column(self):
        self.stream \
            .deleteFromCassandra(self.keyspace, self.table,
                                 deleteColumns=['text'])
        self.ssc.start()
        self.ssc.awaitTermination((self.size + 1) * self.interval)
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

        data = self.rdd()
        self.assertEqual(len(data.collect()), self.size)
        # only `text` was deleted; `int` must survive
        row = data.select('text', 'int').where('key=?', '0').first()
        self.assertEqual(row.int, 0)
        self.assertIsNone(row.text)

    def test_delete_2_columns(self):
        self.stream \
            .deleteFromCassandra(self.keyspace, self.table,
                                 deleteColumns=['text', 'int'])
        self.ssc.start()
        self.ssc.awaitTermination((self.size + 1) * self.interval)
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

        data = self.rdd()
        self.assertEqual(len(data.collect()), self.size)
        # both non-key columns should now be null
        row = data.select('text', 'int').where('key=?', '0').first()
        self.assertIsNone(row.int)
        self.assertIsNone(row.text)

    def test_delete_all_rows_default(self):
        # without deleteColumns/keyColumns, whole rows are deleted
        self.stream.deleteFromCassandra(self.keyspace, self.table)
        self.ssc.start()
        self.ssc.awaitTermination((self.size + 1) * self.interval)
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)
        data = self.rdd()
        self.assertEqual(len(data.collect()), 0)

    def test_delete_all_rows_explicit(self):
        # deleting by explicit key columns is equivalent to the default
        self.stream.deleteFromCassandra(self.keyspace, self.table,
                                        keyColumns=['key'])
        self.ssc.start()
        self.ssc.awaitTermination((self.size + 1) * self.interval)
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)
        data = self.rdd()
        self.assertEqual(len(data.collect()), 0)
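# self.rdd() above comes from SimpleTypesTestBase. A plausible sketch of that
# helper, assuming the pyspark_cassandra API (cassandraTable on the
# SparkContext); the real base class may differ:
def rdd(self):
    # read the test table back as a CassandraRDD
    return self.sc.cassandraTable(self.keyspace, self.table)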