Example #1
 def test_stop_only_streaming_context(self):
     self.sc = SparkContext(master=self.master, appName=self.appName)
     self.ssc = StreamingContext(sparkContext=self.sc, duration=self.batachDuration)
     self._addInputStream(self.ssc)
     self.ssc.start()
     self.ssc.stop(False)
     self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5)
Example #2
 def test_stop_multiple_times(self):
     self.ssc = StreamingContext(master=self.master, appName=self.appName,
                            duration=self.batachDuration)
     self._addInputStream(self.ssc)
     self.ssc.start()
     self.ssc.stop()
     self.ssc.stop()
Example #3
File: tests.py Project: zyjibmcn/spark
    def _writeAndVerify(self, ports):
        # Set up the streaming context and input streams
        ssc = StreamingContext(self.sc, self.duration)
        try:
            addresses = [("localhost", port) for port in ports]
            dstream = FlumeUtils.createPollingStream(
                ssc,
                addresses,
                maxBatchSize=self._utils.eventsPerBatch(),
                parallelism=5)
            outputBuffer = []

            def get_output(_, rdd):
                for e in rdd.collect():
                    outputBuffer.append(e)

            dstream.foreachRDD(get_output)
            ssc.start()
            self._utils.sendDatAndEnsureAllDataHasBeenReceived()

            self.wait_for(outputBuffer, self._utils.getTotalEvents())
            outputHeaders = [event[0] for event in outputBuffer]
            outputBodies = [event[1] for event in outputBuffer]
            self._utils.assertOutput(outputHeaders, outputBodies)
        finally:
            ssc.stop(False)
Example #4
 def test_from_conf_with_settings(self):
     conf = SparkConf()
     conf.set("spark.cleaner.ttl", "10")
     conf.setMaster(self.master)
     conf.setAppName(self.appName)
     self.ssc = StreamingContext(conf=conf, duration=self.batachDuration)
     self.assertEqual(int(self.ssc.sparkContext._conf.get("spark.cleaner.ttl")), 10)
Example #5
def main(argv):
    port = 9999
    out_file = "myout2"

    port = int(argv[0])
    out_file = argv[1]

    # create/truncate the output file
    with open(out_file, "w"):
        pass

    conf = SparkConf().setMaster("local[*]") \
        .setAppName("Flajolet-Martin") \
        .set("spark.executor.memory", "4g") \
        .set("spark.driver.memory", "4g")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("OFF")

    ssc = StreamingContext(sc, BATCH_DURATION)
    stream = ssc.socketTextStream("localhost", port) \
        .window(WINDOW_LENGTH, SLIDING_INTERVAL) \
        .map(lambda x: json.loads(x))

    hashParams = hashFuncs()

    # write the CSV header row
    with open(out_file, 'a') as fout:
        output = csv.writer(fout)
        output.writerow(["Time", "Ground Truth", "Estimation"])

    stream.map(lambda x: x["city"]).filter(lambda x: x != "") \
        .foreachRDD(lambda rdd: Flajolet_Martin(rdd, hashParams, out_file))

    ssc.start()
    ssc.awaitTermination()
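
The helpers hashFuncs() and Flajolet_Martin() called above are defined elsewhere in that project. As a rough, hypothetical sketch (not taken from the original code), hashFuncs() could return (a, b, p) parameters for a family of linear hash functions, and Flajolet_Martin() could estimate the number of distinct values in each windowed RDD from the maximum number of trailing zero bits of the hashed items:

# Hypothetical implementations of the helpers used in Example #5.
import csv
import random
from datetime import datetime

def hashFuncs(num_hashes=16, p=2 ** 31 - 1):
    # one (a, b, p) triple per hash function: h(x) = (a * x + b) % p
    random.seed(42)
    return [(random.randint(1, p - 1), random.randint(0, p - 1), p)
            for _ in range(num_hashes)]

def trailing_zeros(n):
    # count trailing zero bits; define tz(0) = 0 to keep the estimate finite
    if n == 0:
        return 0
    count = 0
    while n & 1 == 0:
        n >>= 1
        count += 1
    return count

def Flajolet_Martin(rdd, hashParams, out_file):
    # collect the distinct values of the current window to the driver
    items = rdd.distinct().collect()
    ground_truth = len(items)
    estimates = []
    for a, b, p in hashParams:
        r = 0
        for x in items:
            h = (a * hash(x) + b) % p
            r = max(r, trailing_zeros(h))
        estimates.append(2 ** r)
    # combine the per-hash estimates; a plain mean keeps the sketch short
    estimate = int(sum(estimates) / len(estimates))
    with open(out_file, "a") as fout:
        csv.writer(fout).writerow(
            [datetime.now().isoformat(), ground_truth, estimate])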
Example #6
 def test_from_no_conf_constructor(self):
     self.ssc = StreamingContext(master=self.master, appName=self.appName,
                            duration=self.batachDuration)
     # Alternatively, use ssc.sparkContext.master; this keeps the code close to the Scala API.
     self.assertEqual(self.ssc.sparkContext._conf.get("spark.master"), self.master)
     self.assertEqual(self.ssc.sparkContext._conf.get("spark.app.name"), self.appName)
Example #7
    def test_slice(self):
        """Basic operation test for DStream.slice."""
        import datetime as dt
        self.ssc = StreamingContext(self.sc, 1.0)
        self.ssc.remember(4.0)
        input = [[1], [2], [3], [4]]
        stream = self.ssc.queueStream(
            [self.sc.parallelize(d, 1) for d in input])

        time_vals = []

        def get_times(t, rdd):
            if rdd and len(time_vals) < len(input):
                time_vals.append(t)

        stream.foreachRDD(get_times)

        self.ssc.start()
        self.wait_for(time_vals, 4)
        begin_time = time_vals[0]

        def get_sliced(begin_delta, end_delta):
            begin = begin_time + dt.timedelta(seconds=begin_delta)
            end = begin_time + dt.timedelta(seconds=end_delta)
            rdds = stream.slice(begin, end)
            result_list = [rdd.collect() for rdd in rdds]
            return [r for result in result_list for r in result]

        self.assertEqual(set([1]), set(get_sliced(0, 0)))
        self.assertEqual(set([2, 3]), set(get_sliced(1, 2)))
        self.assertEqual(set([2, 3, 4]), set(get_sliced(1, 4)))
        self.assertEqual(set([1, 2, 3, 4]), set(get_sliced(0, 4)))
Example #8
    def setUp(self):
        super(DeleteFromCassandraStreamingTest, self).setUp()
        self.ssc = StreamingContext(self.sc, self.interval)

        self.rdds = [
            self.sc.parallelize(range(0, self.size)).map(lambda i: {
                'key': i,
                'int': i,
                'text': i
            })
        ]
        data = self.rdds[0]
        data.saveToCassandra(self.keyspace, self.table)

        # verify the RDD length and actual content
        data = self.rdd()
        self.assertEqual(len(data.collect()), self.size)

        # verify we actually have data for `text` and `int`
        row = data.select('text', 'int').where('key=?', '0').first()
        self.assertEqual(row.text, u'0')
        self.assertEqual(row.int, 0)

        # stream we will use in tests.
        self.stream = self.ssc.queueStream(self.rdds)
Example #9
 def setUp(self):
     class_name = self.__class__.__name__
     conf = SparkConf().set("spark.default.parallelism", 1)
     self.sc = SparkContext(appName=class_name, conf=conf)
     self.sc.setCheckpointDir("/tmp")
     # TODO: decrease duration to speed up tests
     self.ssc = StreamingContext(self.sc, self.duration)
Example #10
 def setup():
     conf = SparkConf().set("spark.default.parallelism", 1)
     sc = SparkContext(conf=conf)
     ssc = StreamingContext(sc, 0.5)
     dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
     wc = dstream.updateStateByKey(updater)
     wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
     wc.checkpoint(.5)
     return ssc
Example #11
 def _create_spark_context(self, spark_config, stream, stream_duration):
     if stream is True:
         self.streaming_context = StreamingContext(
             SparkContext(conf=pyspark.SparkConf().setAll(
                 spark_config.items())).getOrCreate(), stream_duration)
         self.spark = SparkSession(self.streaming_context.sparkContext)
     else:
         self.spark = SparkSession.builder \
          .config(conf=pyspark.SparkConf().setAll(spark_config.items())) \
          .enableHiveSupport().getOrCreate()
Example #12
        def setup():
            conf = SparkConf().set("spark.default.parallelism", 1)
            sc = SparkContext(conf=conf)
            ssc = StreamingContext(sc, 0.5)

            # A function that cannot be serialized
            def process(time, rdd):
                sc.parallelize(range(1, 10))

            ssc.textFileStream(inputd).foreachRDD(process)
            return ssc
Example #13
 def test_text_file_stream(self):
     d = tempfile.mkdtemp()
     self.ssc = StreamingContext(self.sc, self.duration)
     dstream2 = self.ssc.textFileStream(d).map(int)
     result = self._collect(dstream2, 2, block=False)
     self.ssc.start()
     for name in ('a', 'b'):
         time.sleep(1)
         with open(os.path.join(d, name), "w") as f:
             f.writelines(["%d\n" % i for i in range(10)])
     self.wait_for(result, 2)
     self.assertEqual([list(range(10)), list(range(10))], result)
Example #14
File: tests.py Project: zyjibmcn/spark
 def test_binary_records_stream(self):
     d = tempfile.mkdtemp()
     self.ssc = StreamingContext(self.sc, self.duration)
     dstream = self.ssc.binaryRecordsStream(d, 10).map(
         lambda v: struct.unpack("10b", bytes(v)))
     result = self._collect(dstream, 2, block=False)
     self.ssc.start()
     for name in ('a', 'b'):
         time.sleep(1)
         with open(os.path.join(d, name), "wb") as f:
             f.write(bytearray(range(10)))
     self.wait_for(result, 2)
     self.assertEqual([list(range(10)), list(range(10))], [list(v[0]) for v in result])
Example #15
def createSSC():
    # create the ssc
    conf = SparkConf()
    sc = SparkContext(master="local[*]", appName="CheckpointSample", conf=conf)
    ssc = StreamingContext(sc, 3)

    # create the DStream
    ids1 = ssc.socketTextStream("127.0.0.1", 9000)
    ids2 = ids1.flatMap(lambda v: v.split(" ")).map(lambda v: (v, 1))

    # updateStateByKey
    ids2.updateStateByKey(updateFunc).pprint()

    # checkpoint
    ssc.checkpoint("./checkPoints/checkPointSample/Python")

    # return
    return ssc
Example #16
    def test_get_active(self):
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that getActive() returns the active context
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)

        # Verify that getActive() returns None after the context is stopped
        self.ssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = StreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)
Example #17
    def test_get_active_or_create(self):
        # Test StreamingContext.getActiveOrCreate() without checkpoint data
        # See CheckpointTests for tests with checkpoint data
        self.ssc = None
        self.assertEqual(StreamingContext.getActive(), None)

        def setupFunc():
            ssc = StreamingContext(self.sc, self.duration)
            ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
            self.setupCalled = True
            return ssc

        # Verify that getActiveOrCreate() (w/o checkpoint) calls setupFunc when no context is active
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that getActiveOrCreate() returns active context and does not call the setupFunc
        self.ssc.start()
        self.setupCalled = False
        self.assertEqual(StreamingContext.getActiveOrCreate(None, setupFunc),
                         self.ssc)
        self.assertFalse(self.setupCalled)

        # Verify that getActiveOrCreate() calls setupFunc after active context is stopped
        self.ssc.stop(False)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that getActiveOrCreate() calls setupFunc if the Java context is stopped
        self.ssc = StreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)
Example #18
 def setUp(self):
     self.ssc = StreamingContext(self.sc, self.duration)
Example #19
 def setUp(self):
     class_name = self.__class__.__name__
     self.ssc = StreamingContext(appName=class_name, duration=Seconds(1))
Example #20
from pyspark.sql import SparkSession
from pyspark.streaming.context import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

spark = SparkSession.builder.appName("test").master("local[*]").getOrCreate()
ssc = StreamingContext(spark.sparkContext, 10)

lines = KinesisUtils.createStream(
    ssc,
    "test",
    "test_s",
    "https://kinesis.eu-north-1.amazonaws.com",
    "eu-north-1",
    InitialPositionInStream.LATEST,
    awsAccessKeyId="AKIAJ5V6NEAI3YNTWGDA",
    awsSecretKey="xdyXL4jP1SYhiKO9OGhOLYijVbG0BwPnq7J6oRDZ",
    checkpointInterval=2)
Example #21
import sys

from pyspark.streaming.context import StreamingContext
from pyspark.streaming.duration import *

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print >> sys.stderr, "Usage: wordcount <hostname> <port>"
        exit(-1)
    ssc = StreamingContext(appName="PythonStreamingNetworkWordCount",
                           duration=Seconds(1))

    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
    counts = lines.flatMap(lambda line: line.split(" "))\
                  .map(lambda word: (word, 1))\
                  .reduceByKey(lambda a,b: a+b)
    counts.pyprint()

    ssc.start()
    ssc.awaitTermination()
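
Example #21 is written against the early PySpark streaming prototype (Python 2 print syntax, the pyspark.streaming.duration module, Seconds(), and pyprint()). A minimal sketch of the same word count against the current pyspark.streaming API, assuming Python 3, would look like:

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: wordcount <hostname> <port>", file=sys.stderr)
        sys.exit(-1)
    sc = SparkContext(appName="PythonStreamingNetworkWordCount")
    # the batch duration is now given in seconds instead of a Duration object
    ssc = StreamingContext(sc, 1)

    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a + b)
    counts.pprint()  # pprint() replaces the prototype's pyprint()

    ssc.start()
    ssc.awaitTermination()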
Example #22
 def test_from_no_conf_plus_spark_home_plus_env(self):
     self.ssc = StreamingContext(master=self.master, appName=self.appName, 
                            sparkHome=self.sparkHome, environment=self.envPair,
                            duration=self.batachDuration)
     self.assertEqual(self.ssc.sparkContext._conf.get("spark.executorEnv.key"), self.envPair["key"])
Example #23
 def test_from_existing_spark_context(self):
     self.sc = SparkContext(master=self.master, appName=self.appName)
     self.ssc = StreamingContext(sparkContext=self.sc, duration=self.batachDuration)
Example #24
# Section 6.2.3

from pyspark import SparkContext, SparkConf
from pyspark.streaming.context import StreamingContext

conf = SparkConf()
sc = SparkContext(master="local[*]", appName="QueueSample", conf=conf)
ssc = StreamingContext(sc, 3)

rdd1 = sc.parallelize(["a", "b", "c"])
rdd2 = sc.parallelize(["c", "d", "e"])

queue = [rdd1, rdd2]

ds = ssc.queueStream(queue)

ds.pprint()

ssc.start()
ssc.awaitTermination()
Example #25
from pyspark import SparkContext
from pyspark.streaming.context import StreamingContext

sc = SparkContext()
ssc = StreamingContext(
    sc, 1
)  # specify batchDuration, in this case, all data accumulated within 1 second window is going into one batch

rddData = [sc.parallelize([i])
           for i in range(1000)]  # stream is a sequence of RDDs

inStream = ssc.queueStream(rddData)
filter1 = inStream.filter(lambda x: x % 2 == 0)
filter2 = filter1.filter(lambda x: x % 3 == 0)
collected = filter2
collected.pprint()

ssc.start()  # triggers stream to run until ssc.stop() is run
# > 0
# >
# >
# >
# >
# >
# > 6
# >
# >
# >
# > ... any multiples of 6...
ssc.stop()
Example #26
 def test_existing_spark_context_with_settings(self):
     conf = SparkConf()
     conf.set("spark.cleaner.ttl", "10")
     self.sc = SparkContext(master=self.master, appName=self.appName, conf=conf)
     self.ssc = StreamingContext(sparkContext=self.sc, duration=self.batachDuration)
     self.assertEqual(int(self.ssc.sparkContext._conf.get("spark.cleaner.ttl")), 10)
Example #27
 def setupFunc():
     ssc = StreamingContext(self.sc, self.duration)
     ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
     self.setupCalled = True
     return ssc
Example #28
 def test_from_no_conf_plus_spark_home(self):
     self.ssc = StreamingContext(master=self.master, appName=self.appName, 
                            sparkHome=self.sparkHome, duration=self.batachDuration)
     self.assertEqual(self.ssc.sparkContext._conf.get("spark.home"), self.sparkHome)
Example #29
 def __init__(self, sc, spark):
     self.sc = sc
     self.spark = spark
     self.ssc = StreamingContext(self.sc, 1)
Example #30
 def setUpClass(cls):
     super(StreamingTest, cls).setUpClass()
     cls.ssc = StreamingContext(cls.sc, cls.interval)