Example #1
    def perform_training(sc: SparkContext, params_dict: dict):
        batch_duration = params_dict.get('batch_duration', 1)
        training_duration = params_dict.get('training_duration', 20)
        ssc = StreamingContext(sc, batch_duration)
        topics = ['normal-ekg-stream']
        kafka_params = {'metadata.broker.list': 'localhost:9092'}
        kvs = KafkaUtils.createDirectStream(
            ssc,
            topics,
            kafkaParams=kafka_params,
            valueDecoder=lambda val: json.loads(val.decode('utf-8')))

        windowed_signal = kvs.map(lambda msg: Vectors.dense(
            [float(value) for value in msg[1]['signal_values']]))

        # windowed_signal.foreachRDD(Plotter.plot_signal_window)
        model = StreamingKMeans(k=20,
                                decayFactor=1.0).setRandomCenters(188, 1.0, 0)
        model.trainOn(windowed_signal)

        ssc.start()
        ssc.awaitTerminationOrTimeout(training_duration)
        ssc.stop(stopSparkContext=False, stopGraceFully=True)

        return model.latestModel()
Example #2
def main():
    pwords = load_wordlist("./Dataset/positive.txt")
    nwords = load_wordlist("./Dataset/negative.txt")

    conf = SparkConf().setMaster("local[2]").setAppName("TweeStreamer")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("WARN")
    # Creating a streaming context with batch interval of 10 sec
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    kstream = KafkaUtils.createDirectStream(
        ssc,
        topics=['twitter-topic1'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})

    #tweets = kstream.map(lambda x: json.loads( x[1].decode('utf-8')))
    tweets = kstream.map(lambda x: json.loads(x[1]))
    tweetsUsentiment = tweets.map(
        lambda tweet: tweetwithSentiment(tweet, pwords, nwords))
    tweetsUsentiment.pprint()

    #tweetsUsentiment.saveToCassandra("killranalytics", "real_time_data")

    ssc.start()
    ssc.awaitTerminationOrTimeout(100)
    ssc.stop(stopGraceFully=True)
Example #3
def main():
    conf = SparkConf().setMaster("local[2]").setAppName("twitterStream")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 15)  # batch interval 15 seconds
    ssc.checkpoint("checkpoint")

    # load
    nFeelingWords = load("./Dataset/nFeeling.txt")
    pFeelingWords = load("./Dataset/pFeeling.txt")

    # accept kafka data
    kstream = KafkaUtils.createDirectStream(
        ssc,
        topics=['twitterStream'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii", "ignore"))
    words = tweets.flatMap(lambda line: line.split(" "))
    nfeelings = words.map(lambda word: ('nfeelings', 1)
                          if word in nFeelingWords else ('nfeelings', 0))
    pfeelings = words.map(lambda word: ('pfeelings', 1)
                          if word in pFeelingWords else ('pfeelings', 0))
    bothFeelings = pfeelings.union(nfeelings)
    feelingCounts = bothFeelings.reduceByKey(lambda x, y: x + y)
    currentFeelingCounts = feelingCounts.updateStateByKey(sumCount)
    currentFeelingCounts.pprint()

    counts = []
    feelingCounts.foreachRDD(lambda t, rdd: counts.append(rdd.collect()))

    ssc.start()
    ssc.awaitTerminationOrTimeout(45)
    ssc.stop(stopGraceFully=True)
    constructPlot(counts)
Example #4
def start(port, duration=40, jobID='', batch_interval=20):
    '''
    All Spark Streaming options go here
    '''

    # Create a local StreamingContext with two worker threads and the
    #   given batch interval (batch_interval seconds)
    sc = SparkContext('local[2]', 'NetworkWordCount')
    ssc = StreamingContext(sc, batch_interval)

    # Create a DStream that will connect to hostname:port, like localhost:9999
    lines = ssc.socketTextStream(HOST, port)

    text = lines.map(lambda post: get_json(post))\
                .filter(lambda post: post is not None)\
                .filter(lambda post: 'created_at' in post)\
                .filter(lambda post: is_valid_string_format(post['text']))\
                .map(lambda post: post['created_at'] + ' | ' + post['text'])
    '''
    No write to disk option! This will compute sentiment on the fly without
    first writing all text to temporary disk storage. To utilize this map(),
    users will need to only save the sentiment counts to a file and read
    from it in the runner.py code
    '''
    # sentiment_counts = cleaned_text.map(
    #     lambda text: (discretized_vader(text), 1)
    # ).reduceByKey(lambda x, y: x + y)

    # sentiment_counts.pprint()
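    # As the docstring above suggests, one could instead persist only the counts
    # for runner.py to read back (path and suffix here are illustrative):
    # sentiment_counts.saveAsTextFiles('./sentiment_counts', suffix=jobID)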

    text.saveAsTextFiles('./text', suffix=jobID)

    ssc.start()
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop()
Example #5
def main():
    pwords = load_wordlist("./Dataset/positive.txt")
    nwords = load_wordlist("./Dataset/negative.txt")

    conf = SparkConf().\
        setMaster("local[2]").\
        setAppName("TweeStreamer").\
        set("spark.cassandra.connection.host",\
        "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")
    sc = CassandraSparkContext(conf=conf)
    sc.setLogLevel("WARN")
    sql = SQLContext(sc)
    # Creating a streaming context with batch interval of 10 sec
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    kstream = KafkaUtils.createDirectStream(
        ssc,
        topics=['twitter-topic1'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})

    #tweets = kstream.map(lambda x: json.loads( x[1].decode('utf-8')))
    tweets = kstream.map(lambda x: json.loads(x[1]))
    tweetsUsentiment = tweets.map(
        lambda tweet: tweetwithSentiment(tweet, pwords, nwords))

    #searchTermSentiment =
    tweetsUsentiment.pprint()

    tweetsUsentiment.saveToCassandra("tweetdb", "tweettable")

    ssc.start()
    ssc.awaitTerminationOrTimeout(100)
    ssc.stop(stopGraceFully=True)
Example #6
def net_streaming():
    '''
    Purpose    : connect to a TCP server and process the socket data stream
    Param host : TCP server IP
    Param port : TCP server port
    '''

    if len(sys.argv) != 3:
        print "usage: chapter9_streaming.py <tcp host> <tcp port>"
        return -1

    host, port = sys.argv[1], sys.argv[2]

    sc = SparkContext(appName="pyspark_net_streaming")
    stream_sc = StreamingContext(sc, 1)
    socketTexts = stream_sc.socketTextStream(host, int(port))

    counts = socketTexts.flatMap(lambda line: line.split(" ")) \
                        .map(lambda word: (word, 1)) \
                        .reduceByKey(lambda a, b: a + b)

    counts.pprint(24)
    stream_sc.start()
    stream_sc.awaitTerminationOrTimeout(timeout=30)
    stream_sc.stop(stopSparkContext=False, stopGraceFully=True)
Example #7
def kafka_streaming():
    '''
    Purpose: receive and process Kafka messages
    '''

    if len(sys.argv) != 3:
        print "Usage: chapter9_streaming.py <zookeepr host> <topic name>"
        return -1

    sc = SparkContext(appName="kafka_streaming")
    ssc = StreamingContext(sc, 1)

    zkQuorum, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer",
                                  {topic: 1})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
            .map(lambda word: (word, 1)) \
            .reduceByKey(lambda a, b: a+b)

    counts.pprint(24)

    ssc.start()
    ssc.awaitTerminationOrTimeout(timeout=30)
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
Example #8
def main():
    # load
    nfeeling_words = load_word_list(
        "/home/peterli2he1/spark/Dataset/nFeeling.txt")
    pfeeling_words = load_word_list(
        "/home/peterli2he1/spark/Dataset/pFeeling.txt")

    # Initialize spark streaming context
    conf = SparkConf().setAppName("TwitterStreamApplication")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("checkpoint_TwitterStreamApplication")

    # Processing data from Kafka
    kstream = KafkaUtils.createDirectStream(
        ssc, ["twitter-stream"], {"metadata.broker.list": "localhost:9092"})
    tweets = kstream.map(lambda x: x[1].encode("ascii", "ignore"))
    words = tweets.flatMap(lambda line: line.split(" "))

    nfeelings = words.map(lambda word: ("nfeelings", 1)
                          if word in nfeeling_words else ("nfeelings", 0))
    pfeelings = words.map(lambda word: ("pfeelings", 1)
                          if word in pfeeling_words else ("pfeelings", 0))

    both_feelings = pfeelings.union(nfeelings)
    feeling_counts = both_feelings.reduceByKey(lambda x, y: x + y)

    counts = []
    feeling_counts.foreachRDD(lambda t, rdd: counts.append(rdd.collect()))

    ssc.start()
    ssc.awaitTerminationOrTimeout(10)
    ssc.stop(stopGraceFully=True)
    construct_plot(counts)
Example #9
    def start(self, port, keyword, timeout=60):
        print("port", port)
        print("keyword", keyword)
        sc = SparkContext('local[3]',"TwitterStreamApp" + str(port))
        sc.setLogLevel("ERROR")
        # create the Streaming Context from the above spark context with interval size 3 seconds
        ssc = StreamingContext(sc, 3)
        # # setting a checkpoint to allow RDD recovery
        # ssc.checkpoint("checkpoint_TwitterApp" + str(port))
        # read data from port
        self.db = DBFireBase(keyword)
        dataStream = ssc.socketTextStream("127.0.0.1", port)
        # processing
        # split each tweet into words
        tokens = dataStream.flatMap(lambda line: wc_tokenize(_preprocess(line)))

        # count each word
        word_count = tokens.map(lambda x: (x, 1))
        word_counts = word_count.reduceByKey(lambda a, b: a+b)

        # hate speech prediction
        result = dataStream.map(lambda line: (line, predict(get_feats(np.asarray([line], dtype='U')))))

        # upload to Firebase
        word_counts.foreachRDD(self.process_rdd)
        result.foreachRDD(self.process_rdd2)
        ssc.start()
        # wait for timeout default 60s
        ssc.awaitTerminationOrTimeout(timeout)
        ssc.stop()
Example #10
def Stream():
    sc = SparkContext(appName = 'NewsTwitter')
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint('checkpoint')
    kafkaStream = KafkaUtils.createStream(ssc, 'localhost:2181', 'spark-streaming', {'twitter':1})

    parsed = kafkaStream.map(lambda v: json.loads(v[1])).map(parseText)
    parsed.cache()
    tweets_saver = parsed.map(lambda tweet: tweet + '\n').reduce(lambda x, y: x + y)
    tweets_saver.saveAsTextFiles('file:///home/lmh/Downloads/temp/lmh/text/tweets/t')
    sentiment = parsed.map(lambda tweet: [analize_sentiment(tweet), 1])
    sentiment_count = sentiment.reduceByKey(add)
    sentiment_count.cache()
    sentiment_count.saveAsTextFiles('file:///home/lmh/Downloads/temp/lmh/text/sentiment/s')
    sentiment_count.pprint()
    sentiment_fig_data = sentiment_count.updateStateByKey(cumu)

    counts = []
    sentiment_fig_data.foreachRDD(lambda s, rdd: counts.append(rdd.collect()))

    ssc.start()
    ssc.awaitTerminationOrTimeout(60)
    ssc.stop(stopGraceFully = True)

    return counts
Example #11
def file_streaming_dynamic():
    '''
    Purpose: process CSV files generated dynamically on the Linux filesystem
    and move the processed files to another directory
    '''
    sc = SparkContext.getOrCreate()
    stream_sc = StreamingContext(sc, 1)
    file_stream = stream_sc.textFileStream("file:///home/hadoop/stream/").map(
        lambda x: x.split(","))

    file_stream.pprint(24)
    file_stream.saveAsTextFiles("/home/hadoop/output/")

    stream_sc.start()
    stream_sc.awaitTerminationOrTimeout(timeout=100)
    stream_sc.stop(stopSparkContext=True, stopGraceFully=True)
Example #12
def spark_analysis(topic_name):
    sc = SparkContext(appName="PythhonSpark")
    ssc = StreamingContext(sc, 60)
    kvs = KafkaUtils.createDirectStream(
        ssc, [topic_name], {"metadata.broker.list": "localhost:9092"})

    lines = kvs.map(lambda x: x[1].encode("ascii", "ignore"))
    words = lines.flatMap(lambda line: line.split("\n"))
    review = words.map(lambda word:
                       (topic_name, [analyze_sentiment(topic_name, word), 1]))
    answer = review.reduceByKey(lambda x, y: [x[0] + y[0], x[1] + y[1]])

    answer.pprint()
    ssc.start()
    #ssc.awaitTermination()
    ssc.awaitTerminationOrTimeout(600)
Example #13
def file_streaming_static():
    """
	   功能  :  监控在HDFS上的json文件,读取数据将处理的结果保存会HDFS
	"""
    sc = SparkContext.getOrCreate()
    spark = SparkSession(sc)

    stream_sc = StreamingContext(sc, 1)
    file_data = stream_sc.textFileStream(
        "hdfs://127.0.0.1:9000/test_data").map(lambda x: x.split(","))

    file_data.pprint(24)
    file_data.foreachRDD(file_save)

    stream_sc.start()
    stream_sc.awaitTerminationOrTimeout(timeout=30)
    stream_sc.stop(stopSparkContext=True, stopGraceFully=True)
Example #14
def startStreaming(socketio):
    # Functions used to process incoming data

    def process_batch(batch):
        global clf
        # convert byte lines into tweets jsons
        #tweet = json.loads(json.loads(batch[1])) # TO GET DICT

        tweet = json.loads(batch[1])
        tweet = pd.read_json(tweet, typ='series', orient='records')
        #tweet = tweet.values.reshape(1, -1)

        return tweet

    def process_RDD(rdd):
        global clf
        tweets = rdd.collect()

        if len(tweets) > 0:
            preds = clf.predict(tweets)

            bots = preds.tolist().count(0)
            humans = preds.tolist().count(1)

            print("Sending data to front-end")

            socketio.emit('update_values',
                              {'bots': bots, 'humans': humans})

    # Setup Spark

    sc = SparkContext.getOrCreate()#(appName="PythonStreamingDirectKafkaWordCount")

    ssc = StreamingContext(sc, 0.1)

    kvs = KafkaUtils.createDirectStream(ssc, ["tweet_stream"], {'metadata.broker.list': "localhost:9092"})

    tweets = kvs.map(lambda x: process_batch(x))

    tweets.foreachRDD(process_RDD)

    #counts.pprint()
    ssc.start()
    ssc.awaitTerminationOrTimeout(600)
    ssc.stop()
Example #15
def main():
    # load
    nfeeling_words = load_word_list("/home/xxx/spark/Dataset/nFeeling.txt")
    pfeeling_words = load_word_list("/home/xxx/spark/Dataset/pFeeling.txt")

    # Initialize spark streaming context
    conf = SparkConf().setAppName("TwitterStreamApplication")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)  # 1 sec mini batch
    ssc.checkpoint("checkpoint_TwitterStreamApplication")

    # Processing data from Kafka
    # twitter-stream is our Kafka topic; metadata.broker.list is the address of the Kafka broker.
    # KafkaUtils returns a DStream, i.e. a sequence of RDDs, so the data below is processed as RDDs.
    kstream = KafkaUtils.createDirectStream(
        ssc, ["twitter-stream"], {"metadata.broker.list": "localhost:9092"})
    # The default tweet format from the Twitter API is a JSON array, with each tweet as a JSON string.
    # kstream yields (key, value) tuples: the key is message metadata (partition, etc.),
    # the value is the message content, i.e. the tweet.
    # We only need the value, x[1], to analyse the tweet's positive/negative feeling;
    # encode as ASCII and ignore anything else. map() walks every kstream tuple and keeps what we want in "tweets".
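    # e.g. a record might look like (None, '{"text": "...", "lang": "en"}');
    # the exact key depends on the Kafka producer (illustrative only).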
    tweets = kstream.map(lambda x: x[1].encode("ascii", "ignore"))
    # Split each tweet line into a list of words, then flatMap them into one
    # large list of words across all tweets (flatMap flattens the 2-D list into 1-D).
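    # e.g. ["good day", "bad"] -> ["good", "day", "bad"] (illustrative values)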
    words = tweets.flatMap(lambda line: line.split(" "))
    nfeelings = words.map(lambda word: ("nfeelings", 1)
                          if word in nfeeling_words else ("nfeelings", 0))
    pfeelings = words.map(lambda word: ("pfeelings", 1)
                          if word in pfeeling_words else ("pfeelings", 0))
    # combine pos and neg feelings tuples into one large list
    both_feelings = pfeelings.union(nfeelings)
    # add up all the pos/neg feeling counts respectively.
    feeling_counts = both_feelings.reduceByKey(lambda x, y: x + y)
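    # e.g. per batch, [("pfeelings", 1), ("pfeelings", 0), ("nfeelings", 0), ...]
    # reduces to [("pfeelings", 1), ("nfeelings", 0)] (values are illustrative)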

    counts = []
    # feeling_counts is a DStream of RDDs, so use foreachRDD to append each batch's pos/neg tuples to "counts" for plotting.
    # collect() is an action, so it triggers the reduceByKey transformation for each batch.
    feeling_counts.foreachRDD(lambda t, rdd: counts.append(rdd.collect()))

    ssc.start()
    ssc.awaitTerminationOrTimeout(10)
    ssc.stop(stopGraceFully=True)

    construct_plot(counts)
Example #16
def main():
    pwords = load_wordlist("../Dataset/positive.txt")
    nwords = load_wordlist("../Dataset/negative.txt")
    sterms = load_wordlist("../Dataset/keyWords.txt")
    conf = SparkConf().\
        setMaster("local[2]").\
        setAppName("TweeStreamer").\
        set("spark.cassandra.connection.host",\
        "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")
    sc = CassandraSparkContext(conf=conf)
    sc.setLogLevel("WARN")

    # Creating a streaming context with batch interval of 10 sec
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    kstream = KafkaUtils.createDirectStream(
        ssc,
        topics=['twitter-topic1'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})

    tweets = kstream.map(lambda x: json.loads(x[1]))
    tweets.count().map(lambda x: 'Tweets in this batch: %s' % x).pprint()
    tweetsUsentiment = tweets.map(
        lambda tweet: tweetwithSentiment(tweet, pwords, nwords, sterms))

    searchTermUsentiment = tweetsUsentiment.flatMap(
        lambda tweet: searchTermFunction(tweet, sterms)).reduceByKey(
            lambda a, b: a + b)
    searchTermUsentiment = searchTermUsentiment.map(
        lambda kv: {
            "searchterm": "_" + kv[0],
            "insertion_time": datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S'),
            "sentiment": kv[1]
        })
    searchTermUsentiment.pprint()

    searchTermUsentiment.saveToCassandra("tweetdb", "searchtermtable")
    # searchTermSentiment = tweetsUsentiment.map(lambda tweet: searchTermFunction(tweet,sterms))

    ssc.start()
    ssc.awaitTerminationOrTimeout(1000)
    ssc.stop(stopGraceFully=True)
Example #17
def main():
    # load positive/negative word list
    # Use absolute file paths if necessary
    nfeeling_words = load_word_list("../word_monitor/dataset/nFeeling.txt")
    pfeeling_words = load_word_list("../word_monitor/dataset/pFeeling.txt")

    # Initialize spark streaming context
    conf = SparkConf().setAppName("TwitterStreamApplication")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("checkpoint_TwitterStreamApplication")

    # Processing data from Kafka
    kstream = KafkaUtils.createDirectStream(
        ssc, ["twitter-stream"], {"metadata.broker.list": "localhost:9092"})
    # Extract the message value and only keep ASCII-encodable content
    tweets = kstream.map(lambda x: x[1].encode("ascii", "ignore"))
    # Extract all words in the tweets and put them in a list
    words = tweets.flatMap(lambda line: line.split(" "))
    nfeelings = words.map(lambda word: ("nfeelings", 1)
                          if word in nfeeling_words else ("nfeelings", 0))
    pfeelings = words.map(lambda word: ("pfeelings", 1)
                          if word in pfeeling_words else ("pfeelings", 0))
    both_feelings = pfeelings.union(nfeelings)
    feeling_counts = both_feelings.reduceByKey(
        lambda x, y: x + y
    )  # Reduce by key (nfeelings or pfeelings) -> [(nfeelings, count), (pfeelings, count)]

    counts = []
    # foreachRDD() takes a function with two parameters: (time, rdd)
    feeling_counts.foreachRDD(lambda t, rdd: counts.append(rdd.collect()))
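    # counts ends up like [[("nfeelings", 2), ("pfeelings", 5)], ...], one entry
    # per batch (values are illustrative)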

    ssc.start()
    ssc.awaitTerminationOrTimeout(10)  # Set running time
    ssc.stop(stopGraceFully=True)

    construct_plot(counts)
Example #18
def main():
    tickerSymbols = load_wordlist("../Dataset/tickerSymbols.txt")
    conf = SparkConf().\
    setMaster("local[2]").\
    setAppName("StockStreamer").\
    set("spark.cassandra.connection.host",\
    "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")

    sc = SparkContext(conf=conf)
    sc.setLogLevel("WARN")

    # Creating a streaming context with batch interval of 10 sec
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")
    kstream = KafkaUtils.createDirectStream(\
    ssc, topics = ['stock-topic1'], kafkaParams = {"metadata.broker.list": 'localhost:9092'})
    stock = kstream.map(lambda kv: json.loads(kv[1]))
    stock.pprint()
    stock.saveToCassandra("tweetdb", "stocktable")

    # Start the computation
    ssc.start()
    ssc.awaitTerminationOrTimeout(10000)
    ssc.stop(stopGraceFully=True)
Example #19
def main():
    conf = SparkConf().setMaster("local[2]").setAppName("Streamer")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("WARN")
    # Creating a streaming context with batch interval of 10 sec
    ssc = StreamingContext(sc, 10)
    #ssc.checkpoint("checkpoint")
    kstream = KafkaUtils.createDirectStream(
        ssc,
        topics=['stock-topic1'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})
    stock = kstream.map(lambda kv: json.loads(kv[1]))
    stock.pprint()

    #value = stock.map(lambda stock1: stock1[u'bidaskvalvol'])
    #value.pprint()
    #text_counts = stock.map(lambda stockQ: (stockQ['bidaskvalvol'],1)).reduceByKey(lambda x,y: x + y)

    #text_counts.pprint()

    # Start the computation
    ssc.start()
    ssc.awaitTerminationOrTimeout(100)
    ssc.stop(stopGraceFully=True)
Example #20
# Step 7-4-3. Creating the sum of each row of numbers.


def stringToNumberSum(data):
    removedSpaceData = data.strip()
    if removedSpaceData == '':
        return (None)
    splittedData = removedSpaceData.split(' ')
    numData = [float(x) for x in splittedData]
    sumOfData = sum(numData)
    return (sumOfData)


dataInString = '10 10 20 '

stringToNumberSum(dataInString)
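# For '10 10 20 ' the call above returns 40.0 (10 + 10 + 20).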

# Step 7-4-4. Reading data from Kafka and computing the sum of each row.

from pyspark.streaming.kafka import KafkaUtils
from pyspark.streaming import StreamingContext
bookStreamContext = StreamingContext(sc, 10)
bookKafkaStream = KafkaUtils.createStream(ssc=bookStreamContext,
                                          zkQuorum='localhost:2185',
                                          groupId='pysparkBookGroup',
                                          topics={'pysparkBookTopic': 1})
sumedData = bookKafkaStream.map(lambda data: stringToNumberSum(data[1]))
sumedData.pprint()
bookStreamContext.start()
bookStreamContext.awaitTerminationOrTimeout(30)
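# To shut down cleanly, one could then stop the context, for example:
# bookStreamContext.stop(stopSparkContext=False, stopGraceFully=True)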
Example #21
class StreamingContextTests(PySparkStreamingTestCase):

    duration = 0.1
    setupCalled = False

    def _add_input_stream(self):
        inputs = [range(1, x) for x in range(101)]
        stream = self.ssc.queueStream(inputs)
        self._collect(stream, 1, block=False)

    def test_stop_only_streaming_context(self):
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5)

    def test_stop_multiple_times(self):
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.ssc.stop(False)

    def test_queue_stream(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        result = self._collect(dstream, 3)
        self.assertEqual(input, result)

    def test_text_file_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream2 = self.ssc.textFileStream(d).map(int)
        result = self._collect(dstream2, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "w") as f:
                f.writelines(["%d\n" % i for i in range(10)])
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], result)

    def test_binary_records_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream = self.ssc.binaryRecordsStream(d, 10).map(
            lambda v: struct.unpack("10b", bytes(v)))
        result = self._collect(dstream, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "wb") as f:
                f.write(bytearray(range(10)))
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], [list(v[0]) for v in result])

    def test_union(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        dstream2 = self.ssc.queueStream(input)
        dstream3 = self.ssc.union(dstream, dstream2)
        result = self._collect(dstream3, 3)
        expected = [i * 2 for i in input]
        self.assertEqual(expected, result)

    def test_transform(self):
        dstream1 = self.ssc.queueStream([[1]])
        dstream2 = self.ssc.queueStream([[2]])
        dstream3 = self.ssc.queueStream([[3]])

        def func(rdds):
            rdd1, rdd2, rdd3 = rdds
            return rdd2.union(rdd3).union(rdd1)

        dstream = self.ssc.transform([dstream1, dstream2, dstream3], func)

        self.assertEqual([2, 3, 1], self._take(dstream, 3))

    def test_transform_pairrdd(self):
        # This regression test case is for SPARK-17756.
        dstream = self.ssc.queueStream(
            [[1], [2], [3]]).transform(lambda rdd: rdd.cartesian(rdd))
        self.assertEqual([(1, 1), (2, 2), (3, 3)], self._take(dstream, 3))

    def test_get_active(self):
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that getActive() returns the active context
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)

        # Verify that getActive() returns None
        self.ssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = StreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

    def test_get_active_or_create(self):
        # Test StreamingContext.getActiveOrCreate() without checkpoint data
        # See CheckpointTests for tests with checkpoint data
        self.ssc = None
        self.assertEqual(StreamingContext.getActive(), None)

        def setupFunc():
            ssc = StreamingContext(self.sc, self.duration)
            ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
            self.setupCalled = True
            return ssc

        # Verify that getActiveOrCreate() (w/o checkpoint) calls setupFunc when no context is active
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that getActiveOrCreate() returns active context and does not call the setupFunc
        self.ssc.start()
        self.setupCalled = False
        self.assertEqual(StreamingContext.getActiveOrCreate(None, setupFunc), self.ssc)
        self.assertFalse(self.setupCalled)

        # Verify that getActiveOrCreate() calls setupFunc after active context is stopped
        self.ssc.stop(False)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = StreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

    def test_await_termination_or_timeout(self):
        self._add_input_stream()
        self.ssc.start()
        self.assertFalse(self.ssc.awaitTerminationOrTimeout(0.001))
        self.ssc.stop(False)
        self.assertTrue(self.ssc.awaitTerminationOrTimeout(0.001))
Example #22
                                   reverse=True)
    for i in range(3):
        print("   " + str(word_count_dict_tuple[i]))


# --------------------------------------------------------------------------
# DStream processing
# --------------------------------------------------------------------------
textDataRDD = ssc.textFileStream(Directory)
textDataRDD = textDataRDD.map(lambda row: row.split(SPACE_CHARACTER)[3])
count_barrage = textDataRDD.flatMap(splitWord).map(lambda word: (word, 1))
count_word = count_barrage.reduceByKey(lambda a, b: a + b)
count_word.foreachRDD(topBarrage)
# print(count)

# --------------------------------------------------------------------------
# DStream main()
# --------------------------------------------------------------------------
ssc.start()
ssc.awaitTerminationOrTimeout(64)
ssc.stop()

print("DStream finished.")
f = open('result.txt', 'a')
f.write("result:")
f.write(
    str(
        sorted(accumulate_barrage.items(), key=lambda kv: kv[1],
               reverse=True)[:10]))
f.close()
Example #23
        def run_spark_job(queue: Queue,
                          _agg_function: AggregationFunction,
                          _agg_window_millis: int,
                          _spark_opts: dict = {},
                          _environment: dict = {}):
            os.environ.update(_environment)
            try:
                try:
                    import findspark
                    findspark.init()
                except Exception as ex:
                    self.logger.warn("Cannot import Spark pyspark with"
                                     " findspark. Message: {}".format(str(ex)))
                    pass

                from pyspark.sql import SparkSession
                from pyspark.streaming import StreamingContext
                from pyspark.sql.functions import expr, window
                from pyspark.serializers import NoOpSerializer
                from pyspark.streaming import DStream
                from pyspark.streaming.kafka import utf8_decoder

                spark_builder = SparkSession \
                    .builder

                for k in _spark_opts:
                    spark_builder = spark_builder.config(k, _spark_opts[k])

                spark_builder = spark_builder \
                    .appName(str(self)) \
                    .config("spark.jars.packages",
                            "org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.1,"
                            "org.apache.bahir:spark-streaming-pubsub_2.11:2.2.1") \
                    .config("spark.jars",
                            BASE_PATH + "/lib/streaming-pubsub-serializer_2.11-0.1.jar")

                spark = spark_builder.getOrCreate()
                spark.sparkContext.setLogLevel("WARN")
                ssc = StreamingContext(spark.sparkContext,
                                       (agg_window_millis / 1000))

                agg = expr("value")
                if _agg_function == AggregationFunction.AVG:
                    agg = expr("avg(value)")
                elif _agg_function == AggregationFunction.SUM:
                    agg = expr("sum(value)")
                elif _agg_function == AggregationFunction.COUNT:
                    agg = expr("count(value)")
                elif _agg_function == AggregationFunction.P50:
                    agg = expr("percentile(value, 0.5)")
                elif _agg_function == AggregationFunction.P75:
                    agg = expr("percentile(value, 0.75)")
                elif _agg_function == AggregationFunction.P95:
                    agg = expr("percentile(value, 0.95)")
                elif _agg_function == AggregationFunction.P99:
                    agg = expr("percentile(value, 0.99)")

                deserializer = \
                    ssc._jvm.org.apache.spark.streaming.pubsub.SparkPubsubMessageSerializer()  # noqa: E501
                pubsub_utils = \
                    ssc._jvm.org.apache.spark.streaming.pubsub.PubsubUtils
                credentials = \
                    ssc._jvm.org.apache.spark.streaming.pubsub.SparkGCPCredentials
                storage_level = \
                    ssc._jvm.org.apache.spark.storage.StorageLevel

                _pubsub_stream = pubsub_utils \
                    .createStream(ssc._jssc,
                                  project_id,
                                  subscription,
                                  credentials.Builder().build(),
                                  storage_level.DISK_ONLY())
                _pubsub_stream_des = _pubsub_stream.transform(deserializer)
                ser = NoOpSerializer()
                pubsub_stream = DStream(_pubsub_stream_des, ssc,
                                        ser).map(utf8_decoder)

                def aggregate_rdd(_queue, _agg, df, ts):

                    secs = int(self.agg_window_millis / 1000)
                    win = window("ts", "{}  seconds".format(secs))
                    if df.first():
                        aggs = df \
                            .groupBy("application", win) \
                            .agg(_agg.alias("value")) \
                            .collect()

                        for row in aggs:
                            message = InputMessage(row["application"],
                                                   value=row["value"],
                                                   ts=ts)
                            self.logger.debug("Enqueue: {}".format(
                                message.to_json()))
                            try:
                                _queue.put(message.to_json())
                            except AssertionError as ex:
                                self.logger.warn(str(ex))
                    else:
                        self.logger.warn("Empty RDD")

                # Consume the Pub/Sub stream
                pubsub_stream \
                    .foreachRDD(lambda ts, rdd:
                                aggregate_rdd(queue, agg,
                                              spark.read.json(rdd), ts))

                # Run
                ssc.start()
                if "timeout" in _spark_opts:
                    ssc.awaitTerminationOrTimeout(_spark_opts["timeout"])
                    ssc.stop()
                    spark.stop()
                else:
                    ssc.awaitTermination()
                    ssc.stop()
                    spark.stop()

            except Exception as e:
                raise e
Example #24
File: task.py Project: whisk/ccc
  dstream = dstream.flatMap(extract_carr_arr_delay).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_average)
elif args.task == 'q13':
  dstream = dstream.flatMap(extract_weekday_arr_delay).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_average)
elif args.task == 'q21':
  dstream = dstream.flatMap(extract_origin_carrier_dep_delay).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_complex_average)
elif args.task == 'q22':
  dstream = dstream.flatMap(extract_origin_destination_dep_delay).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_complex_average)
elif args.task == 'q23':
  dstream = dstream.flatMap(extract_route_carrier_arr_delay).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_complex_average)
elif args.task == 'q24':
  dstream = dstream.flatMap(extract_route_arr_delay).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_average)
elif args.task == 'q32':
  get_cass().execute('truncate %s' % schema['table'])
  dstream = dstream.flatMap(extract_trip_info).foreachRDD(save_trip)
else:
  print("Unknown task")

# runner
ts_last_data = time.time()
ssc.start()
while True:
  res = ssc.awaitTerminationOrTimeout(args.run_interval)
  if res:
    # stopped elsewhere
    break
  else:
    # still running
    if time.time() - ts_last_data > args.idle_time:
      dump("No data received for %d seconds, stopping..." % args.idle_time)
      ssc.stop(stopSparkContext=True, stopGraceFully=False)
Example #25
        def run_spark_job(queue: Queue,
                          _agg_function: AggregationFunction,
                          _agg_window_millis: int,
                          _spark_opts: dict = {},
                          _environment: dict = {}):
            os.environ.update(_environment)
            try:
                try:
                    import findspark
                    findspark.init()
                except Exception as ex:
                    self.logger.warn("Cannot import Spark pyspark with"
                                     " findspark. Message: {}".format(str(ex)))
                    pass

                from pyspark.sql import SparkSession
                from pyspark.streaming import StreamingContext
                from pyspark.streaming.kafka import KafkaUtils
                from pyspark.sql.functions import expr, window

                spark_builder = SparkSession \
                    .builder

                for k in _spark_opts:
                    spark_builder = spark_builder.config(k, _spark_opts[k])

                spark_builder = spark_builder \
                    .appName(str(self)) \
                    .config("spark.jars.packages",
                            "org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.1,"
                            "org.apache.bahir:spark-streaming-pubsub_2.11:2.2.1") \
                    .config("spark.jars",
                            BASE_PATH + "/lib/streaming-pubsub-serializer_2.11-0.1.jar")

                spark = spark_builder.getOrCreate()
                spark.sparkContext.setLogLevel("WARN")
                ssc = StreamingContext(spark.sparkContext,
                                       (agg_window_millis / 1000))

                agg = expr("value")
                if _agg_function == AggregationFunction.AVG:
                    agg = expr("avg(value)")
                elif _agg_function == AggregationFunction.SUM:
                    agg = expr("sum(value)")
                elif _agg_function == AggregationFunction.COUNT:
                    agg = expr("count(value)")
                elif _agg_function == AggregationFunction.P50:
                    agg = expr("percentile(value, 0.5)")
                elif _agg_function == AggregationFunction.P75:
                    agg = expr("percentile(value, 0.75)")
                elif _agg_function == AggregationFunction.P95:
                    agg = expr("percentile(value, 0.95)")
                elif _agg_function == AggregationFunction.P99:
                    agg = expr("percentile(value, 0.99)")

                kafka_stream = KafkaUtils.createDirectStream(
                    ssc, [self.input_topic],
                    {"metadata.broker.list": ",".join(self.broker_servers)})

                def aggregate_rdd(_queue, _agg, df, ts):

                    secs = int(self.agg_window_millis / 1000)
                    win = window("ts", "{}  seconds".format(secs))
                    if df.first():
                        aggs = df \
                            .groupBy("application", win) \
                            .agg(_agg.alias("value")) \
                            .collect()

                        for row in aggs:
                            message = InputMessage(row["application"],
                                                   value=row["value"],
                                                   ts=ts)
                            self.logger.debug("Enqueue: {}".format(
                                message.to_json()))
                            try:
                                _queue.put(message.to_json())
                            except AssertionError as ex:
                                self.logger.warn(str(ex))
                    else:
                        warnings.warn("Empty RDD")

                # Create kafka stream
                kafka_stream \
                    .map(lambda x: x[1]) \
                    .foreachRDD(lambda ts, rdd:
                                aggregate_rdd(queue, agg,
                                              spark.read.json(rdd), ts))

                # Run
                ssc.start()
                if "timeout" in _spark_opts:
                    ssc.awaitTerminationOrTimeout(_spark_opts["timeout"])
                    ssc.stop()
                    spark.stop()
                else:
                    ssc.awaitTermination()
                    ssc.stop()
                    spark.stop()

            except Exception as e:
                raise e
Example #26
class BiliSparkStreaming():
    def __init__(self, master):
        self.master = master
        scf = SparkConf().setAppName("BiliSpark").setMaster(self.master).set("spark.cores.max", "3")
        self.sc = SparkContext(conf=scf)
        # sc.setLogLevel(logging.WARNING)
        '''Directory to monitor'''
        self.monitor_directory = "/Users/chenhao/Documents/BiliSpark/data"
        '''Output directory'''
        self.writeDirectory = '/Users/chenhao/Documents/BiliData/data/'
        self.streamingContext = StreamingContext(self.sc, 10)
        sparkSession = SparkSession.builder.config(conf=scf).getOrCreate()

        self.mongo = MongoDB("mongodb://localhost:27017/", "biliSpark")

        self.months = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
        '''Holds all data received so far'''
        self.total_data = None

    '''Get the singleton SparkSession instance'''
    def getSparkSessionInstance(self, sparkConf):
        if ('sparkSessionSingletonInstance' not in globals()):
            globals()['sparkSessionSingletonInstance'] = SparkSession \
                .builder \
                .config(conf=sparkConf) \
                .getOrCreate()
        return globals()['sparkSessionSingletonInstance']
    '''Monitor the directory and process incoming data'''
    def monitor_process_data(self):
        bili_data = self.streamingContext.textFileStream(self.monitor_directory)
        bili_data = bili_data.map(lambda line: (line.split(",")[1],
                                                line.split(",")[2],
                                                int(line.split(",")[3]),
                                                int(line.split(",")[4]),
                                                line.split(",")[5]
                                  ))
        bili_data.foreachRDD(self.process_data)
        self.streamingContext.start()
        self.streamingContext.awaitTerminationOrTimeout(2000)
    '''Process the data of each RDD'''
    def process_data(self, time, rdd):
        print("时间:" + str(time))
        if not rdd.isEmpty():
            try:
                spark = self.getSparkSessionInstance(rdd.context.getConf())
                rowRDD = rdd.map(lambda x: Row(tv=x[0], label=x[1], play=x[2],
                                               dm=x[3], month=x[4]))
                '''Convert to a DataFrame'''
                temp_bili_data = spark.createDataFrame(rowRDD)
                temp_bili_data.show()
                '''All data received so far'''
                if self.total_data is None:
                    self.total_data = temp_bili_data.toPandas()
                else:
                    self.total_data = pd.concat([self.total_data, temp_bili_data.toPandas()],
                                                axis=0, sort=True,
                                                ignore_index=True)
            except BaseException as e:
                print(e)
        '''Keep track of all data received so far'''
        if self.total_data is not None:
            '''Plays and danmaku counts per show, across all months'''
            tv_bili = self.total_data.groupby(['tv']).sum().reset_index()

            '''Store the all-month plays and danmaku counts in MongoDB'''
            tv_bili_json = self.db2Json(tv_bili)
            for tv_bili_temp_json in tv_bili_json:
                # print(tv_bili)
                query = {
                    'tv': tv_bili_temp_json['tv']
                }
                # print(query)
                self.insert_mongo('allmonth_play_dm', query, tv_bili_temp_json)

            '''Plays or danmaku counts per show for each month, sorted'''
            for tempmonth in self.months:
                '''Filter plays and danmaku for this month'''
                temp_month_tv = self.total_data[self.total_data['month'] == tempmonth]
            #     '''Aggregate by show name'''
                temp_month_tv_bymonth = temp_month_tv.groupby(["tv"]).sum().reset_index()
                if not temp_month_tv.empty:
                    '''groupby drops the month column, so add it back'''
                    temp_month_tv_bymonth['month'] = tempmonth
                    temp_month_tv_bymonth_json = self.db2Json(temp_month_tv_bymonth)
                    print(temp_month_tv_bymonth_json)
                    for x in temp_month_tv_bymonth_json:
                        query = {
                            'tv': x['tv'],
                            'month': tempmonth  # don't forget the month in the query condition
                        }
                        # print(x)
                        self.insert_mongo('bymonth_play_dm', query, x)
                if not temp_month_tv.empty:
                    '''Plays per show for this month'''
                    temp_month_tv_sort = temp_month_tv_bymonth.sort_values(by='play', ascending=False)
                    self.write_playOrdm_bymonth(temp_month_tv_sort, tempmonth, 'play')
                if not temp_month_tv.empty:
                    '''Danmaku counts per show for this month'''
                    temp_month_dm_sort = temp_month_tv_bymonth.sort_values(by='dm', ascending=False)
                    self.write_playOrdm_bymonth(temp_month_dm_sort, tempmonth, 'dm')
            self.write_playOrdm_Allmonth(tv_bili, 'play')
            self.write_playOrdm_Allmonth(tv_bili, 'dm')

    '''Convert a DataFrame to JSON records'''
    def db2Json(self, db):
        json_record = db.to_json(orient='records')
        return json.loads(json_record)

    '''Write each show's plays or danmaku counts for the given month'''
    def write_playOrdm_bymonth(self, data, month, dmorplay):
        filepath = self.writeDirectory + month + '月-' + dmorplay + ".txt"
        row = data.shape[0]
        with open(filepath, 'w') as w:
            for i in range(row):
                tempdata = data.iloc[i]
                # print(data[key] + str(data[value]))
                w.write(tempdata['tv'] + ":" + str(tempdata[dmorplay]) + "\n")

    '''Write each show's total plays or danmaku counts to a file'''
    def write_playOrdm_Allmonth(self, data, dmorPlay):
        filepath = self.writeDirectory + 'tv' + "- " + dmorPlay + ".txt"
        row = data.shape[0]
        with open(filepath, 'w') as w:
            for i in range(row):
                tempdata = data.iloc[i]
                # print(data[key] + str(data[value]))
                w.write(tempdata['tv'] + ":" + str(tempdata[dmorPlay]) + "\n")

    '''Insert data into MongoDB; update if it already exists'''
    def insert_mongo(self, collection, query, data):
        self.mongo.insertOrUpdate(collection, query, data)
Example #27
            temp[i].append(p.encode('utf-8', 'ignore'))
            temp[i].append(q)
            temp[i].append(r)
            i += 1
        i = 0
        for p in prediction.collect():
            temp[i].append(p)
            i += 1

        print(temp)
        for i in temp:

            insert_tweet(str(i[0]), str(i[1]), "0", int(i[3]), int(i[2]))
    else:
        print("Empty RDD !!!")
        pass


twitter = tweets.map(lambda tweet: tweet['user']['screen_name'])
tweet_text = tweets.map(lambda tweet: tweet['text'])

txt = tweets.map(lambda x: (x['text'], x['user']['screen_name'], x['id']))
txt.foreachRDD(process_data)

#text = tweet_text.map(lambda x: x.encode('utf-8','ignore'))
#text.foreachRDD(process_data)

ssc.start()
ssc.awaitTerminationOrTimeout(1000)
ssc.stop(stopGraceFully=True)
Example #28
sc.setCheckpointDir("/tmp")  # for stable state
ssc = StreamingContext(sc, 0.01)

rddQ = []
for filename in os.listdir("data/split"):
    rddQ.append(sc.textFile("data/split/" + filename))
# rddQ.append(sc.textFile("data/split/aa"))

result = []


def update_result(rdd):
    global result
    result = rdd.top(10)


# processing
dstream = ssc.queueStream(rddQ)
dstream = sclean(dstream)
dstream = scount(dstream)
dstream\
  .map(lambda x: (x[1],x[0]))\
  .foreachRDD(lambda rdd: update_result(rdd))

ssc.start()
ssc.awaitTerminationOrTimeout(30)
ssc.stop()

for (k, v) in result:
    print(str(k) + " " + str(v))
Example #29
dataFilePathOnHdfs = "hdfs://{}/btsdata/aviation/ontime/".format(master)

conf = SparkConf().setAppName(APP_NAME).setMaster('spark://{}:7077'.format(master))
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, STREAMING_INTERVAL)
ssc.checkpoint('/tmp/ccc')

lines = ssc.textFileStream(dataFilePathOnHdfs)



# 2nd argument (originAirport): 'SRQ', 'CMH', 'JFK', 'SEA', or 'BOS'
# map to (Carrier, Departure Delay); combineByKey accumulates (sum, count) per carrier
res2_2 = lines.map(lambda line: line.split(",")) \
              .filter(lambda line: line[6] == originAirport) \
              .map(lambda line: (line[7], float(line[12]))) \
              .combineByKey(lambda x: (x, 1),
                            lambda acc, y: (acc[0] + y, acc[1] + 1),
                            lambda a, b: (a[0] + b[0], a[1] + b[1])) \
              .map(lambda kv: (kv[0], kv[1][0] / kv[1][1])) \
              .transform(lambda rdd: rdd.sortByKey(ascending=True))

res2_2.pprint(10)

ssc.start()
while True:
    if ssc.awaitTerminationOrTimeout(10):
        break
print("Gracefully stopping Spark Streaming Application")
ssc.stop(stopSparkContext=True, stopGraceFully=True)
print("Application stopped")
Example #30
class BasicOperationTests(PySparkStreamingTestCase):

    def test_map(self):
        """Basic operation test for DStream.map."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.map(str)
        expected = [list(map(str, x)) for x in input]
        self._test_func(input, func, expected)

    def test_flatMap(self):
        """Basic operation test for DStream.flatMap."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.flatMap(lambda x: (x, x * 2))
        expected = [list(chain.from_iterable((map(lambda y: [y, y * 2], x))))
                    for x in input]
        self._test_func(input, func, expected)

    def test_filter(self):
        """Basic operation test for DStream.filter."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.filter(lambda x: x % 2 == 0)
        expected = [[y for y in x if y % 2 == 0] for x in input]
        self._test_func(input, func, expected)

    def test_count(self):
        """Basic operation test for DStream.count."""
        input = [range(5), range(10), range(20)]

        def func(dstream):
            return dstream.count()
        expected = [[len(x)] for x in input]
        self._test_func(input, func, expected)

    def test_slice(self):
        """Basic operation test for DStream.slice."""
        import datetime as dt
        self.ssc = StreamingContext(self.sc, 1.0)
        self.ssc.remember(4.0)
        input = [[1], [2], [3], [4]]
        stream = self.ssc.queueStream([self.sc.parallelize(d, 1) for d in input])

        time_vals = []

        def get_times(t, rdd):
            if rdd and len(time_vals) < len(input):
                time_vals.append(t)

        stream.foreachRDD(get_times)

        self.ssc.start()
        self.wait_for(time_vals, 4)
        begin_time = time_vals[0]

        def get_sliced(begin_delta, end_delta):
            begin = begin_time + dt.timedelta(seconds=begin_delta)
            end = begin_time + dt.timedelta(seconds=end_delta)
            rdds = stream.slice(begin, end)
            result_list = [rdd.collect() for rdd in rdds]
            return [r for result in result_list for r in result]

        self.assertEqual(set([1]), set(get_sliced(0, 0)))
        self.assertEqual(set([2, 3]), set(get_sliced(1, 2)))
        self.assertEqual(set([2, 3, 4]), set(get_sliced(1, 4)))
        self.assertEqual(set([1, 2, 3, 4]), set(get_sliced(0, 4)))

    def test_reduce(self):
        """Basic operation test for DStream.reduce."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.reduce(operator.add)
        expected = [[reduce(operator.add, x)] for x in input]
        self._test_func(input, func, expected)

    def test_reduceByKey(self):
        """Basic operation test for DStream.reduceByKey."""
        input = [[("a", 1), ("a", 1), ("b", 1), ("b", 1)],
                 [("", 1), ("", 1), ("", 1), ("", 1)],
                 [(1, 1), (1, 1), (2, 1), (2, 1), (3, 1)]]

        def func(dstream):
            return dstream.reduceByKey(operator.add)
        expected = [[("a", 2), ("b", 2)], [("", 4)], [(1, 2), (2, 2), (3, 1)]]
        self._test_func(input, func, expected, sort=True)

    def test_mapValues(self):
        """Basic operation test for DStream.mapValues."""
        input = [[("a", 2), ("b", 2), ("c", 1), ("d", 1)],
                 [(0, 4), (1, 1), (2, 2), (3, 3)],
                 [(1, 1), (2, 1), (3, 1), (4, 1)]]

        def func(dstream):
            return dstream.mapValues(lambda x: x + 10)
        expected = [[("a", 12), ("b", 12), ("c", 11), ("d", 11)],
                    [(0, 14), (1, 11), (2, 12), (3, 13)],
                    [(1, 11), (2, 11), (3, 11), (4, 11)]]
        self._test_func(input, func, expected, sort=True)

    def test_flatMapValues(self):
        """Basic operation test for DStream.flatMapValues."""
        input = [[("a", 2), ("b", 2), ("c", 1), ("d", 1)],
                 [(0, 4), (1, 1), (2, 1), (3, 1)],
                 [(1, 1), (2, 1), (3, 1), (4, 1)]]

        def func(dstream):
            return dstream.flatMapValues(lambda x: (x, x + 10))
        expected = [[("a", 2), ("a", 12), ("b", 2), ("b", 12),
                     ("c", 1), ("c", 11), ("d", 1), ("d", 11)],
                    [(0, 4), (0, 14), (1, 1), (1, 11), (2, 1), (2, 11), (3, 1), (3, 11)],
                    [(1, 1), (1, 11), (2, 1), (2, 11), (3, 1), (3, 11), (4, 1), (4, 11)]]
        self._test_func(input, func, expected)

    def test_glom(self):
        """Basic operation test for DStream.glom."""
        input = [range(1, 5), range(5, 9), range(9, 13)]
        rdds = [self.sc.parallelize(r, 2) for r in input]

        def func(dstream):
            return dstream.glom()
        expected = [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]]
        self._test_func(rdds, func, expected)

    def test_mapPartitions(self):
        """Basic operation test for DStream.mapPartitions."""
        input = [range(1, 5), range(5, 9), range(9, 13)]
        rdds = [self.sc.parallelize(r, 2) for r in input]

        def func(dstream):
            def f(iterator):
                yield sum(iterator)
            return dstream.mapPartitions(f)
        expected = [[3, 7], [11, 15], [19, 23]]
        self._test_func(rdds, func, expected)

    def test_countByValue(self):
        """Basic operation test for DStream.countByValue."""
        input = [list(range(1, 5)) * 2, list(range(5, 7)) + list(range(5, 9)), ["a", "a", "b", ""]]

        def func(dstream):
            return dstream.countByValue()
        expected = [[(1, 2), (2, 2), (3, 2), (4, 2)],
                    [(5, 2), (6, 2), (7, 1), (8, 1)],
                    [("a", 2), ("b", 1), ("", 1)]]
        self._test_func(input, func, expected, sort=True)

    def test_groupByKey(self):
        """Basic operation test for DStream.groupByKey."""
        input = [[(1, 1), (2, 1), (3, 1), (4, 1)],
                 [(1, 1), (1, 1), (1, 1), (2, 1), (2, 1), (3, 1)],
                 [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1), ("", 1)]]

        def func(dstream):
            return dstream.groupByKey().mapValues(list)

        expected = [[(1, [1]), (2, [1]), (3, [1]), (4, [1])],
                    [(1, [1, 1, 1]), (2, [1, 1]), (3, [1])],
                    [("a", [1, 1]), ("b", [1]), ("", [1, 1, 1])]]
        self._test_func(input, func, expected, sort=True)

    def test_combineByKey(self):
        """Basic operation test for DStream.combineByKey."""
        input = [[(1, 1), (2, 1), (3, 1), (4, 1)],
                 [(1, 1), (1, 1), (1, 1), (2, 1), (2, 1), (3, 1)],
                 [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1), ("", 1)]]

        def func(dstream):
            def add(a, b):
                return a + str(b)
            return dstream.combineByKey(str, add, add)
        expected = [[(1, "1"), (2, "1"), (3, "1"), (4, "1")],
                    [(1, "111"), (2, "11"), (3, "1")],
                    [("a", "11"), ("b", "1"), ("", "111")]]
        self._test_func(input, func, expected, sort=True)

    def test_repartition(self):
        input = [range(1, 5), range(5, 9)]
        rdds = [self.sc.parallelize(r, 2) for r in input]

        def func(dstream):
            return dstream.repartition(1).glom()
        expected = [[[1, 2, 3, 4]], [[5, 6, 7, 8]]]
        self._test_func(rdds, func, expected)

    def test_union(self):
        input1 = [range(3), range(5), range(6)]
        input2 = [range(3, 6), range(5, 6)]

        def func(d1, d2):
            return d1.union(d2)

        expected = [list(range(6)), list(range(6)), list(range(6))]
        self._test_func(input1, func, expected, input2=input2)

    def test_cogroup(self):
        input = [[(1, 1), (2, 1), (3, 1)],
                 [(1, 1), (1, 1), (1, 1), (2, 1)],
                 [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1)]]
        input2 = [[(1, 2)],
                  [(4, 1)],
                  [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 2)]]

        def func(d1, d2):
            return d1.cogroup(d2).mapValues(lambda vs: tuple(map(list, vs)))

        expected = [[(1, ([1], [2])), (2, ([1], [])), (3, ([1], []))],
                    [(1, ([1, 1, 1], [])), (2, ([1], [])), (4, ([], [1]))],
                    [("a", ([1, 1], [1, 1])), ("b", ([1], [1])), ("", ([1, 1], [1, 2]))]]
        self._test_func(input, func, expected, sort=True, input2=input2)

    def test_join(self):
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.join(b)

        expected = [[('b', (2, 3))]]
        self._test_func(input, func, expected, True, input2)

    def test_left_outer_join(self):
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.leftOuterJoin(b)

        expected = [[('a', (1, None)), ('b', (2, 3))]]
        self._test_func(input, func, expected, True, input2)

    def test_right_outer_join(self):
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.rightOuterJoin(b)

        expected = [[('b', (2, 3)), ('c', (None, 4))]]
        self._test_func(input, func, expected, True, input2)

    def test_full_outer_join(self):
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.fullOuterJoin(b)

        expected = [[('a', (1, None)), ('b', (2, 3)), ('c', (None, 4))]]
        self._test_func(input, func, expected, True, input2)

    def test_update_state_by_key(self):

        def updater(vs, s):
            if not s:
                s = []
            s.extend(vs)
            return s

        input = [[('k', i)] for i in range(5)]

        def func(dstream):
            return dstream.updateStateByKey(updater)

        expected = [[0], [0, 1], [0, 1, 2], [0, 1, 2, 3], [0, 1, 2, 3, 4]]
        expected = [[('k', v)] for v in expected]
        self._test_func(input, func, expected)

    def test_update_state_by_key_initial_rdd(self):

        def updater(vs, s):
            if not s:
                s = []
            s.extend(vs)
            return s

        initial = [('k', [0, 1])]
        initial = self.sc.parallelize(initial, 1)

        input = [[('k', i)] for i in range(2, 5)]

        def func(dstream):
            return dstream.updateStateByKey(updater, initialRDD=initial)

        expected = [[0, 1, 2], [0, 1, 2, 3], [0, 1, 2, 3, 4]]
        expected = [[('k', v)] for v in expected]
        self._test_func(input, func, expected)

    def test_failed_func(self):
        # Test failure in
        # TransformFunction.apply(rdd: Option[RDD[_]], time: Time)
        input = [self.sc.parallelize([d], 1) for d in range(4)]
        input_stream = self.ssc.queueStream(input)

        def failed_func(i):
            raise ValueError("This is a special error")

        input_stream.map(failed_func).pprint()
        self.ssc.start()
        try:
            self.ssc.awaitTerminationOrTimeout(10)
        except:
            import traceback
            failure = traceback.format_exc()
            self.assertTrue("This is a special error" in failure)
            return

        self.fail("a failed func should throw an error")

    def test_failed_func2(self):
        # Test failure in
        # TransformFunction.apply(rdd: Option[RDD[_]], rdd2: Option[RDD[_]], time: Time)
        input = [self.sc.parallelize([d], 1) for d in range(4)]
        input_stream1 = self.ssc.queueStream(input)
        input_stream2 = self.ssc.queueStream(input)

        def failed_func(rdd1, rdd2):
            raise ValueError("This is a special error")

        input_stream1.transformWith(failed_func, input_stream2, True).pprint()
        self.ssc.start()
        try:
            self.ssc.awaitTerminationOrTimeout(10)
        except:
            import traceback
            failure = traceback.format_exc()
            self.assertTrue("This is a special error" in failure)
            return

        self.fail("a failed func should throw an error")

    def test_failed_func_with_reseting_failure(self):
        input = [self.sc.parallelize([d], 1) for d in range(4)]
        input_stream = self.ssc.queueStream(input)

        def failed_func(i):
            if i == 1:
                # Make it fail in the second batch
                raise ValueError("This is a special error")
            else:
                return i

        # We should be able to see the results of the 3rd and 4th batches even if the second batch
        # fails
        expected = [[0], [2], [3]]
        self.assertEqual(expected, self._collect(input_stream.map(failed_func), 3))
        try:
            self.ssc.awaitTerminationOrTimeout(10)
        except:
            import traceback
            failure = traceback.format_exc()
            self.assertTrue("This is a special error" in failure)
            return

        self.fail("a failed func should throw an error")
Example #31
                  
    airportAirports.checkpoint(60)
    airportAirports.foreachRDD(outputQ2N2)

    carriersA2A.checkpoint(60)
    carriersA2A.foreachRDD(outputQ2N3)

    topHopFlights.checkpoint(60)
    topHopFlights.foreachRDD(outputQ3N2)
    
    print("STARTED!")
    ssc.start()
    runStatus = 1
    
    while True:
        res = ssc.awaitTerminationOrTimeout(10) # 10 seconds timeout
        if dataSaved1 and dataSaved2 and dataSaved3 and dataSaved4 and dataSaved5 and dataSaved6:
            runStatus = 0
        if res:
            # stopped elsewhere
            break
        else:
            # still running
            timerCount += 1
            print("still running...%d" % timerCount)

            if runStatus == 0:
                print("Finished saving data. Stopping streaming...")
                ssc.stop(stopSparkContext=True, stopGraceFully=True)
                break
            
Example #32
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SparkSession
import functions as app

# Spark Context
conf = SparkConf().setMaster('local[2]').setAppName('ApacheWebLogsStream')
sc = SparkContext(conf=conf)
sc.setLogLevel('OFF')

# Spark Streaming Context
ssc = StreamingContext(sparkContext=sc, batchDuration=10)
input_stream = ssc.socketTextStream(hostname='localhost', port=9999)

# Process stream
input_stream.foreachRDD(app.process)

ssc.start()
ssc.awaitTerminationOrTimeout(timeout=900)
ssc.stop()
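
# The `functions` module imported above as `app` is not included in this example.
# Below is a minimal, hypothetical sketch of what its `process` callback could look
# like; the access-log field positions, the temp-view name, and the SQL query are
# assumptions, not part of the original project.

# functions.py -- hypothetical companion module for the snippet above
from pyspark.sql import Row, SparkSession

def process(rdd):
    # Skip empty micro-batches
    if rdd.isEmpty():
        return
    spark = SparkSession.builder.getOrCreate()
    # Assumed layout: space-separated Apache access-log fields
    rows = (rdd.map(lambda line: line.split(" "))
               .filter(lambda parts: len(parts) > 6)
               .map(lambda p: Row(host=p[0], method=p[5].lstrip('"'), url=p[6])))
    logs = spark.createDataFrame(rows)
    logs.createOrReplaceTempView("weblogs")
    spark.sql("SELECT url, COUNT(*) AS hits FROM weblogs "
              "GROUP BY url ORDER BY hits DESC").show(10, truncate=False)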
Example #33
    #datamap = tx_fee_rdd.map(lambda x: ("tx_fee", x))
    # record shape: (rowkey, [row key, column family, column name, value])
    datamap = tx_fee_rdd.map(
        lambda x: (str(x[0]), [str(x[0]), "tx_fee_col", "tx_fee", str(x[1])]))

    datamap.saveAsNewAPIHadoopDataset(conf=conf,
                                      keyConverter=keyConv,
                                      valueConverter=valueConv)


lines = ssc.socketTextStream("localhost", 8888)
dump_rdd = lines.map(lambda x: json.dumps(x))
load_rdd = dump_rdd.map(lambda x: json.loads(x)).map(lambda x : x.decode('unicode_escape').encode('ascii','ignore'))
#load_rdd.pprint(2)

split_blk_rdd = load_rdd.map(lambda x: x.split(":"))
#split_blk_rdd.pprint()

tx_fee_rdd = split_blk_rdd.map(lambda x : (x[14][1:7],x[15][1:-15])) #this gets transaction fee
#tx_fee_rdd.pprint(200)		#works
tx_fee_rdd.foreachRDD(SaveRecord)		#function call




ssc.start()             # Start the computation
#ssc.awaitTermination()  # Wait for the computation to terminate
ssc.awaitTerminationOrTimeout(15000)  # time out after 15000 seconds (~4.2 hours)
#ssc.stop()  # Stop the computation
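
# This excerpt starts mid-file, so `ssc`, `conf`, `keyConv`, and `valueConv` are
# defined elsewhere. A plausible setup, modelled on the standard Spark HBase output
# example, is sketched below; the table name, ZooKeeper quorum, and batch interval
# are assumptions.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName="TxFeeToHBase")
ssc = StreamingContext(sc, 10)  # assumed 10-second batch interval

conf = {
    "hbase.zookeeper.quorum": "localhost",
    "hbase.mapred.outputtable": "tx_fee",  # assumed HBase table name
    "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
    "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
    "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable",
}
# Converters shipped with the Spark examples jar
keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"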
Example #34
File: task.py Project: whisk/ccc
    dstream = dstream.flatMap(extract_origin_destination_dep_delay) \
        .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])) \
        .foreachRDD(top_complex_average)
elif args.task == 'q23':
    dstream = dstream.flatMap(extract_route_carrier_arr_delay) \
        .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])) \
        .foreachRDD(top_complex_average)
elif args.task == 'q24':
    dstream = dstream.flatMap(extract_route_arr_delay) \
        .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])) \
        .foreachRDD(top_average)
elif args.task == 'q32':
    get_cass().execute('truncate %s' % schema['table'])
    dstream = dstream.flatMap(extract_trip_info).foreachRDD(save_trip)
else:
    print("Unknown task")

# runner
ts_last_data = time.time()
ssc.start()
while True:
    res = ssc.awaitTerminationOrTimeout(args.run_interval)
    if res:
        # stopped elsewhere
        break
    else:
        # still running
        if time.time() - ts_last_data > args.idle_time:
            dump("No data received for %d seconds, stopping..." %
                 args.idle_time)
            ssc.stop(stopSparkContext=True, stopGraceFully=False)
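
# The helpers referenced above (`get_cass`, `dump`, plus the various `extract_*` and
# `top_*` callbacks) live elsewhere in the project. A minimal, hypothetical sketch of
# the two infrastructure helpers, assuming the DataStax `cassandra-driver` and a local
# cluster, could look like this.
import sys
import time
from cassandra.cluster import Cluster

_session = None

def get_cass():
    """Return a cached Cassandra session (contact point and keyspace are assumptions)."""
    global _session
    if _session is None:
        _session = Cluster(["127.0.0.1"]).connect("ccc_keyspace")
    return _session

def dump(msg):
    """Write a timestamped progress message to stderr."""
    sys.stderr.write("[%s] %s\n" % (time.strftime("%H:%M:%S"), msg))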
Example #35
class BasicOperationTests(PySparkStreamingTestCase):

    def test_map(self):
        """Basic operation test for DStream.map."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.map(str)
        expected = [list(map(str, x)) for x in input]
        self._test_func(input, func, expected)

    def test_flatMap(self):
        """Basic operation test for DStream.flatMap."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.flatMap(lambda x: (x, x * 2))
        expected = [list(chain.from_iterable((map(lambda y: [y, y * 2], x))))
                    for x in input]
        self._test_func(input, func, expected)

    def test_filter(self):
        """Basic operation test for DStream.filter."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.filter(lambda x: x % 2 == 0)
        expected = [[y for y in x if y % 2 == 0] for x in input]
        self._test_func(input, func, expected)

    def test_count(self):
        """Basic operation test for DStream.count."""
        input = [range(5), range(10), range(20)]

        def func(dstream):
            return dstream.count()
        expected = [[len(x)] for x in input]
        self._test_func(input, func, expected)

    def test_slice(self):
        """Basic operation test for DStream.slice."""
        import datetime as dt
        self.ssc = StreamingContext(self.sc, 1.0)
        self.ssc.remember(4.0)
        input = [[1], [2], [3], [4]]
        stream = self.ssc.queueStream([self.sc.parallelize(d, 1) for d in input])

        time_vals = []

        def get_times(t, rdd):
            if rdd and len(time_vals) < len(input):
                time_vals.append(t)

        stream.foreachRDD(get_times)

        self.ssc.start()
        self.wait_for(time_vals, 4)
        begin_time = time_vals[0]

        def get_sliced(begin_delta, end_delta):
            begin = begin_time + dt.timedelta(seconds=begin_delta)
            end = begin_time + dt.timedelta(seconds=end_delta)
            rdds = stream.slice(begin, end)
            result_list = [rdd.collect() for rdd in rdds]
            return [r for result in result_list for r in result]

        self.assertEqual(set([1]), set(get_sliced(0, 0)))
        self.assertEqual(set([2, 3]), set(get_sliced(1, 2)))
        self.assertEqual(set([2, 3, 4]), set(get_sliced(1, 4)))
        self.assertEqual(set([1, 2, 3, 4]), set(get_sliced(0, 4)))

    def test_reduce(self):
        """Basic operation test for DStream.reduce."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.reduce(operator.add)
        expected = [[reduce(operator.add, x)] for x in input]
        self._test_func(input, func, expected)

    def test_reduceByKey(self):
        """Basic operation test for DStream.reduceByKey."""
        input = [[("a", 1), ("a", 1), ("b", 1), ("b", 1)],
                 [("", 1), ("", 1), ("", 1), ("", 1)],
                 [(1, 1), (1, 1), (2, 1), (2, 1), (3, 1)]]

        def func(dstream):
            return dstream.reduceByKey(operator.add)
        expected = [[("a", 2), ("b", 2)], [("", 4)], [(1, 2), (2, 2), (3, 1)]]
        self._test_func(input, func, expected, sort=True)

    def test_mapValues(self):
        """Basic operation test for DStream.mapValues."""
        input = [[("a", 2), ("b", 2), ("c", 1), ("d", 1)],
                 [(0, 4), (1, 1), (2, 2), (3, 3)],
                 [(1, 1), (2, 1), (3, 1), (4, 1)]]

        def func(dstream):
            return dstream.mapValues(lambda x: x + 10)
        expected = [[("a", 12), ("b", 12), ("c", 11), ("d", 11)],
                    [(0, 14), (1, 11), (2, 12), (3, 13)],
                    [(1, 11), (2, 11), (3, 11), (4, 11)]]
        self._test_func(input, func, expected, sort=True)

    def test_flatMapValues(self):
        """Basic operation test for DStream.flatMapValues."""
        input = [[("a", 2), ("b", 2), ("c", 1), ("d", 1)],
                 [(0, 4), (1, 1), (2, 1), (3, 1)],
                 [(1, 1), (2, 1), (3, 1), (4, 1)]]

        def func(dstream):
            return dstream.flatMapValues(lambda x: (x, x + 10))
        expected = [[("a", 2), ("a", 12), ("b", 2), ("b", 12),
                     ("c", 1), ("c", 11), ("d", 1), ("d", 11)],
                    [(0, 4), (0, 14), (1, 1), (1, 11), (2, 1), (2, 11), (3, 1), (3, 11)],
                    [(1, 1), (1, 11), (2, 1), (2, 11), (3, 1), (3, 11), (4, 1), (4, 11)]]
        self._test_func(input, func, expected)

    def test_glom(self):
        """Basic operation test for DStream.glom."""
        input = [range(1, 5), range(5, 9), range(9, 13)]
        rdds = [self.sc.parallelize(r, 2) for r in input]

        def func(dstream):
            return dstream.glom()
        expected = [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]]
        self._test_func(rdds, func, expected)

    def test_mapPartitions(self):
        """Basic operation test for DStream.mapPartitions."""
        input = [range(1, 5), range(5, 9), range(9, 13)]
        rdds = [self.sc.parallelize(r, 2) for r in input]

        def func(dstream):
            def f(iterator):
                yield sum(iterator)
            return dstream.mapPartitions(f)
        expected = [[3, 7], [11, 15], [19, 23]]
        self._test_func(rdds, func, expected)

    def test_countByValue(self):
        """Basic operation test for DStream.countByValue."""
        input = [list(range(1, 5)) * 2, list(range(5, 7)) + list(range(5, 9)), ["a", "a", "b", ""]]

        def func(dstream):
            return dstream.countByValue()
        expected = [[(1, 2), (2, 2), (3, 2), (4, 2)],
                    [(5, 2), (6, 2), (7, 1), (8, 1)],
                    [("a", 2), ("b", 1), ("", 1)]]
        self._test_func(input, func, expected, sort=True)

    def test_groupByKey(self):
        """Basic operation test for DStream.groupByKey."""
        input = [[(1, 1), (2, 1), (3, 1), (4, 1)],
                 [(1, 1), (1, 1), (1, 1), (2, 1), (2, 1), (3, 1)],
                 [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1), ("", 1)]]

        def func(dstream):
            return dstream.groupByKey().mapValues(list)

        expected = [[(1, [1]), (2, [1]), (3, [1]), (4, [1])],
                    [(1, [1, 1, 1]), (2, [1, 1]), (3, [1])],
                    [("a", [1, 1]), ("b", [1]), ("", [1, 1, 1])]]
        self._test_func(input, func, expected, sort=True)

    def test_combineByKey(self):
        """Basic operation test for DStream.combineByKey."""
        input = [[(1, 1), (2, 1), (3, 1), (4, 1)],
                 [(1, 1), (1, 1), (1, 1), (2, 1), (2, 1), (3, 1)],
                 [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1), ("", 1)]]

        def func(dstream):
            def add(a, b):
                return a + str(b)
            return dstream.combineByKey(str, add, add)
        expected = [[(1, "1"), (2, "1"), (3, "1"), (4, "1")],
                    [(1, "111"), (2, "11"), (3, "1")],
                    [("a", "11"), ("b", "1"), ("", "111")]]
        self._test_func(input, func, expected, sort=True)

    def test_repartition(self):
        input = [range(1, 5), range(5, 9)]
        rdds = [self.sc.parallelize(r, 2) for r in input]

        def func(dstream):
            return dstream.repartition(1).glom()
        expected = [[[1, 2, 3, 4]], [[5, 6, 7, 8]]]
        self._test_func(rdds, func, expected)

    def test_union(self):
        input1 = [range(3), range(5), range(6)]
        input2 = [range(3, 6), range(5, 6)]

        def func(d1, d2):
            return d1.union(d2)

        expected = [list(range(6)), list(range(6)), list(range(6))]
        self._test_func(input1, func, expected, input2=input2)

    def test_cogroup(self):
        input = [[(1, 1), (2, 1), (3, 1)],
                 [(1, 1), (1, 1), (1, 1), (2, 1)],
                 [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1)]]
        input2 = [[(1, 2)],
                  [(4, 1)],
                  [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 2)]]

        def func(d1, d2):
            return d1.cogroup(d2).mapValues(lambda vs: tuple(map(list, vs)))

        expected = [[(1, ([1], [2])), (2, ([1], [])), (3, ([1], []))],
                    [(1, ([1, 1, 1], [])), (2, ([1], [])), (4, ([], [1]))],
                    [("a", ([1, 1], [1, 1])), ("b", ([1], [1])), ("", ([1, 1], [1, 2]))]]
        self._test_func(input, func, expected, sort=True, input2=input2)

    def test_join(self):
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.join(b)

        expected = [[('b', (2, 3))]]
        self._test_func(input, func, expected, True, input2)

    def test_left_outer_join(self):
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.leftOuterJoin(b)

        expected = [[('a', (1, None)), ('b', (2, 3))]]
        self._test_func(input, func, expected, True, input2)

    def test_right_outer_join(self):
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.rightOuterJoin(b)

        expected = [[('b', (2, 3)), ('c', (None, 4))]]
        self._test_func(input, func, expected, True, input2)

    def test_full_outer_join(self):
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.fullOuterJoin(b)

        expected = [[('a', (1, None)), ('b', (2, 3)), ('c', (None, 4))]]
        self._test_func(input, func, expected, True, input2)

    def test_update_state_by_key(self):

        def updater(vs, s):
            if not s:
                s = []
            s.extend(vs)
            return s

        input = [[('k', i)] for i in range(5)]

        def func(dstream):
            return dstream.updateStateByKey(updater)

        expected = [[0], [0, 1], [0, 1, 2], [0, 1, 2, 3], [0, 1, 2, 3, 4]]
        expected = [[('k', v)] for v in expected]
        self._test_func(input, func, expected)

    def test_update_state_by_key_initial_rdd(self):

        def updater(vs, s):
            if not s:
                s = []
            s.extend(vs)
            return s

        initial = [('k', [0, 1])]
        initial = self.sc.parallelize(initial, 1)

        input = [[('k', i)] for i in range(2, 5)]

        def func(dstream):
            return dstream.updateStateByKey(updater, initialRDD=initial)

        expected = [[0, 1, 2], [0, 1, 2, 3], [0, 1, 2, 3, 4]]
        expected = [[('k', v)] for v in expected]
        self._test_func(input, func, expected)

    def test_failed_func(self):
        # Test failure in
        # TransformFunction.apply(rdd: Option[RDD[_]], time: Time)
        input = [self.sc.parallelize([d], 1) for d in range(4)]
        input_stream = self.ssc.queueStream(input)

        def failed_func(i):
            raise ValueError("This is a special error")

        input_stream.map(failed_func).pprint()
        self.ssc.start()
        try:
            self.ssc.awaitTerminationOrTimeout(10)
        except:
            import traceback
            failure = traceback.format_exc()
            self.assertTrue("This is a special error" in failure)
            return

        self.fail("a failed func should throw an error")

    def test_failed_func2(self):
        # Test failure in
        # TransformFunction.apply(rdd: Option[RDD[_]], rdd2: Option[RDD[_]], time: Time)
        input = [self.sc.parallelize([d], 1) for d in range(4)]
        input_stream1 = self.ssc.queueStream(input)
        input_stream2 = self.ssc.queueStream(input)

        def failed_func(rdd1, rdd2):
            raise ValueError("This is a special error")

        input_stream1.transformWith(failed_func, input_stream2, True).pprint()
        self.ssc.start()
        try:
            self.ssc.awaitTerminationOrTimeout(10)
        except:
            import traceback
            failure = traceback.format_exc()
            self.assertTrue("This is a special error" in failure)
            return

        self.fail("a failed func should throw an error")

    def test_failed_func_with_reseting_failure(self):
        input = [self.sc.parallelize([d], 1) for d in range(4)]
        input_stream = self.ssc.queueStream(input)

        def failed_func(i):
            if i == 1:
                # Make it fail in the second batch
                raise ValueError("This is a special error")
            else:
                return i

        # We should be able to see the results of the 3rd and 4th batches even if the second batch
        # fails
        expected = [[0], [2], [3]]
        self.assertEqual(expected, self._collect(input_stream.map(failed_func), 3))
        try:
            self.ssc.awaitTerminationOrTimeout(10)
        except:
            import traceback
            failure = traceback.format_exc()
            self.assertTrue("This is a special error" in failure)
            return

        self.fail("a failed func should throw an error")
Example #36
class StreamingContextTests(PySparkStreamingTestCase):

    duration = 0.1
    setupCalled = False

    def _add_input_stream(self):
        inputs = [range(1, x) for x in range(101)]
        stream = self.ssc.queueStream(inputs)
        self._collect(stream, 1, block=False)

    def test_stop_only_streaming_context(self):
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5)

    def test_stop_multiple_times(self):
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.ssc.stop(False)

    def test_queue_stream(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        result = self._collect(dstream, 3)
        self.assertEqual(input, result)

    def test_text_file_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream2 = self.ssc.textFileStream(d).map(int)
        result = self._collect(dstream2, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "w") as f:
                f.writelines(["%d\n" % i for i in range(10)])
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], result)

    def test_binary_records_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream = self.ssc.binaryRecordsStream(d, 10).map(
            lambda v: struct.unpack("10b", bytes(v)))
        result = self._collect(dstream, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "wb") as f:
                f.write(bytearray(range(10)))
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], [list(v[0]) for v in result])

    def test_union(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        dstream2 = self.ssc.queueStream(input)
        dstream3 = self.ssc.union(dstream, dstream2)
        result = self._collect(dstream3, 3)
        expected = [i * 2 for i in input]
        self.assertEqual(expected, result)

    def test_transform(self):
        dstream1 = self.ssc.queueStream([[1]])
        dstream2 = self.ssc.queueStream([[2]])
        dstream3 = self.ssc.queueStream([[3]])

        def func(rdds):
            rdd1, rdd2, rdd3 = rdds
            return rdd2.union(rdd3).union(rdd1)

        dstream = self.ssc.transform([dstream1, dstream2, dstream3], func)

        self.assertEqual([2, 3, 1], self._take(dstream, 3))

    def test_transform_pairrdd(self):
        # This regression test case is for SPARK-17756.
        dstream = self.ssc.queueStream(
            [[1], [2], [3]]).transform(lambda rdd: rdd.cartesian(rdd))
        self.assertEqual([(1, 1), (2, 2), (3, 3)], self._take(dstream, 3))

    def test_get_active(self):
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that getActive() returns the active context
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)

        # Verify that getActive() returns None
        self.ssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = StreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

    def test_get_active_or_create(self):
        # Test StreamingContext.getActiveOrCreate() without checkpoint data
        # See CheckpointTests for tests with checkpoint data
        self.ssc = None
        self.assertEqual(StreamingContext.getActive(), None)

        def setupFunc():
            ssc = StreamingContext(self.sc, self.duration)
            ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
            self.setupCalled = True
            return ssc

        # Verify that getActiveOrCreate() (w/o checkpoint) calls setupFunc when no context is active
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that getActiveOrCreate() returns active context and does not call the setupFunc
        self.ssc.start()
        self.setupCalled = False
        self.assertEqual(StreamingContext.getActiveOrCreate(None, setupFunc), self.ssc)
        self.assertFalse(self.setupCalled)

        # Verify that getActiveOrCreate() calls setupFunc after active context is stopped
        self.ssc.stop(False)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = StreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

    def test_await_termination_or_timeout(self):
        self._add_input_stream()
        self.ssc.start()
        self.assertFalse(self.ssc.awaitTerminationOrTimeout(0.001))
        self.ssc.stop(False)
        self.assertTrue(self.ssc.awaitTerminationOrTimeout(0.001))
Example #37
	#print gen_tx_json
	return gen_tx_json


#get lines RDD
lines = ssc.socketTextStream("localhost", 9999)
dump_rdd = lines.map(lambda x: json.dumps(x))
#print dump_rdd.take(2)
load_rdd = dump_rdd.map(lambda x: json.loads(x)).map(lambda x : x.decode('unicode_escape').encode('ascii','ignore'))
#print load_rdd.take(2)

#load_rdd.pprint(100)
#tx = load_rdd.flatMap(lambda x: x.split(":")) #this works
split_blk_rdd = load_rdd.map(lambda x: x.split(":"))
#split_blk_rdd.pprint()

gen_tx_rdd = split_blk_rdd.map(lambda x : (x[8][1:7],x[6][4:68]) ) #this gets generation transactions
#gen_tx_rdd.pprint()		#works

tx_json_rdd = gen_tx_rdd.map(lambda x: (x[0],get_tx_fee(x[1])) )	#function call			  
tx_fee_rdd = tx_json_rdd.map(
    lambda x: (x[0], x[1].items()[3][1][0]["value"] - 25))  #.filter(lambda x: "value" in x)

tx_fee_rdd.foreachRDD(SaveRecord)		#function call


ssc.start()             # Start the computation
#ssc.awaitTermination()  # Wait for the computation to terminate
ssc.awaitTerminationOrTimeout(12000)  # time out after 12000 seconds (~3.3 hours)
#ssc.stop()  # Stop the computation
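
# `SaveRecord` and `get_tx_fee` are defined outside the visible portion of this
# excerpt (only the last lines of `get_tx_fee` appear at the top, and a `SaveRecord`
# with the same shape is shown in Example #33). As a rough sketch, `get_tx_fee` might
# fetch the raw generation transaction as JSON; the blockchain.info endpoint below is
# an assumption, and the code stays in Python 2 like the surrounding snippets.
import json
import urllib2

def get_tx_fee(tx_hash):
    """Fetch the raw transaction for tx_hash and return it as a dict."""
    url = "https://blockchain.info/rawtx/%s" % tx_hash
    gen_tx_json = json.loads(urllib2.urlopen(url).read())
    #print gen_tx_json
    return gen_tx_json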