Example #1
18
def main(ssc):
    zkQuorum, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a+b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()
Example #2
2
def stream(ssc):

    zkQuorum = "localhost:2181"
    topic = "topic1"
    tweets = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    kstream = KafkaUtils.createDirectStream(ssc, topics = ['topic1'], kafkaParams = {"metadata.broker.list":"localhost:9092"})

    tweets = tweets.map(lambda x: x[1].encode("ascii","ignore"))
    return tweets
def ss_direct_kafka_bucket_counter(brokers, topic, bucket_interval, output_msg, message_parse, valueDecoder=None):
    """Starts a Spark Streaming job from a Kafka input and parses message time

	WARNING!! This function only works for spark 1.4.0+ 

	Args:
		brokers: the kafka broker that we look at for the topic
		topic: the kafka topic for input
		timeinterval: the time interval in seconds (int) that the job will 
			bucket

	Returns:
		None
		
	"""
    sc = SparkContext(appName="PythonKafkaBucketCounter")
    ssc = StreamingContext(sc, timeinterval + 5)

    if valueDecoder:
        kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers}, valueDecoder=valueDecoder)
    else:
        kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})

    lines = kvs.map(lambda x: x[1])
    interval_counts = lines.map(lambda line: (message_parse(line), 1)).reduceByKey(lambda a, b: a + b)

    output_msg_func = output_msg(sc, ssc)

    interval_counts.foreachRDD(output_msg_func)

    ssc.start()
    ssc.awaitTermination()
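
# A hypothetical invocation of ss_direct_kafka_bucket_counter; the message_parse
# and output_msg helpers below are illustrative sketches, not part of the original module.
def message_parse(line):
    # bucket by the minute portion of a timestamp at the start of the message
    return line.split(",")[0][:16]

def output_msg(sc, ssc):
    def emit(time, rdd):
        for bucket, n in rdd.collect():
            print("%s: %s -> %d" % (time, bucket, n))
    return emit

# ss_direct_kafka_bucket_counter("localhost:9092", "events", 60, output_msg, message_parse)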
    def get_kafka_stream(topic, streaming_context):
        offset_specifications = simport.load(cfg.CONF.repositories.offsets)()
        app_name = streaming_context.sparkContext.appName
        saved_offset_spec = offset_specifications.get_kafka_offsets(app_name)
        if len(saved_offset_spec) < 1:

            MonMetricsKafkaProcessor.log_debug(
                "No saved offsets available..."
                "connecting to kafka without specifying offsets")
            kvs = KafkaUtils.createDirectStream(
                streaming_context, [topic],
                {"metadata.broker.list": cfg.CONF.messaging.brokers})

            return kvs

        else:
            from_offsets = {}
            for key, value in saved_offset_spec.items():
                if key.startswith("%s_%s" % (app_name, topic)):
                    # spec_app_name = value.get_app_name()
                    spec_topic = value.get_topic()
                    spec_partition = int(value.get_partition())
                    # spec_from_offset = value.get_from_offset()
                    spec_until_offset = value.get_until_offset()
                    # composite_key = "%s_%s_%s" % (spec_app_name,
                    #                               spec_topic,
                    #                               spec_partition)
                    # partition = saved_offset_spec[composite_key]
                    from_offsets[
                        TopicAndPartition(spec_topic, spec_partition)
                    ] = long(spec_until_offset)

            MonMetricsKafkaProcessor.log_debug(
                "get_kafka_stream: calling createDirectStream :"
                " topic:{%s} : start " % topic)
            for key, value in from_offsets.items():
                MonMetricsKafkaProcessor.log_debug(
                    "get_kafka_stream: calling createDirectStream : "
                    "offsets : TopicAndPartition:{%s,%s}, value:{%s}" %
                    (str(key._topic), str(key._partition), str(value)))
            MonMetricsKafkaProcessor.log_debug(
                "get_kafka_stream: calling createDirectStream : "
                "topic:{%s} : done" % topic)

            kvs = KafkaUtils.createDirectStream(
                streaming_context, [topic],
                {"metadata.broker.list": cfg.CONF.messaging.brokers},
                from_offsets)
            return kvs
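
# The direct stream built above exposes the Kafka offsets of each batch via
# rdd.offsetRanges(); a minimal sketch of persisting them per batch is shown below.
# save_offset(topic, partition, until_offset) is a caller-supplied persistence hook,
# not part of the original offsets repository API.
def persist_offsets_per_batch(kvs, save_offset):
    def note_offsets(rdd):
        for o in rdd.offsetRanges():
            save_offset(o.topic, o.partition, o.untilOffset)
        return rdd
    # transform() is used so offsets are read before any downstream repartitioning
    kvs.transform(note_offsets).foreachRDD(lambda rdd: None)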
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'})

    tweets = kstream.map(lambda x: x[1].encode("ascii","ignore"))

    pword_rdd=tweets.flatMap(lambda line: line.split(" ")).map(lambda word: ("positive",1) if word in pwords else ("positive",0)).reduceByKey(lambda a,b:a+b)
    nword_rdd=tweets.flatMap(lambda line: line.split(" ")).map(lambda word: ("negative",1) if word in nwords else ("negative",0)).reduceByKey(lambda a,b:a+b)

    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    # make the plot on this rdd -combined_rdd

    combined_rdd=pword_rdd.union(nword_rdd)
    running_counts=combined_rdd.updateStateByKey(updateFunction)
    
    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]

    counts = []
    combined_rdd.foreachRDD(lambda t,rdd: counts.append(rdd.collect()))
    
    # print "printing dstream"
    running_counts.pprint()
		
    # Start the computation
    ssc.start()
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)

    return counts
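
# updateFunction is defined elsewhere in the original project; a minimal sketch,
# matching the updateValues helper shown in a later example (running integer totals):
def updateFunction(new_values, running_count):
    # add this batch's counts for the key to the running total (None on the first batch)
    if running_count is None:
        running_count = 0
    return sum(new_values, running_count)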
Example #6
0
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "localhost:9092"
    topics = ['test']

    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # note: the second element of the tuple is the received Kafka message

    words = lines1.flatMap(lambda line: line.split(" "))

    pairs = words.map(lambda word: (word, 1))

    wordcounts = pairs.reduceByKey(lambda x, y: x + y)

    print(wordcounts)  # note: this prints the DStream object itself, not its contents (see pprint below)

    kafkaStreams_lines.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)

    wordcounts.pprint()
    # tally the distribution of the generated random numbers
    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
Example #7
0
def bro_parse(zk,topic,db,db_table,num_of_workers):
    
    app_name = "ONI-INGEST-{0}".format(topic)
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc,1)
    sqc = HiveContext(sc)

    # create DStream for each topic partition.
    topic_dstreams = [ KafkaUtils.createStream(ssc, zk, app_name, {topic: 1}, keyDecoder=oni_decoder, valueDecoder=oni_decoder) for _ in range (wrks)  ] 
    tp_stream = ssc.union(*topic_dstreams)

    # Parallelism in Data Processing
    #processingDStream = tp_stream(wrks)

    # parse the RDD content.
    proxy_logs = tp_stream.map(lambda x: proxy_parser(x[1]))

    # save RDD into hive .
    proxy_logs.foreachRDD(lambda x: save_to_hive(x,sqc,db,db_table,topic))

    ssc.start()
    ssc.awaitTermination()
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii","ignore"))

    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    # YOUR CODE HERE
   
    words=tweets.flatMap(lambda x: x.split(" ")).filter(lambda x: x in pwords or x in nwords) 
    wordPairs=words.map(lambda x: ("positive",1) if x in pwords else ("negative",1))

    wordCount=wordPairs.reduceByKey(lambda x, y: x + y)
    
    runningCounts = wordPairs.updateStateByKey(updateFunction)

    runningCounts.pprint()
    

    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    wordCount.foreachRDD(lambda t,rdd: counts.append(rdd.collect()))
    
    ssc.start()                         # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)

    return counts
Example #9
0
File: tests.py  Project: anitatailor/spark
    def test_kafka_direct_stream_transform_get_offsetRanges(self):
        """Test the Python direct Kafka stream transform get offsetRanges."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2, "c": 3}
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
                       "auto.offset.reset": "smallest"}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)

        offsetRanges = []

        def transformWithOffsetRanges(rdd):
            for o in rdd.offsetRanges():
                offsetRanges.append(o)
            return rdd

        # Test whether it is ok mixing KafkaTransformedDStream and TransformedDStream together,
        # only the TransformedDstreams can be folded together.
        stream.transform(transformWithOffsetRanges).map(lambda kv: kv[1]).count().pprint()
        self.ssc.start()
        self.wait_for(offsetRanges, 1)

        self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))])
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii","ignore"))
    #print "HELOKOJOJEORUBEORUBOUBEROUBNOUONEROJOEJRNOJENROJENFOJEFOEJFNOEFUNOEUFN"
    #tweets.pprint()
    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    # YOUR CODE HERE
    words = tweets.flatMap(lambda line: line.split(" "))
    pairs = words.map(classifier).map(lambda word: (word, 1)).filter(lambda x: x[0] != 'none').reduceByKey(lambda a,b: a+b)
    runningCounts = pairs.updateStateByKey(updateFunction)
    runningCounts.pprint()
    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    pairs.foreachRDD(lambda t,rdd: counts.append(rdd.collect()))
    
    ssc.start()                         # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)
    #print counts
    return counts
Example #11
0
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii","ignore"))

    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    # YOUR CODE HERE
    words = tweets.flatMap(lambda line: line.split(' ')) \
            .map(lambda word: ('positive', 1) if word in pwords else ('negative', 1) if word in nwords else ('none', 1)) \
            .filter(lambda x: x[0]=='positive' or x[0]=='negative') \
            .reduceByKey(lambda x, y: x + y)
    # Print the first ten elements of each RDD generated in this DStream to the console
    def updateValues(values, count):
        if count is None:
            count = 0
        return sum(values, count)

    updatedWords = words.updateStateByKey(updateValues)
    updatedWords.pprint()
    
    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    # YOURDSTREAMOBJECT.foreachRDD(lambda t,rdd: counts.append(rdd.collect()))
    words.foreachRDD(lambda t,rdd: counts.append(rdd.collect()))

    ssc.start()                         # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)

    return counts
Example #12
0
def main():
    if len(sys.argv) != 4:
        print("Usage: kafka_wordcount.py <zk> <topic> <timeout>",
              file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)
    timeout = None
    if len(sys.argv) == 4:
        zk, topic, timeout = sys.argv[1:]
        timeout = int(timeout)
    else:
        zk, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(
        ssc, zk, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a+b)
    counts.pprint()
    kwargs = {}
    if timeout:
        kwargs['timeout'] = timeout
    ssc.start()
    ssc.awaitTermination(**kwargs)
Example #13
0
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list":'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii","ignore"))

    
    # Each element of tweets will be the text of a tweet.
    # Need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    pnTweets = tweets.flatMap(lambda line: line.split(" "))
    pnTweetsPairs = pnTweets.map(lambda x: determine(x,pwords,nwords))
    wordCounts = pnTweetsPairs.reduceByKey(lambda x, y: x + y)
    
    totalCounts = pnTweetsPairs.updateStateByKey(updateFunction)
    totalCounts.pprint()
    # Let the counts variable hold the word counts for all time steps
    # Need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    wordCounts.foreachRDD(lambda t,rdd: counts.append(rdd.collect()))
    
    
    
    ssc.start()                         # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)
    # because counts also includes words that are neither positive nor negative
    
    newCounts = []
    for count in counts:
        newCount = [item for item in count if item[0] == "positive" or item[0] =="negative"]
        newCounts.append(newCount)
    
    return newCounts
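
# determine() is not shown in this snippet; a plausible sketch, implied by the
# later filter that keeps only the "positive"/"negative" keys:
def determine(word, pwords, nwords):
    if word in pwords:
        return ("positive", 1)
    if word in nwords:
        return ("negative", 1)
    return ("neither", 1)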
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii","ignore"))


    # Print the first ten elements of each RDD generated in this DStream to the console
    #tweets.pprint()
    words = tweets.flatMap(lambda line: line.split(" "))

    posNegPairs = words.map(lambda word: myMapping(word, pwords, nwords))
    filteredPairs = posNegPairs.filter(lambda x: x[0] != "na")
    posNegCounts = filteredPairs.reduceByKey(lambda x, y: x + y)


    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).

    cumulativeCounts = posNegCounts.updateStateByKey(myRunningUpdate)
    cumulativeCounts.pprint()    
    
    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    posNegCounts.foreachRDD(lambda t,rdd: counts.append(rdd.collect()))
    
    ssc.start()                         # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)
    return counts
Example #15
0
def start_spark(timeout=None, max_items_per_rdd_sent=None):
    sc = SparkContext("local[4]", "twitter.trending")
    ssc = StreamingContext(sc, 5)

    ssc.checkpoint('hdfs://localhost:9000/user/spark/checkpoint/')

    kafka_params = {
        'zookeeper.connect': config.get('zookeeper', 'host'),
        'group.id': config.get('kafka', 'group_id'),
        'metadata.broker.list': config.get('kafka', 'hosts')
    }

    ksc = KafkaUtils.createDirectStream(ssc,
                                        [config.get('kafka', 'topic')],
                                        kafka_params)

    hashtag_counts = get_word_counts(ksc)
    filtered_tweet_count = filter_tweets(hashtag_counts)
    send_dstream_data(filtered_tweet_count, max_items_per_rdd_sent)
    ssc.start()
    if timeout:
        ssc.awaitTermination(timeout)
        ssc.stop(stopSparkContext=True, stopGraceFully=True)
    else:
        ssc.awaitTermination()
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii","ignore"))
    words = tweets.flatMap(lambda line: line.split(" "))
    tweets = words.filter(lambda x: x in pwords or x in nwords)
    tweets = tweets.map(lambda x: ("positive",1) if x in pwords else ("negative",1))
    tweets = tweets.reduceByKey(lambda x,y: x+y)
    tweets = tweets.updateStateByKey(updateFunction)
    tweets.pprint()

    pds = words.filter(lambda x: x in pwords)
    nds = words.filter(lambda x: x in nwords)

    plist=[]
    nlist=[]

    pds.foreachRDD(lambda t,rdd: plist.append(rdd.count()))    
    nds.foreachRDD(lambda t,rdd: nlist.append(rdd.count()))

    counts = []
  
    ssc.start()                         # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)

    for i in range(0,len(plist)):
        counts.append((plist[i],nlist[i]))

    return counts
Example #17
0
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "192.192.0.27:9092"
    topics = ['topic7']

    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # note: the second element of the tuple is the received Kafka message

    words = lines1.flatMap(lambda line: line.split(" "))

    pairs = words.map(lambda word: (word, 1))

    wordcounts = pairs.reduceByKey(lambda x, y: x + y)

    wordcounts.saveAsTextFiles("/var/lib/hadoop-hdfs/spark-libin/kafka")

    wordcounts.pprint()
    # tally the distribution of the generated random numbers
    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
Example #18
0
def main():
    sc = SparkContext(appName="IntrusionDetector")
    ssc = StreamingContext(sc, batch_durations)

    kvs = KafkaUtils.createDirectStream(ssc, [input_topic], {"metadata.broker.list": broker})
    kvs.foreachRDD(processRDD)
    ssc.start()
    ssc.awaitTermination()
    def readSource(ssc, di_in_conf_with_ds_conf, app_conf):
        sourceType = di_in_conf_with_ds_conf['source.type']

        if sourceType == 'kafka':
            kafkaSimpleConsumerApiUsed = app_conf.get('kafka.simple.consumer.api.used', True)
            if kafkaSimpleConsumerApiUsed:
                topics = di_in_conf_with_ds_conf['topics']
                if not isinstance(topics, list):
                    raise TypeError("topic should be list")

                brokers = di_in_conf_with_ds_conf['metadata.broker.list']
                kafkaParams = {"metadata.broker.list": brokers}
                stream = KafkaUtils.createDirectStream(ssc, topics, kafkaParams).map(lambda x: x[1])
            else:
                zkConnect = di_in_conf_with_ds_conf['zookeeper.connect']
                groupId = app_conf['group.id']
                numReceivers = app_conf.get('num.receivers', 1)
                numConsumerFetchers = app_conf.get('num.consumer.fetchers')
                topics = di_in_conf_with_ds_conf['topics']
                topic_map = {t: numConsumerFetchers for t in topics}
                # streams = reduce(lambda x, y: x.union(y),
                #                  map(KafkaUtils.createStream(ssc, zkConnect, groupId, topic_map),
                #                      range(0, numReceivers)))
                streams = [KafkaUtils.createStream(ssc, zkConnect, groupId, topic_map) for i in range(0, numReceivers)]
                stream = ssc.union(streams).map(lambda x: x[1])
        elif sourceType == 'hdfs':
            path = di_in_conf_with_ds_conf['fs.defaultFS'] + '/' + di_in_conf_with_ds_conf['path']
            stream = ssc.textFileStream(path)
        else:
            raise Exception('Error: unsupported source.type = ' + sourceType)

        num_repartition = app_conf.get('dataInterface.stream.repatition.partitions')
        if num_repartition is None or not isinstance(num_repartition, int):
            stream2 = stream
        else:
            stream2 = stream.repartition(num_repartition)

        # optionally format the stream with a formatter plugin class
        format_class_path = di_in_conf_with_ds_conf.get('format.class', '')
        if format_class_path.strip() == '':
            stream3 = stream2
        else:
            format_class_obj = get_class_obj(format_class_path)
            stream3 = format_class_obj.format(stream2)

        return stream3
Example #20
0
def kafka_spark_streaming_sql_main(app_name, brokers, topic, interval_seconds, sql_function):
    sc = SparkContext(appName=app_name)
    sqlContext = SQLContext(sc)
    # ssc = StreamingContext(sc, interval_seconds)
    ssc = StreamingContext(sc, 10)
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    kvs.foreachRDD(sql_function)
    ssc.start()
    ssc.awaitTermination()
def stream(ssc, pwords, nwords, duration):
    kstream = KafkaUtils.createDirectStream(
        ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'})
    tweets = kstream.map(lambda x: x[1].encode("ascii","ignore"))

    # Each element of tweets will be the text of a tweet.
    # You need to find the count of all the positive and negative words in these tweets.
    # Keep track of a running total counts and print this at every time step (use the pprint function).
    

    #tweets.pprint()
    words = tweets.flatMap(lambda tweet:tweet.split(" "))
    #words.pprint()

    positive = words.filter(lambda x: (x in pwords))
    negative = words.filter(lambda x: (x in nwords))

    #positive.pprint()
    #negative.pprint()

    ppairs = positive.map(lambda p: ('positive', 1))
    npairs = negative.map(lambda n: ('negative', 1))

    pwordCounts = ppairs.reduceByKey(lambda x, y: x + y)
    nwordCounts = npairs.reduceByKey(lambda x, y: x + y)

    count = pwordCounts.union(nwordCounts)
    #count.pprint()
    #pwordCounts.pprint()
    #nwordCounts.pprint()

    def updateFunction(newValues, runningCount):
        if runningCount is None:
           runningCount = 0
        return sum(newValues, runningCount)

    prunningCounts = pwordCounts.updateStateByKey(updateFunction)
    nrunningCounts = nwordCounts.updateStateByKey(updateFunction)

    #prunningCounts.pprint()
    #nrunningCounts.pprint()

    total = prunningCounts.union(nrunningCounts)
    total.pprint()

    # Let the counts variable hold the word counts for all time steps
    # You will need to use the foreachRDD function.
    # For our implementation, counts looked like:
    #   [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...]
    counts = []
    count.foreachRDD(lambda t,rdd: counts.append(rdd.collect()))

    ssc.start()                         # Start the computation
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop(stopGraceFully=True)

    return counts
    def fetch_pre_hourly_data(spark_context,
                              offset_range_list):
        """get metrics pre hourly data from offset range list."""

        # get kafka stream over the same offsets
        pre_hourly_rdd = KafkaUtils.createRDD(spark_context,
                                              {"metadata.broker.list":
                                                  cfg.CONF.messaging.brokers},
                                              offset_range_list)
        return pre_hourly_rdd
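
# offset_range_list is a list of OffsetRange objects; a minimal sketch of building
# one, mirroring the KafkaUtils.createRDD test examples elsewhere on this page
# (topic name, partition, and offsets below are illustrative):
from pyspark.streaming.kafka import OffsetRange

# read partition 0 of the topic from offset 0 up to (but not including) offset 1000
offset_range_list = [OffsetRange("metrics_pre_hourly", 0, 0, 1000)]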
Example #23
0
def functionToCreateContext():
    sc = SparkContext(appName="StreamingExampleWithKafka")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")
    opts = {"metadata.broker.list": "node1.example.com:6667,node2.example.com:6667"}
    kvs = KafkaUtils.createDirectStream(ssc, ["mytopic"], opts)
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).updateStateByKey(updateFunction)
    counts.pprint()
    return ssc
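
# A factory like functionToCreateContext is normally passed to
# StreamingContext.getOrCreate, which restores the context from the "checkpoint"
# directory after a restart instead of building a fresh one:
ssc = StreamingContext.getOrCreate("checkpoint", functionToCreateContext)
ssc.start()
ssc.awaitTermination()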
def read_tweets():

    sc = SparkContext(appName="sentimentProducer")
    ssc = StreamingContext(sc, 600)  # test batch interval, in seconds
    brokers = "localhost:9092"
    kvs = KafkaUtils.createDirectStream(ssc, ["test"], {"metadata.broker.list": brokers})
    kvs.foreachRDD(create_format)
    producer.flush()
    ssc.start()
    ssc.awaitTermination()
    def test_kafka_direct_stream(self):
        """Test the Python direct Kafka stream API."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2, "c": 3}
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(), "auto.offset.reset": "smallest"}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)
        self._validateStreamResult(sendData, stream)
Example #26
0
File: tests.py  Project: anitatailor/spark
    def test_kafka_rdd(self):
        """Test the Python direct Kafka RDD API."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2}
        offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)
        rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges)
        self._validateRddResult(sendData, rdd)
def main():
    conf = SparkConf().setAppName("kafka_source_mongo_sink_pymongo_filtered")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    try:
        kafka_streams = KafkaUtils.createStream(ssc, "localhost:2181", "spark-streaming-consumer", {"splash_json": 2})
        kafka_streams.foreachRDD(process_rdd)
    except Exception as e:
        print(e)
    ssc.start()
    ssc.awaitTermination()
Example #28
0
File: tests.py  Project: anitatailor/spark
    def test_kafka_rdd_get_offsetRanges(self):
        """Test Python direct Kafka RDD get OffsetRanges."""
        topic = self._randomTopic()
        sendData = {"a": 3, "b": 4, "c": 5}
        offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)
        rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges)
        self.assertEqual(offsetRanges, rdd.offsetRanges())
Example #29
0
File: tests.py  Project: anitatailor/spark
    def test_kafka_direct_stream_from_offset(self):
        """Test the Python direct Kafka stream API with start offset specified."""
        topic = self._randomTopic()
        sendData = {"a": 1, "b": 2, "c": 3}
        fromOffsets = {TopicAndPartition(topic, 0): long(0)}
        kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams, fromOffsets)
        self._validateStreamResult(sendData, stream)
Example #30
0
File: tests.py  Project: anitatailor/spark
    def test_kafka_stream(self):
        """Test the Python Kafka stream API."""
        topic = self._randomTopic()
        sendData = {"a": 3, "b": 5, "c": 10}

        self._kafkaTestUtils.createTopic(topic)
        self._kafkaTestUtils.sendMessages(topic, sendData)

        stream = KafkaUtils.createStream(self.ssc, self._kafkaTestUtils.zkAddress(),
                                         "test-streaming-consumer", {topic: 1},
                                         {"auto.offset.reset": "smallest"})
        self._validateStreamResult(sendData, stream)
        print(type(value_key))
        print(type(value_all))


print("before executing")
schema_registry_client = CachedSchemaRegistryClient(
    url='http://ashaplq00003:8081')
print("after executing")
print(schema_registry_client)
serializer = MessageSerializer(schema_registry_client)
print("after executing")
print(serializer)

spark = SparkSession.builder \
  .appName('SparkCassandraApp') \
  .config('spark.cassandra.connection.host', 'localhost') \
  .config('spark.cassandra.connection.port', '9042') \
  .config('spark.cassandra.output.consistency.level','ONE') \
  .master('local[2]') \
  .getOrCreate()
sc = spark.sparkContext
ssc = StreamingContext(sc, 5)
kvs = KafkaUtils.createDirectStream(
    ssc, ['NBC_APPS.TBL_MS_ADVERTISER'],
    {"metadata.broker.list": 'ashaplq00003:9192'},
    valueDecoder=serializer.decode_message)
kvs.pprint()
kvs.foreachRDD(handler)
ssc.start()
ssc.awaitTermination()
Example #32
0
import json
import csv
from json import loads
from flatten_json import flatten
from time import sleep
import pandas as pd

print("PROGRAM START!!!")
print("PROGRAM START!!!")
print("PROGRAM START!!!")
print("PROGRAM START!!!")

sc = SparkContext()
ssc = StreamingContext(sc, 10)
sqlc = SQLContext(sc)
directKafkaStream = KafkaUtils.createDirectStream(
    ssc, ["kafka_spark"], {"metadata.broker.list": "localhost:9094"})
lines = directKafkaStream.map(lambda x: x[1])

print("LINES START!!!")
print("LINES START!!!")
print("LINES START!!!")
print("LINES START!!!")


def transformer(rdd):
    my_obj = json.loads(rdd)
    return (my_obj["group"]["id"], my_obj["group"]["name"],
            my_obj["group"]["latest"]["text"], my_obj["group"]["members"],
            my_obj["group"]["unread_count"])

def setup_kafka_stream():
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})

    return kvs
Example #34
0
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: kafka_wordcount.py <zk> <EventsTopic> ", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="biddingStream")
    ssc = StreamingContext(sc, 10)
    sqlContext = SQLContext(sc)
    bidprice=sqlContext.read\
                       .format("org.apache.spark.sql.cassandra")\
                       .options(keyspace="ad_flow", table="bidprice")\
                       .load().rdd
    tmp = {}
    for item in bidprice.collect():
        tmp[item['pid']] = item['price']
    bidPriceBC = sc.broadcast(tmp)
    #    print(tmp)
    zkQuorum, topic1 = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "sparkStreamingGetNewEvents",
                                  {topic1: 1})
    lines = kvs.map(lambda x: json.loads(x[1]))
    #    lines.pprint()
    #    uidVec=lines.map(lambda x: ((x['uid'], x['tick']), np.asarray([float(i) for i in x['topic']])))\
    #    uidVec=lines.map(lambda x: ((x['uid'], x['tick']), np.asarray([float(i) for i in x['topic']])))\
    #    			.reduceByKeyAndWindow(lambda x, y: x + y, lambda x, y: x - y, 60, 10)
    #    ssc.checkpoint('hdfs://ec2-52-2-60-169.compute-1.amazonaws.com:9000/checkpoint/')
    lines.foreachRDD(process)
    ssc.start()
    ssc.awaitTermination()
Example #35
0
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition
from pyspark.sql import HiveContext, Row
import json

sc = SparkContext("local[*]", "mysqltohive")
ssc = StreamingContext(sc,1)
hsql = HiveContext(sc)

topic = "test"
brokers = "datanode1:9092,datanode3:9092,datanode6:9092"
partition = 0
start = 8390
topicpartition = TopicAndPartition(topic, partition)
fromoffset = {topicpartition: long(start)}
dkafka = KafkaUtils.createDirectStream(ssc,[topic], \
         {"metadata.broker.list": brokers},fromOffsets = fromoffset)

offsetRanges = []

def storeOffsetRanges(rdd):
     global offsetRanges
     offsetRanges = rdd.offsetRanges()
     return rdd

def printOffsetRanges(rdd):
     for o in offsetRanges:
         print "%s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset)

dkafka.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)

Example #36
0
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SparkSession
from pyspark.ml.feature import MinMaxScaler

n_secs = 1
topic = "test2"

conf = SparkConf().setAppName("RealTimeDetector").setMaster("local[*]")
sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")
ssc = StreamingContext(sc, n_secs)

spSession = SparkSession.builder.master("local").appName("MineCap").config("spark.some.config.option", "some-value").getOrCreate()

kafkaStream = KafkaUtils.createDirectStream(ssc, [topic], {
    'bootstrap.servers':'localhost:9092',
    'group.id':'flow-streaming',
    'fetch.message.max.bytes':'15728640',
    'auto.offset.reset':'largest'})
# Group ID is completely arbitrary

lines = kafkaStream.map(lambda x: x[1])
flows = lines.flatMap(lambda line: line.split(" "))#.map(lambda word: (word[1:-1].split(",")))

##### data preprocessing

#fluxoRDD = sc.textFile("/home/administrador/MineCap/process-layer/dataset_fluxo_bc.csv")
fluxoRDD = sc.textFile("/home/helio/MineCap/process-layer/dataset_novo.csv")

# Remove the first line of the file (the header)
firstLine = fluxoRDD.first()
fluxoRDD2 = fluxoRDD.filter(lambda x: x != firstLine)
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: consumer.py <zk> <topic>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonStreamingKafka")
    ssc = StreamingContext(sc, 1)

    zkQuorum, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer",
                                  {topic: 1})
    lines = kvs.map(lambda x: x[1])
    lines.pprint()

    ssc.start()
    ssc.awaitTermination()
    conf.set("spark.streaming.backpressure.enabled", "true")
    conf.set("spark.streaming.backpressure.initialRate", "100")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "100")
    sc = SparkContext(appName="Most Popular Airports", batchSize=50, conf=conf)
    sc.setLogLevel("WARN")

    ssc = StreamingContext(sc, 60)

    broker = sys.argv[1]
    source_topic = sys.argv[2]
    # destionation_topic = sys.argv[3]

    print("The broker is " + broker)
    print("The source topic is " + source_topic)

    df = KafkaUtils.createDirectStream(ssc, [source_topic],
                                       {"metadata.broker.list": broker},
                                       valueDecoder=decoder)

    df \
        .map(get_original_airport_and_destination_airport) \
        .filter(lambda line: len(line) > 3) \
        .filter(lambda line: Helpers.is_airport(line)) \
        .flatMap(lambda line: line.split(",")) \
        .countByValue() \
        .transform(lambda airports: airports.sortBy(lambda t: t[1], ascending=False)) \
        .foreachRDD(handler)

    ssc.start()
    ssc.awaitTermination()
import sys
import json
import re

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import functions as F
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

#reload(sys)
#sys.setdefaultencoding('utf-8')

if __name__ == "__main__":
    sc = SparkContext(appName="SparkStreaming")
    sqlContext = SQLContext(sc)
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 5)  # 5 seconds window

    broker, topic = sys.argv[1:]
    kvs = KafkaUtils.createDirectStream(ssc, [topic],
                                        {"metadata.broker.list": broker})

    data = kvs.map(lambda x: x[1])

    def readRdd4rmKafkaStream(readRDD):
        if not readRDD.isEmpty():
            # Put RDD into a dataframe
            df = sqlContext.read.json(readRDD)
            df.show()
            #df.registerTempTable("Businesses_data")

    data.foreachRDD(lambda rdd: readRdd4rmKafkaStream(rdd))
    print("\n\n\n\n\n\n\nHEY, CAN YOU SEE ME\n\n\n\n\n\n\n")
    ssc.start()
    ssc.awaitTermination()
Example #40
0
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('topic_name', help='the name of the topic')
    parser.add_argument('kafka_broker', help='the location of the kafka')
    parser.add_argument('target_topic', help = 'the new topic to write to')

    args = parser.parse_args()
    topic_name = args.topic_name
    kafka_broker = args.kafka_broker
    target_topic = args.target_topic

    sc = SparkContext('local[2]', 'stock-price-analysis')
    sc.setLogLevel('WARN')
    ssc = StreamingContext(sc, 5) # batch second

    # direct stream
    directKafkaStream = KafkaUtils.createDirectStream(ssc, [topic_name], {'metadata.broker.list':kafka_broker})
    process_stream(directKafkaStream)



    # create kafka producer
    kafka_producer = KafkaProducer(
        bootstrap_servers = kafka_broker
    )

    atexit.register(shutdown_hook, kafka_producer)

    ssc.start()
    ssc.awaitTermination()
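
# shutdown_hook is not shown in this snippet; a plausible sketch that flushes and
# closes the producer when the driver exits (matching the atexit.register call above):
def shutdown_hook(producer):
    try:
        producer.flush(10)   # wait up to 10 seconds for buffered messages
    finally:
        producer.close(10)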
### This streaming job takes data from Apache Kafka, performs a basic analysis, and then saves the result to a Cassandra database.

def getSqlContextInstance(sparkContext):
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']


# Create a StreamingContext with batch interval of 3 second
conf=SparkConf().setAppName("asd").set("spark.cassandra.connection.host", "127.0.0.1")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 3)

topic = "topic_cassandraam"
kafkaStream = KafkaUtils.createStream(ssc, "localhost:2181", "topic", {topic: 4})

def process(time, rdd):
    
    try:
        
        # Get the singleton instance of SparkSession
        sqlContext = getSqlContextInstance(rdd.context)
        
        # Convert RDD[String] to RDD[Row] to DataFrame
        #d.map(lambda w: Row(word=w))
        
        df = sqlContext.createDataFrame(rdd)
        #df.registerTempTable("words")
        
        #wordCountsDataFrame = sqlContext.sql("select * from words")
Example #42
0
            producer.send_messages('top_carriers_by_airports',
                                   message.encode())


# MAIN

sc = SparkContext(appName="TopCarriersByAirports")
sc.setLogLevel('ERROR')

# Create a local StreamingContext
ssc = StreamingContext(sc, 1)
ssc.checkpoint(
    "s3a://cloudcapstone-checkpoints/checkpoints/checkpoint-top-carriers-by-airports/"
)
lines = KafkaUtils.createDirectStream(ssc, ['input'], {
    "metadata.broker.list": sys.argv[1],
    "auto.offset.reset": "smallest"
})

# Split each line by separator
lines = lines.map(lambda tup: tup[1])
rows = lines.map(lambda line: line.split())

# Get the airports
rows = rows.filter(lambda row: len(row) > 7)
airports_and_carriers = rows.map(lambda row: ((row[0], row[3]), float(row[7])))

# Count averages
airports_and_carriers = airports_and_carriers.updateStateByKey(updateFunction)
# Change key to just airports
airports = airports_and_carriers.map(lambda row: (row[0][0],
                                                  (row[0][1], row[1][2])))
    df = spark.createDataFrame(rowRdd)
    df.write\
     .format("org.apache.spark.sql.cassandra")\
     .mode('overwrite')\
     .options(table="airport_carrier_departure", keyspace="aviation")\
     .save()


# MAIN

sc = SparkContext(appName="TopCarriersByAirportsToCassandra")
sc.setLogLevel('ERROR')

# Create a local StreamingContext
ssc = StreamingContext(sc, 1)
lines = KafkaUtils.createDirectStream(ssc, ['top_carriers_by_airports'], {
    "metadata.broker.list": sys.argv[1],
    "auto.offset.reset": "smallest"
})

# Transform
lines = lines.map(lambda message: message[1])
lines = lines.map(lambda line: line.split())
lines = lines.map(lambda tuple: (tuple[0], tuple[1], float(tuple[2])))
# Save to Cassandra
lines.foreachRDD(printResults)
lines.foreachRDD(saveToCassandra)

ssc.start()  # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: main.py <broker_list> <topic>", file=sys.stderr)
        exit(-1)

    # read command line arguments
    brokers, topic = sys.argv[1:]

    sparkSession = (
        SparkSession.builder.appName('Santiago Meetup').getOrCreate())

    ssc = StreamingContext(sparkSession.sparkContext, BATCH_INTERVAL)

    kvs = KafkaUtils.createDirectStream(
        ssc, [topic], {
            "metadata.broker.list": brokers,
            "auto.offset.reset": STREAM_KAFKA_OFFSET
        })

    result = kvs.map(lambda record: map_tweet(record)).filter(
        lambda record: record[0] is not None).cache()

    #Send twitter reply
    result.foreachRDD(lambda rdd: rdd.foreachPartition(reply_to_tweet))

    #Save meetup_tags data to hdfs
    result.flatMap(lambda record: map_scores(record)).foreachRDD(
        lambda rdd: save_meetup_tags_to_hbase(rdd))

    #Save meetup data to hdfs
    result.map(lambda r: (r[0], r[1], r[2], r[3], r[4], r[5])).foreachRDD(
        print(e)
        print("Database insertion unsuccessful!!")
    finally:
        conn.close()


from pyspark.streaming import StreamingContext
conf = SparkConf().setMaster("local[2]").setAppName("Streamer")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
val = sc.parallelize("abd")

ssc = StreamingContext(sc, 10)
ssc.checkpoint("checkpoint")
kstream = KafkaUtils.createDirectStream(
    ssc,
    topics=['twitterstream'],
    kafkaParams={"metadata.broker.list": 'localhost:9092'})
tweets = kstream.map(lambda x: json.loads(x[1]))

with open('IRModel1', 'rb') as f:
    loadedModel = pickle.load(f)

bc_model = sc.broadcast(loadedModel)


def process_data(data):

    print("Processing data ...")

    if (not data.isEmpty()):
        nbModel = bc_model.value
Example #46
0
from datetime import datetime
import json

from pyspark import SparkContext, SparkConf,SQLContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SparkSession
from pyspark.sql import HiveContext



sc = SparkContext()
ssc = StreamingContext(sc, 10) # 10 Second window to get the stream from the source
sqlc= SQLContext(sc) 
  
kvs = KafkaUtils.createDirectStream(ssc, ['bigdata'], {'metadata.broker.list': 'localhost:9096'})#'sandbox-hdp.hortonworks.com:6667'  
lines = kvs.map(lambda x: x[1])


def transformer(rdd):
    my_obj=json.loads(rdd)
    now = datetime.now()
    dt_string = now.strftime("%m/%d/%Y %H:%M:%S")# mm/dd/YY H:M:S
    #return str(dt_string),str(my_obj['country_name']),str(my_obj['cases']),str(my_obj['region']),str(my_obj['new_cases']),str(my_obj['serious_critical'])
    return dt_string,my_obj['country_name'],my_obj['cases'],my_obj['region'],my_obj['new_cases'],my_obj['serious_critical']
transform=lines.map(transformer)


def build_df(rdd):
    if not rdd.isEmpty():
         global sqlc
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
if __name__ == "__main__":
    sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount")
    ssc = StreamingContext(sc, 2)
    brokers = sys.argv[1]  # Kafka broker list, e.g. "host1:9092,host2:9092"
    kvs = KafkaUtils.createDirectStream(ssc, ["kafkaTwitterSpark"],
                                        {"metadata.broker.list": brokers})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a+b)
    counts.pprint()
    ssc.start()
    ssc.awaitTermination()
Example #48
0
                      value=json.dumps(data).encode('utf-8'))
    producer.flush()
    producer.close()


# Initialize our list of topics and their associated keywords
topic_list = {"sport": ['doping', 'olympic', 'injury','medal','record','title','world','athletic'], \
   "tech" : ['technology','author','article','computer','hi-tech','software','network','security','phone']}

# Create our stream.
sc = SparkContext(appName='PythonStreamingRecieverKafka')
ssc = StreamingContext(sc, 2)  # 2 second window
zookeeper_broker = "localhost:2181"
topic = "queue1"
kvs = KafkaUtils.createStream(ssc, \
       zookeeper_broker, \
       'streaming-consumer',\
       {topic:1})

# Split our stream and filter it before sending to queue2 and queue3
queue2 = kvs.filter(lambda x: queue2_filter(x))
#queue2.pprint(num=10)
queue2.foreachRDD(
    lambda rdd: rdd.foreachPartition(lambda x: send_to_queue2(x)))
queue3 = kvs.filter(lambda x: queue3_filter(x))
#queue3.pprint(num=10)
queue3.foreachRDD(
    lambda rdd: rdd.foreachPartition(lambda x: send_to_queue3(x)))

ssc.start()
ssc.awaitTermination()
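
# queue2_filter / queue3_filter are not shown in this snippet; plausible sketches,
# assuming they match the message text against the keyword lists in topic_list:
def queue2_filter(msg):
    # keep messages whose text contains any "sport" keyword
    return any(k in msg[1].lower() for k in topic_list["sport"])

def queue3_filter(msg):
    # keep messages whose text contains any "tech" keyword
    return any(k in msg[1].lower() for k in topic_list["tech"])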
Example #49
0
    producer.flush()
    producer.close()


# Read data files with a custom schema - CDR data was obtained from the Open Big Data project by Dandelion.
# It is available at https://dandelion.eu/datamine/open-big-data/
DataSchema = StructType([StructField("square_id", FloatType(), True), \
                    StructField("time", StringType(), True), \
                    StructField("country", FloatType(), True), \
                    StructField("sms_in", FloatType(), True), \
                    StructField("sms_out", FloatType(), True), \
                    StructField("call_in", FloatType(), True), \
                    StructField("call_out", FloatType(), True), \
                    StructField("internet", FloatType(), True)])

kvs = KafkaUtils.createStream(ssc, 'localhost:2181', 'spark-streaming',
                              {'realtime': 1})
lines = kvs.map(lambda x: x[1])
rows = lines.map(lambda line: line.split("\n")[0])
rows = rows.map(lambda line: line.split("\t"))
Data = rows.transform(lambda rdd: PreProcessData(rdd))
Result = Data.transform(lambda rdd: ComputeDistances(rdd))
Data = Data.transform(lambda rdd: rdd.zipWithIndex().map(lambda e:
                                                         (e[1], [e[0]])))

Data.pprint()
Result.pprint()
Final = Result.join(Data)
Final.pprint()
Final.foreachRDD(lambda rdd: rdd.foreachPartition(SendResult))

ssc.start()
Example #50
0
        '/user/hadoop/POC/FraudAnalytics/data/transactions_history/Transaction_DB_jupyter.csv'
    transactionsDataDF = spark.read.csv(transactionsDataHDFSPath,
                                        header=True,
                                        inferSchema=True)
    print('Printing transactions history data')
    transactionsDataDF.show(1)
    transactionsDataDF.registerTempTable('tranHistoryTbl')

    # sqlContext.cacheTable("tranHistoryTbl")

    transRecCount = \
        transactionsDataDF.filter("Card_Number = '7470510000000000'"
                                  ).select('*').count()
    print('========= Printing transactionsDataDF count =========')
    print(transRecCount)

    ssc = StreamingContext(sc, 10)

    # zkQuorum = "34.201.235.134:2181"
    # topic = "fraud_analytics"

    (zkQuorum, topic) = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum, 'fraud_test1_consumer',
                                  {topic: 1})
    lines = kvs.map(lambda x: x[1])
    lines.pprint()
    lines.foreachRDD(lambda rdd: \
                     process(time.strftime('%Y-%m-%d %H:%M:%S'), transactionsDataDF, rdd))
    ssc.start()
    ssc.awaitTermination()
#    Spark
from pyspark import SparkContext
#    Spark Streaming
from pyspark.streaming import StreamingContext
#    Kafka
from pyspark.streaming.kafka import KafkaUtils
#    json parsing
import json

sc = SparkContext(appName="PythonSparkStreamingKafka_RM_01")
sc.setLogLevel("WARN")

ssc = StreamingContext(sc, 60)

kafkaStream = KafkaUtils.createStream(ssc, 'cdh57-01-node-01.moffatt.me:2181',
                                      'spark-streaming', {'twitter': 1})

parsed = kafkaStream.map(lambda v: json.loads(v[1]))

parsed.count().map(lambda x: 'Tweets in this batch: %s' % x).pprint()

authors_dstream = parsed.map(lambda tweet: tweet['user']['screen_name'])

author_counts = authors_dstream.countByValue()
author_counts.pprint()

author_counts_sorted_dstream = author_counts.transform(\
  (lambda foo:foo\
   .sortBy(lambda x:( -x[1]))))
author_counts_sorted_dstream.pprint()
        except:
            pass


if __name__ == "__main__":
    try:
        # Create a Spark context to connect to the Spark cluster
        sc = SparkContext(appName="PythonStreamingKafkaTweetCount")

        # Set the StreamingContext batch interval to 2 seconds
        ssc = StreamingContext(sc, 2)
        # sqlContext = sql.SQLContext(sc)
        # Create a Kafka stream to consume data coming from the Twitter topic
        # localhost:2181 = default Zookeeper consumer address
        kafkaStream = KafkaUtils.createStream(ssc, 'localhost:2181',
                                              'spark-streaming',
                                              {'twitter': 1})
        #Parse Twitter Data as json
        parsed = kafkaStream.map(lambda v: forEachBatch(v))
        #parsed = kafkaStream.map(lambda x: x[1])
        #kafkaStream.saveAsTextFiles('test.txt')
        # Count the number of tweets per user
        #lines = parsed.map(lambda x: x[1])
        # tweets = parsed.map(getFeature)
        # ##print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
        #print('asdljaslkdjaslkjdasjdlkajdlasjkd',len(tweets))
        parsed.pprint()
        #tweets.saveDataToFile("1")
        # vector = np.array(tweets)
        #rdd = tweets.foreachRDD(getRDD)
        #turnIntoVector(rdd)
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: kafka_wordcount.py <zk> <topic>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="biddingStream")
    ssc = StreamingContext(sc, 10)

    #   load product vector
    pm=sc.textFile('hdfs://ec2-52-2-60-169.compute-1.amazonaws.com:9000/data/pidModel.txt')\
         .map(lambda x: x.strip().split(' '))\
         .map(lambda x: (x[0], np.asarray([float(i) for i in x[1:]])))

    #    print(pm.take(100))
    bv = sc.broadcast(pm.collect())

    zkQuorum, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum,
                                  "sparkStreamingConsumerDetectEvents",
                                  {topic: 1})
    lines = kvs.map(lambda x: json.loads(x[1]))
    #    lines.pprint()
    uidVec = lines.map(lambda x: ((x['timestamp'], x[
        'uid']), np.asarray([float(i) for i in x['topicVec']])))
    #    uidVec=lines.map(lambda x: ((x['uid'], x['tick']), np.asarray([float(i) for i in x['topic']])))\
    #    			.reduceByKeyAndWindow(lambda x, y: x + y, lambda x, y: x - y, 60, 10)
    #    ssc.checkpoint('hdfs://ec2-52-2-60-169.compute-1.amazonaws.com:9000/checkpoint/')
    window60rdd = uidVec.foreachRDD(process)
    ssc.start()
    ssc.awaitTermination()
Example #54
0
    #     print("""
    #     Usage: SparkTest-sql-kafka-.py <bootstrap-servers> <subscribe-type> <topics>
    #     """, file=sys.stderr)
    # exit(-1)

    subscribeType = "subscribe"
    bootstrapServers = "10.0.10.10:2181"  # ZooKeeper quorum address and port for the receiver-based stream
    topic = {"test5": 1}  # partitions (consumer threads) must be listed per topic
    groupid = "sprk-consumer-group"

    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    # batch interval of 15 seconds
    ssc = StreamingContext(sc, 15)

    kafkaStream = KafkaUtils.createStream(ssc, bootstrapServers, groupid,
                                          topic)
    words = kafkaStream.map(lambda x: x[1])
    words.pprint()

    # .trigger(processingTime='5 seconds') \
    # Convert RDDs of the words DStream to DataFrame and run SQL query
    def process(time, rdd):
        print("========= %s =====执行时间====" % str(time))

        try:
            # Get the singleton instance of SparkSession
            spark = getSparkSessionInstance(rdd.context.getConf())

            schemaString = "c1 c2 c3"
            fields = [
                StructField(field_name, StringType(), True)
Example #55
0
    
    # Data to insert to the DB
    data = {'recordID': recordID,
            'c1': mac_address_enc,
            'c2': access_point_enc,
            'c3': internalNodesStr,
            'c4': stay_enc}
    insertSQL = "insert into t_range_m (recordID, c1, c2, c3, c4) values (%(recordID)s, %(c1)s, %(c2)s, %(c3)s, %(c4)s)"
    save_to_db(data, insertSQL)


# Set up the contexts
conf = SparkConf().setAppName("Smart Buildings")
sc = SparkContext(conf=conf)
stream = StreamingContext(sc, 1) # 1 second window

# Returns a DStream (Discretized Stream) object
kafka_stream = KafkaUtils.createStream(stream,  # StreamingContext object
               'localhost:2181',                # Zookeeper quorum
               'my-test-group',                 # The group id for this consumer
               {'smartBuildings':1})            # Dict of (topic_name -> numPartitions) to consume
                                                # Each partition is consumed in its own thread
val_tup = kafka_stream.map(lambda x: x[1])
itemsStream = val_tup.map(lambda s: make_item(s))
encStream = itemsStream.map(lambda e: encrypt_record(e))

encStream.pprint()

stream.start() # start the streaming application
stream.awaitTermination()
Example #56
0
    tokenizer = RegexpTokenizer(r'\w+')

    client = MongoClient()
    collection = client.streams.nba
    #collection.drop()
    print ("Before stream count is {}".format(collection.count()))
    client.close()

    #a new ssc needs to be started after a previous ssc is stopped
    ssc = StreamingContext(sc, PERIOD)

    #create stream receivers
    stream = KafkaUtils.createDirectStream(
              ssc,
              [TOPIC],
              {
                "metadata.broker.list": BROKERS,
              }
    )
    tweets = stream.map(lambda x: json.loads(x[1])).map(lambda x: json.loads(x))

    #filter commercials
    filtered_tweets = tweets.filter(lambda x: 'https' not in x['text'])
    # DataFrame operations inside your streaming program
    features = filtered_tweets.map(lambda x: {'id': x['id'], 'screen_name': x['user']['screen_name'], 'text': x['text'], 'followers': x['user']['followers_count'], 'created_at': x['created_at'], 'teams': find_teams(x['text'])})

    #tweets.pprint()
    features.pprint()

    #find trending topic
    filtered_tweets.foreachRDD(lambda x: find_trends(x['text']))
Example #57
0
from pyspark.streaming.kafka import KafkaUtils
from pyspark.streaming import StreamingContext
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[*]").setAppName("Streaming")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 3)

directKafkaStream = KafkaUtils.createDirectStream(
    ssc, ['test'], {"bootstrap.servers": 'localhost:9092'})

offsetRanges = []


def storeOffsetRanges(rdd):
    global offsetRanges
    offsetRanges = rdd.offsetRanges()
    return rdd


def printOffsetRanges(rdd):
    for o in offsetRanges:
        print(o.topic, o.partition, o.fromOffset, o.untilOffset)
        # TODO: could be replaced with fancier numeric metrics such as counts, averages, etc.


def storeRdd(rdd):
    global kafkaData
    # kafkaData = rdd.KafkaMessageAndMetadata()
    return rdd
Example #58
0
        em(mail["mail_host"], mail["mail_user"],
           mail["mail_pass"]).send('*****@*****.**', rddstr)


def finds(word, keyword):
    # check whether the string contains any of the keywords
    word = word.lower()
    for i in keyword:
        if word.count(i) > 0:
            return True
    return False


if __name__ == '__main__':
    sc = SparkContext("local[2]")
    # batch interval of 5 seconds
    ssc = StreamingContext(sc, 5)
    # create the Kafka input stream (ZooKeeper host, consumer group, topic map)
    lines = KafkaUtils.createStream(ssc, kfk['host'], kfk['group'], kfk['top'])
    lines1 = lines.map(lambda x: x[1]).map(
        lambda s: "".join(i for i in s if 31 < ord(i) < 127)).filter(
            lambda word: finds(word, keyword))  # note: the second element of the tuple is the received Kafka message

    # store the data
    lines1.foreachRDD(save1)

    ssc.start()
    ssc.awaitTerminationOrTimeout(90)
    ssc.stop()
Example #59
0
        put_total_hits(event_id, total_hits)
        corrupted_count = 0
        for hit in hits:
            hit_id = hit['hit_id']
            corrupted_columns = verify_hit(hit)
            if len(corrupted_columns) != 0:
                corrupted_count += 1
                put_corrupted_columns(event_id, hit_id, corrupted_columns)
            put_hit(event_id, hit_id, hit)
        corrupted_count += get_corrupted_count(event_id)
        put_corrupted_count(event_id, corrupted_count)
        ac = get_collected(event_id)
        new_ac = ac + len(event['data'])
        put_collected(event_id, new_ac)
        if new_ac == event['total_hits'] and get_corrupted_count(event_id) == 0 :
            send_completion_msg_to_kafka_topic(event_id)


sparkContext = SparkContext.getOrCreate()
streamingContext = StreamingContext(sparkContext, 5)
sqlContext = SQLContext(sparkContext)


kafkaStream = KafkaUtils.createStream(streamingContext, 'sandbox.hortonworks.com:2181', 'defaultGroup', {'events': 1})
kafkaStream \
    .map(lambda event: json.loads(event[1].encode('utf-8'))) \
    .foreachRDD(put_in_hbase)


streamingContext.start()
Example #60
0
    parser.add_argument('new_topic', help='new topic to send data to')

    # - get arguments
    args = parser.parse_args()
    kafka_brokers = args.kafka_brokers
    topic = args.topic
    new_topic = args.new_topic

    # - setup spark streaming utility
    conf = SparkConf() \
        .setMaster("local[2]") \
        .setAppName("StockAveragePrice")
    sc = SparkContext(conf=conf)
    sc.addFile('spark/stream-process.py')
    sc.setLogLevel('ERROR')
    ssc = StreamingContext(sc, 5)

    # - instantiate a kafka stream for processing
    kafka_stream = KafkaUtils.createDirectStream(ssc, [topic], {'metadata.broker.list': kafka_brokers})
    kafka_stream.foreachRDD(process)

    # - instantiate a simple kafka producer
    kafka_producer = KafkaProducer(bootstrap_servers=kafka_brokers.split(','))

    # - setup proper shutdown hook
    atexit.register(shutdown_hook, kafka_producer)

    # - start streaming processing
    ssc.start()
    ssc.awaitTermination()