def main():
    # Create a StreamingContext against the cluster master with a batch interval of 5 seconds
    sc = SparkContext("spark://ip-172-31-29-29:7077", "MyKafkaStream")
    ssc = StreamingContext(sc, 5)

    kafkaStream = KafkaUtils.createStream(ssc, "52.3.61.194:2181", "GroupNameDoesntMatter",
                                          {"parking_sensor_data": 2})
    messages = kafkaStream.flatMap(lambda s: create_tuple(s[1])) \
                          .reduceByKey(lambda a, b: (int(a) + int(b)) / 2)
    messages1 = messages.filter(lambda s: s[1] > 0)
    messages1.pprint()

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
def createContext(host, port, outputPath):
    # If you do not see this printed, the StreamingContext has been loaded
    # from an existing checkpoint instead of being created anew
    print("Creating new context")
    if os.path.exists(outputPath):
        os.remove(outputPath)
    sc = SparkContext(appName="PythonStreamingRecoverableNetworkWordCount")
    ssc = StreamingContext(sc, 120)

    # Create a socket stream on target ip:port and count the
    # words in the input stream of \n delimited text (e.g. generated by 'nc')
    lines = ssc.socketTextStream(host, port)
    print('\n\n\nconnectionMade\n\n\n')
    addresses = lines.map(splitLine)
    transactionsum = addresses.map(lambda x: (x[0], (1, x[1]))) \
                              .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))

    def echo(time, rdd):
        counts = "Counts at time %s %s" % (time, rdd.collect())
        print(counts)
        print("Appending to " + os.path.abspath(outputPath))
        with open(outputPath, 'a') as f:
            f.write(counts + "\n")

    transactionsum.foreachRDD(echo)
    return ssc
def ss_direct_kafka_bucket_counter(brokers, topic, bucket_interval, output_msg, message_parse, valueDecoder=None):
    """Starts a Spark Streaming job from a Kafka input and parses message time

    WARNING!! This function only works for Spark 1.4.0+

    Args:
        brokers: the Kafka brokers to read the topic from
        topic: the Kafka topic to consume
        bucket_interval: the time interval in seconds (int) that the job will bucket

    Returns:
        None
    """
    sc = SparkContext(appName="PythonKafkaBucketCounter")
    ssc = StreamingContext(sc, bucket_interval + 5)

    if valueDecoder:
        kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers},
                                            valueDecoder=valueDecoder)
    else:
        kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})

    lines = kvs.map(lambda x: x[1])
    interval_counts = lines.map(lambda line: (message_parse(line), 1)).reduceByKey(lambda a, b: a + b)

    output_msg_func = output_msg(sc, ssc)
    interval_counts.foreachRDD(output_msg_func)

    ssc.start()
    ssc.awaitTermination()
class BaseStreamingTestCase(unittest.TestCase):
    """ From https://github.com/apache/spark/blob/
    master/python/pyspark/streaming/tests.py """

    timeout = 10  # seconds
    duration = .5

    def setUp(self):
        self.ssc = StreamingContext(sc, self.duration)

    def tearDown(self):
        self.ssc.stop(False)

    def wait_for(self, result, n):
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print("timeout after", self.timeout)

    def _collect(self, dstream, n):
        result = []

        def get_output(_, rdd):
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)
        self.ssc.start()
        self.wait_for(result, n)
        return result
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "localhost:9092"
    topics = ['test']
    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics,
                                                       kafkaParams={"metadata.broker.list": brokers})

    # note: the second element of each tuple is the Kafka message received from the stream
    lines1 = kafkaStreams_lines.map(lambda x: x[1])
    words = lines1.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    wordcounts = pairs.reduceByKey(lambda x, y: x + y)
    print(wordcounts)

    kafkaStreams_lines.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)

    wordcounts.pprint()  # report the distribution of the generated random numbers

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "192.192.0.27:9092"
    topics = ['topic7']
    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics,
                                                       kafkaParams={"metadata.broker.list": brokers})

    # note: the second element of each tuple is the Kafka message received from the stream
    lines1 = kafkaStreams_lines.map(lambda x: x[1])
    words = lines1.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    wordcounts = pairs.reduceByKey(lambda x, y: x + y)
    wordcounts.saveAsTextFiles("/var/lib/hadoop-hdfs/spark-libin/kafka")

    wordcounts.pprint()  # report the distribution of the generated random numbers

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
def createStreamingContext():
    # Create a StreamingContext on the cluster master with a batch interval of 2 seconds
    sc = SparkContext("spark://%s:7077" % MASTER_NAME, appName="GlutenTweet", pyFiles=PYFILES)
    ssc = StreamingContext(sc, 2)

    # Create a DStream of raw data
    raw = ssc.socketTextStream(MASTER_IP, 9999)

    # Convert into models
    tweets = raw.map(lambda r: Tweet(raw_json=r))

    # Store models
    tweets.foreachRDD(storeTweetsRDD)

    # Sliding window analysis
    window = tweets.window(20 * 60, 30)
    hashtagCounts = analysisHahtagCount(window)
    streamTop(hashtagCounts).pprint()

    # Keyword extraction - note tweets is immutable
    tweetsKeyword = tweets.map(lambda t: keywordExtraction(t))

    # Update models
    tweetsKeyword.foreachRDD(updateTweetsRDD)

    # Sliding window analysis
    window2 = tweetsKeyword.window(20 * 60, 30)
    keywordCounts = analysisKeywordCount(window2)
    streamTop(keywordCounts).pprint()

    ssc.checkpoint(CHECKPOINT_DIR)
    return ssc
def main():
    conf = SparkConf().setMaster("local[2]").setAppName("Streamer")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)  # Create a streaming context with a batch interval of 10 sec
    ssc.checkpoint("checkpoint")

    geolocator = Nominatim()
    stream(ssc, geolocator, 100)
def main():
    if len(sys.argv) != 4:
        print("Usage: kafka_wordcount.py <zk> <topic> <timeout>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)

    timeout = None
    if len(sys.argv) == 4:
        zk, topic, timeout = sys.argv[1:]
        timeout = int(timeout)
    else:
        zk, topic = sys.argv[1:]

    kvs = KafkaUtils.createStream(
        ssc, zk, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    kwargs = {}
    if timeout:
        kwargs['timeout'] = timeout

    ssc.start()
    ssc.awaitTermination(**kwargs)
def main():
    sc = SparkContext(appName="IntrusionDetector")
    ssc = StreamingContext(sc, batch_durations)

    kvs = KafkaUtils.createDirectStream(ssc, [input_topic], {"metadata.broker.list": broker})
    kvs.foreachRDD(processRDD)

    ssc.start()
    ssc.awaitTermination()
def kafka_spark_streaming_sql_main(app_name, brokers, topic, interval_seconds, sql_function):
    sc = SparkContext(appName=app_name)
    sqlContext = SQLContext(sc)
    # ssc = StreamingContext(sc, interval_seconds)
    ssc = StreamingContext(sc, 10)
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    kvs.foreachRDD(sql_function)
    ssc.start()
    ssc.awaitTermination()
def main():
    conf = SparkConf().setMaster("local[2]").setAppName("Streamer")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)  # Create a streaming context with a batch interval of 10 sec
    ssc.checkpoint("checkpoint")

    pwords = load_wordlist("positive.txt")
    nwords = load_wordlist("negative.txt")
    counts = stream(ssc, pwords, nwords, 100)
    make_plot(counts)
def read_tweets():
    sc = SparkContext(appName="sentimentProducer")
    ssc = StreamingContext(sc, 600)  # test: 60 seconds
    brokers = "localhost:9092"
    kvs = KafkaUtils.createDirectStream(ssc, ["test"], {"metadata.broker.list": brokers})
    kvs.foreachRDD(create_format)
    producer.flush()
    ssc.start()
    ssc.awaitTermination()
def functionToCreateContext():
    sc = SparkContext(appName="StreamingExampleWithKafka")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")
    opts = {"metadata.broker.list": "node1.example.com:6667,node2.example.com:6667"}
    kvs = KafkaUtils.createDirectStream(ssc, ["mytopic"], opts)
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .updateStateByKey(updateFunction)
    counts.pprint()
    return ssc
def setup():
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 2)
    dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
    wc = dstream.updateStateByKey(updater)
    wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
    wc.checkpoint(2)
    self.setupCalled = True
    return ssc
def setup():
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 0.5)

    # A function that cannot be serialized
    def process(time, rdd):
        sc.parallelize(range(1, 10))

    ssc.textFileStream(inputd).foreachRDD(process)
    return ssc
def main():
    conf = SparkConf().setAppName("kafka_source_mongo_sink_pymongo_filtered")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    try:
        kafka_streams = KafkaUtils.createStream(ssc, "localhost:2181", "spark-streaming-consumer",
                                                {"splash_json": 2})
        kafka_streams.foreachRDD(process_rdd)
    except Exception as e:
        print(e)
    ssc.start()
    ssc.awaitTermination()
def invoke():
    # object to keep track of offsets
    ConfigInitializer.basic_config()

    # app name
    application_name = "mon_metrics_kafka"

    my_spark_conf = SparkConf().setAppName(application_name)
    spark_context = SparkContext(conf=my_spark_conf)

    # read at the configured interval
    spark_streaming_context = \
        StreamingContext(spark_context, cfg.CONF.service.stream_interval)

    kafka_stream = MonMetricsKafkaProcessor.get_kafka_stream(
        cfg.CONF.messaging.topic,
        spark_streaming_context)

    # transform to recordstore
    MonMetricsKafkaProcessor.transform_to_recordstore(kafka_stream)

    # catch interrupt, stop streaming context gracefully
    # signal.signal(signal.SIGINT, signal_handler)

    # start processing
    spark_streaming_context.start()

    # FIXME: stop spark context to relinquish resources
    # FIXME: specify cores, so as not to use all the resources on the cluster.
    # FIXME: HA deploy multiple masters, may be one on each control node

    try:
        # Wait for the Spark driver to "finish"
        spark_streaming_context.awaitTermination()
    except Exception as e:
        MonMetricsKafkaProcessor.log_debug(
            "Exception raised during Spark execution : " + str(e))
        # One exception that can occur here is the result of the saved
        # kafka offsets being obsolete/out of range. Delete the saved
        # offsets to improve the chance of success on the next execution.

        # TODO(someone) prevent deleting all offsets for an application,
        # but just the latest revision
        MonMetricsKafkaProcessor.log_debug(
            "Deleting saved offsets for chance of success on next execution")

        MonMetricsKafkaProcessor.reset_kafka_offsets(application_name)

        # delete pre hourly processor offsets
        if cfg.CONF.stage_processors.pre_hourly_processor_enabled:
            PreHourlyProcessor.reset_kafka_offsets()
class MLLibStreamingTestCase(unittest.TestCase):
    def setUp(self):
        self.sc = sc
        self.ssc = StreamingContext(self.sc, 1.0)

    def tearDown(self):
        self.ssc.stop(False)

    @staticmethod
    def _ssc_wait(start_time, end_time, sleep_time):
        while time() - start_time < end_time:
            sleep(0.01)
def createContext(conf):
    spConf = conf.getSparkConf()
    sc = SparkContext(conf=spConf)
    ssc = StreamingContext(sc, conf.INTERVAL)
    ssc.remember(conf.REMEMBER)

    # get reader
    lines = conf.getReader(ssc)

    # use window
    lines = lines.window(conf.WINDOW, conf.WINDOW)
    lines = lines.map(lambda line: jsonDecode(line))
    deal(lines, conf)
    return ssc
def createContext():
    uBATCH_INTERVAL = 10
    sc = SparkContext(SPARK_MASTER, appName="StreamingKafka")
    sc.broadcast(batchUserPostDict)
    sc.broadcast(batchPostUserDict)
    # sc = SparkContext("local[*]", appName="StreamingKafka")

    # streaming batch interval of 5 sec first, and reduce later to 1 sec or lower
    ssc = StreamingContext(sc, uBATCH_INTERVAL)
    ssc.checkpoint(CHECKPOINT_DIR)  # set checkpoint directory in HDFS
    # ssc.checkpoint(10 * uBATCH_INTERVAL)
    return ssc


ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, createContext)
def main():
    conf = SparkConf()
    conf.setAppName("TopAirports")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "0")
    conf.set("spark.dynamicAllocation.enabled", "true")

    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)  # Stream every 1 second
    ssc.checkpoint("checkpoint")

    # Clear the Cassandra table
    init_cassandra().execute('TRUNCATE {}'.format(top_airports_table))

    stream_kafka(ssc)
def main():
    parser = OptionParser()
    parser.add_option('', '--enriched_data_path', action='store', dest='enriched_data_path',
                      help='path to write enriched data')
    parser.add_option('', '--model_path', action='store', dest='model_path',
                      help='path for model data')
    parser.add_option('', '--kafka_zookeeper_hosts', action='store', dest='kafka_zookeeper_hosts',
                      help='list of Zookeeper hosts (host:port)')
    parser.add_option('', '--kafka_broker_list', action='store', dest='kafka_broker_list',
                      help='list of Kafka brokers (host:port)')
    parser.add_option('', '--kafka_message_topic', action='store', dest='kafka_message_topic',
                      help='topic to consume input messages from')
    parser.add_option('', '--kafka_alert_topic', action='store', dest='kafka_alert_topic',
                      help='topic to produce alert messages to')
    parser.add_option('', '--kafka_enriched_data_topic', action='store', dest='kafka_enriched_data_topic',
                      help='topic to produce enriched data to')
    parser.add_option('', '--streaming_batch_duration_sec', type='float', default=15.0,
                      action='store', dest='streaming_batch_duration_sec',
                      help='Streaming batch duration in seconds')
    parser.add_option('', '--max_batches', type='int', default=0,
                      action='store', dest='max_batches',
                      help='Number of batches to process (0 means forever)')
    options, args = parser.parse_args()

    sc = SparkContext()
    ssc = StreamingContext(sc, options.streaming_batch_duration_sec)
    sqlContext = getSqlContextInstance(sc)

    # Load saved model.
    model = None
    if options.model_path:
        model = RandomForestModel.load(sc, options.model_path)
    else:
        print('No model loaded.')

    # Create Kafka stream to receive new messages.
    kvs = KafkaUtils.createDirectStream(ssc, [options.kafka_message_topic], {
        'metadata.broker.list': options.kafka_broker_list,
        'group.id': 'spark_streaming_processor.py'})

    # Take only the 2nd element of the tuple.
    messages = kvs.map(lambda x: x[1])

    # Convert RDD of JSON strings to RDD of Rows.
    rows = messages.map(json_to_row)

    # Process messages.
    rows.foreachRDD(lambda time, rdd: process_messages(time, rdd,
        ssc=ssc,
        model=model,
        enriched_data_path=options.enriched_data_path,
        zookeeper_hosts=options.kafka_zookeeper_hosts,
        kafka_alert_topic=options.kafka_alert_topic,
        kafka_enriched_data_topic=options.kafka_enriched_data_topic,
        max_batches=options.max_batches))

    ssc.start()
    ssc.awaitTermination()
def main():
    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)

    zkQuorum = "localhost:2181"
    topic = "twitter_raw"
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})

    lines = kvs.map(lambda x: pickle.loads(x[1].decode("utf-8"))["text"])       # fetch the text
    count = lines.map(lambda line: len(line.split())).reduce(add)               # split into words and count
    count.foreachRDD(publishToRedis)                                            # publish to Redis
    count.pprint()

    ssc.start()
    ssc.awaitTermination()
def createContext():
    conf = SparkConf().setMaster('spark://{}:7077'.format(MASTER_URL)).set('spark.executor.memory', '2g')
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, STREAMING_INTERVAL)
    lines = ssc.textFileStream('hdfs://{}/data/on_time/streaming/'.format(MASTER_URL))
    ssc.checkpoint(CHECKPOINT_DIR)

    # main split-combine-apply logic put here
    pairs = lines.map(lambda x: x.split(",")).map(lambda x: (x[8], 1))
    runningCounts = pairs.updateStateByKey(updateFunction)
    sortedCounts = runningCounts.transform(
        lambda rdd: rdd.sortBy(lambda airport_freq: airport_freq[1], ascending=False))
def start(self):
    sc = SparkContext(appName="PythonStreamingNOTHS")
    ssc = StreamingContext(sc, 10)

    kvs = KafkaUtils.createStream(ssc, self.zkQuorum, "spark-streaming-consumer", {self.topic: 1})
    print('******* Event received in window: ', kvs.pprint())

    if self.topic == 'NOTHS-crawler-topic':
        kvs.foreachRDD(self.save_crawler_hbase)
    elif self.topic == 'NOTHS-trends-topic':
        kvs.foreachRDD(self.save_trends_hbase)

    ssc.start()
    ssc.awaitTermination()
class xStreamProcessor:
    ip = socket.gethostbyname(socket.gethostname())
    port = 9999
    dstream = None
    sc = None
    ssc = None

    # def __init__(self, ip=None, port=None, spark_master='spark://localhost:7077'):
    def __init__(self, ip=None, port=None, spark_master='mesos://10.0.2.85:5050'):
        if ip is not None:
            self.ip = ip
        if port is not None:
            self.port = port

        self.sc = SparkContext(master=spark_master, appName='StreamProcessor')
        self.ssc = StreamingContext(self.sc, 1)
        # self.ssc.checkpoint(directory=None)

        hiveContext = HiveContext(self.sc)
        hiveContext.sql('DROP TABLE IF EXISTS default.tweet_stream')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.tweet_stream '
                        '(ip STRING, port STRING, date_time STRING, user STRING, msg STRING)')
        hiveContext.sql('DROP TABLE IF EXISTS default.email_stream')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.email_stream (ip STRING, port STRING, date_time STRING, '
                        'fr STRING, to STRING, subject STRING, content STRING, subject_sentiment INT, content_sentiment INT, '
                        'subject_power INT, content_power INT, subject_topic INT, content_topic INT, fraud_score DOUBLE)')
        hiveContext.sql('DROP TABLE IF EXISTS default.email_graph')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.email_graph (fr STRING, to STRING, dt STRING)')
        hiveContext.sql('DROP TABLE IF EXISTS default.trans_stream')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.trans_stream (ip STRING, port STRING, date_time STRING, user STRING, amount DOUBLE, '
                        'big_trans INT, is_in_odd_day INT, is_at_odd_time INT)')

        self.dstream = self.ssc.socketTextStream(self.ip, self.port)
        self.process_stream()

        self.ssc.start()
        self.ssc.awaitTermination()

    def process_stream(self):
        parts = self.dstream.flatMap(lambda line: line.split("|"))
        words = parts.map(lambda p: p[3])
        pairs = words.map(lambda word: (word, 1))
        wordCounts = pairs.reduceByKey(lambda x, y: x + y)

        # Print the first ten elements of each RDD generated in this DStream to the console
        wordCounts.pprint()
def main():
    global ssc
    conf = SparkConf()
    conf.setAppName("TopAirports")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "0")
    conf.set('spark.streaming.stopGracefullyOnShutdown', True)

    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)  # Stream every 1 second
    ssc.checkpoint("/tmp/checkpoint")

    signal.signal(signal.SIGINT, stop_streaming)

    stream_kafka()
def createStreamingContext():
    conf = SparkConf().setMaster("local[2]").setAppName("amqp_temperature")
    conf.set("spark.streaming.receiver.writeAheadLog.enable", "true")

    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("/tmp/spark-streaming-amqp")

    receiveStream = AMQPUtils.createStream(ssc, "localhost", 5672, "temperature")

    temperature = receiveStream.map(getTemperature)
    max = temperature.reduceByWindow(getMax, None, 5, 5)

    max.pprint()

    return ssc
def main():
    brokers = 'localhost:9092'
    topic = 'openbmp.parsed.unicast_prefix'
    sc = SparkContext(appName='BGPPrefixOriginValidation')
    ssc = StreamingContext(sc, 2)

    directKafkaStream = KafkaUtils.createDirectStream(ssc, [topic], {'metadata.broker.list': brokers})
    # directKafkaStream.pprint()

    lines = directKafkaStream.flatMap(lambda x: x[1].splitlines()) \
                             .filter(lambda line: line.startswith('add'))
    structured_rdd = lines.map(structure_data)
    structured_rdd.foreachRDD(lambda rdd: rdd.foreachPartition(validate_bgp_prefix))

    ssc.start()
    ssc.awaitTermination()
            break
        string = ",".join(Hlist)
        print(string)
        # Hlist = HList[0:-1]


if __name__ == "__main__":
    window_size, batch_size = int(sys.argv[1]), int(sys.argv[2])
    conf = SparkConf()
    conf.setAppName("BigData")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, int(batch_size))
    # ssc.checkpoint("/home/cdiya/Downloads/checkpoints")
    ssc.checkpoint("/checkpoint_BIGDATA")

    lines = ssc.socketTextStream("localhost", 9009)
    lines = lines.window(int(window_size), 1)
    # lines.pprint()
    words = lines.map(lambda line: line.split(";")[7])
    # words.pprint()
    words = words.flatMap(lambda x: x.split(","))
    # words.pprint()
    hashtag = words.map(lambda x: (x, 1))
    # hashtag.pprint()
    # hashtag = hashtag.rdd
# Note: credentials will be pulled from the IAM role assigned to the EMR nodes.
# Make sure permissions are set properly for access to your Kinesis stream.

# define variables
s3_target_bucket_name = 'mattsona-spark-demo'               # replace with your bucket name for target data
aws_region = 'us-west-2'                                    # replace with the AWS region used for the Kinesis stream
kinesis_stream = 'spark_streaming_kinesis_demo'             # replace with your Kinesis stream name
kinesis_endpoint = 'https://kinesis.' + aws_region + '.amazonaws.com'  # the public endpoint of the AWS region this is executed from
kinesis_app_name = 'alex_test_app'                          # app name used to track progress through the Kinesis stream
kinesis_initial_position = InitialPositionInStream.LATEST   # InitialPositionInStream.TRIM_HORIZON | InitialPositionInStream.LATEST
kinesis_checkpoint_interval = 10                            # how long between checkpoints when processing the Kinesis stream
spark_batch_interval = 10                                   # how many seconds before pulling the next batch of data from the Kinesis stream

# configure spark elements
spark_context = SparkContext(appName=kinesis_app_name)
# spark_streaming_context = StreamingContext(sc, 1)  # sc valid for running in pyspark interactive mode
spark_streaming_context = StreamingContext(spark_context, spark_batch_interval)
kinesis_stream = KinesisUtils.createStream(
    spark_streaming_context,
    kinesis_app_name,
    kinesis_stream,
    kinesis_endpoint,
    aws_region,
    kinesis_initial_position,
    kinesis_checkpoint_interval
)  # previous example had ', StorageLevel.MEMORY_AND_DISK_2' at the end of the call

# take Kinesis stream JSON data and convert to CSV
# (note: these are still DStreams, not RDDs, so the "rdd" naming below is inaccurate)
py_dict_rdd = kinesis_stream.map(lambda x: json.loads(x))
# need to convert int (time_stamp & random_int) to string
csv_rdd = py_dict_rdd.map(lambda x: x['user_name'] + ',' +
                          str(datetime.datetime.utcfromtimestamp(x['time_stamp'])) + ',' +
                          x['data_string'] + ',' +
                          str(x['random_int']))
# save that rdd to S3
empty_intervals = sc.accumulator(0)
images = sc.accumulator(0)
correct_preds_tot = sc.accumulator(0)

# Load model trained using BDL_KERAS_CIFAR_CNN.py
model = Model.loadModel(model_defs_path, model_weights_path)
print('%s.%03dZ: Loaded trained model definitions %s and weights %s' %
      (strftime("%Y-%m-%dT%H:%M:%S", gmtime()), (time() * 1000) % 1000,
       model_defs_path, model_weights_path))
print('%s.%03dZ: Starting reading streaming data from %s:%d at interval %s seconds' %
      (strftime("%Y-%m-%dT%H:%M:%S", gmtime()), (time() * 1000) % 1000,
       IP_address, port, reporting_interval))

# Initialize StreamingContext and have it read a text stream through a socket
ssc = StreamingContext(sc, reporting_interval)
image_stream = ssc.socketTextStream(IP_address, port)

# Run model on each batch
image_stream.foreachRDD(run_model)

# Start reading streaming data
ssc.start()
start_time = time()
ssc.awaitTermination()

# Subtract empty intervals and the time taken to shut down the stream
elapsed_time = time() - start_time - empty_intervals.value * reporting_interval - 2.4

print('\n%s.%03dZ: %d images received in %.1f seconds (%d intervals), or %.0f images/second '
      'Correct predictions: %d Pct correct: %.1f' %
      (strftime("%Y-%m-%dT%H:%M:%S", gmtime()), (time() * 1000) % 1000,
       images.value, elapsed_time, interval.value,
def StreamingInit_Old(self):
    # The old-style Streaming API is simple. The pattern is just to instantiate a
    # StreamingContext, then pull data incrementally via textFileStream or a socket.
    self.SpkStream = StreamingContext(self.SpkCont, Config.BATCHDUR)
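# For reference, a minimal sketch of the old-style pattern the comment above describes.
# Assumptions not taken from the snippet: a local master, a 5-second batch interval,
# and a placeholder monitored directory "/tmp/incoming".
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "OldStyleStreamingSketch")  # assumed local context
ssc = StreamingContext(sc, 5)                             # 5-second micro-batches

# Incrementally pick up new text files dropped into the monitored directory
lines = ssc.textFileStream("/tmp/incoming")               # placeholder path
lines.count().pprint()                                    # print the per-batch record count

ssc.start()
ssc.awaitTermination()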
# ascii_encode = lambda x: x.encode('ascii')
# return dict(map(ascii_encode, pair) for pair in data.items())


def helper(data):
    return data.encode('ascii')


def enc(data):
    result = {k: helper(v) for k, v in data.items()}
    return result
    # return dict(map(lambda line: line.encode('ascii'), pair.value) for pair in data.items())


conf = SparkConf().setMaster("local[*]").setAppName("StreamingDirectKafka")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 10)

zkQuorum = "localhost:2181"
topic = ["meetup"]
kafkaParams = {"metadata.broker.list": "localhost:9092"}
kafkaStream = KafkaUtils.createDirectStream(ssc, topic, kafkaParams)

# stream = ssc.receiverStream(
#     MeetupReceiver("https://stream.meetup.com/2/rsvps")
# )

# data = kafkaStream.map(lambda line: json.loads(line))

rsvp = kafkaStream.map(lambda line: line[1])
rsvp2 = rsvp.map(lambda line: json.loads(line.encode("ascii", "ignore")))
sc.setLogLevel('ERROR')

codes = sc.parallelize([(1, 'alpha'), (2, 'beta'), (3, 'delta'), (4, 'gamma')])
codes = getSparkSessionInstance(config).createDataFrame(codes, schema='id:int, name:string')
codes.createOrReplaceTempView('codes')
print(codes.collect())


def process(time, rdd):
    try:
        spark = getSparkSessionInstance(rdd.context.getConf())
        rdd1 = rdd.map(lambda x: x[1].split(',')) \
                  .map(lambda x: (int(x[0]), float(x[1])))
        df = spark.createDataFrame(rdd1, schema='id:int, amount:float')
        df.createOrReplaceTempView('newdata')
        join = spark.sql('select n.id, c.name, n.amount from newdata as n join codes as c on n.id = c.id')
        join.show()
    except:
        print(rdd.collect())


ssc = StreamingContext(sc, 5)
kafkaStream = KafkaUtils.createStream(ssc, '127.0.0.1:2181', 'spark-streaming', {'classroom': 1})
# kafkaStream.pprint()
kafkaStream.foreachRDD(process)

ssc.start()
ssc.awaitTerminationOrTimeout(10000)
ssc.stop()
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Hello pyspark streaming
sc = SparkContext(master='local[4]')
ssc = StreamingContext(sc, 5)

sts = ssc.socketTextStream('localhost', 9999)
fm = sts.flatMap(lambda x: x.split(' ')) \
        .map(lambda y: (y, 1)) \
        .reduceByKey(lambda x, y: x + y)
fm.pprint()

ssc.start()
ssc.awaitTermination()
# Extract words
tokenizer = Tokenizer().setInputCol("message").setOutputCol("words")

# Remove custom stopwords
stopwords = StopWordsRemover().getStopWords() + ["-"]
remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")

# Create features
hashingTF = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
pipeline = Pipeline().setStages([tokenizer, remover, hashingTF])

# Transform train and test streams
featured = pipeline.fit(df).transform(df)
featured_test = pipeline.fit(df_test).transform(df_test)

###########################################

ssc = StreamingContext(sc, 1)

# Read a DStream from JSON files in the monitored dir for training
trainingData = ssc.textFileStream(get_hdfs_filepath(file_name="train_stream_json/")).map(parse_json_line)
trainingData.pprint()

# Read a DStream from JSON files in the monitored dir for prediction
testData = ssc.textFileStream(get_hdfs_filepath(file_name="test_stream_json/")).map(parse_json_line)
testData.pprint()

numFeatures = 10

# Initialize a StreamingLinearRegression model
model = StreamingLinearRegressionWithSGD()
model.setInitialWeights([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

# Train the model on the training DStream
model.trainOn(trainingData)
def extract_url_request(line):
    exp = pattern.match(line)
    if exp:
        request = exp.groupdict()["request"]
        if request:
            request_fields = request.split()
            if len(request_fields) > 1:
                return request_fields[1]


if __name__ == "__main__":
    sc = SparkContext(appName="StreamingFlumeLogAggregator")
    sc.setLogLevel("ERROR")

    batch_interval_s = 1
    ssc = StreamingContext(sc, batch_interval_s)

    flumeStream = FlumeUtils.createStream(ssc, "localhost", 9092)

    lines = flumeStream.map(lambda x: x[1])
    urls = lines.map(extract_url_request)

    # Reduce by URL over a 5-minute window sliding every second
    # Reduce: count for each distinct URL
    window_interval = 300
    slide_interval = 1
    url_counts = urls.map(lambda x: (x, 1)).reduceByKeyAndWindow(
        lambda x, y: x + y,
        lambda x, y: x - y,
        window_interval,
        slide_interval)

    # Sort and print the results
from pyspark.sql import types
import json
import csv
from json import loads
from flatten_json import flatten
from time import sleep
# import pandas as pd

print("PROGRAM START!!!")
print("PROGRAM START!!!")
print("PROGRAM START!!!")
print("PROGRAM START!!!")

sc = SparkContext()
ssc = StreamingContext(sc, 10)
sqlc = SQLContext(sc)

directKafkaStream = KafkaUtils.createDirectStream(ssc, ["kafkaNBA"],
                                                  {"metadata.broker.list": "localhost:9099"})
lines = directKafkaStream.map(lambda x: x[1])

print("LINES START!!!")
print("LINES START!!!")
print("LINES START!!!")
print("LINES START!!!")


def transformer(rdd):
    my_obj = json.loads(rdd)
    return my_obj["player"]["weight_pounds"]


transform = lines.map(transformer)
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SQLContext
import sys
import requests

# create spark configuration
conf = SparkConf()
conf.setAppName("TwitterStreamApp")

# create spark context with the above configuration
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

# create the Streaming Context from the above spark context with an interval size of 2 seconds
ssc = StreamingContext(sc, 2)
# setting a checkpoint to allow RDD recovery
ssc.checkpoint("checkpoint_TwitterApp")
# read data from port 9009
dataStream = ssc.socketTextStream("localhost", 9009)


def aggregate_tags_count(new_values, total_sum):
    return sum(new_values) + (total_sum or 0)


def get_sql_context_instance(spark_context):
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = SQLContext(spark_context)
    return globals()['sqlContextSingletonInstance']


def process_rdd(time, rdd):
    print("----------- %s -----------" % str(time))
from pyspark.streaming import StreamingContext

dvc = [[-0.1, -0.1], [0.1, 0.1], [1.1, 1.1], [0.75, 0.75], [0.9, 0.9]]
dvc = [sc.parallelize(i, 1) for i in dvc]
ssc = StreamingContext(sc, 2.0)
input_stream = ssc.queueStream(dvc)


def get_output(rdd):
    rdd_data = rdd.collect()
    if 0.75 in rdd_data:
        print("Ending marker found", rdd_data)
        ssc.stop()
    else:
        print("Not found ending marker. Continuing")
        print(rdd_data)


input_stream.foreachRDD(get_output)
ssc.start()
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if __name__ == '__main__':
    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)

    kstream = KafkaUtils.createDirectStream(ssc, topics=['CodeSubmission'],
                                            kafkaParams={"metadata.broker.list": '52.53.157.26:9092'})
    data = kstream.map(lambda x: x[1].encode("utf-8"))
    data.pprint()

    ssc.start()
    ssc.awaitTerminationOrTimeout(30)
    ssc.stop(stopGraceFully=True)
conf.setAppName("Spark Streaming Examples") ## Initialize SparkContext. Run only once. Otherwise you get multiple #Context Error. #for streaming, create a spark context with 2 threads. sc = SparkContext('local[4]', conf=conf) from pyspark.streaming import StreamingContext #............................................................................ ## Streaming with TCP/IP data #............................................................................ #Create streaming context with latency of 1 streamContext = StreamingContext(sc, 3) totalLines = 0 lines = streamContext.socketTextStream("localhost", 9000) #Word count within RDD words = lines.flatMap(lambda line: line.split(" ")) pairs = words.map(lambda word: (word, 1)) wordCounts = pairs.reduceByKey(lambda x, y: x + y) wordCounts.pprint(5) #Count lines totalLines = 0 linesCount = 0
    print(top[1][0] + str(',') + top[2][0] + str(',') + top[3][0] + str(',') + top[4][0] + str(',') + top[5][0])


# pass window size and batch duration as command line arguments
WindowSize = int(sys.argv[1])
BatchDuration = int(sys.argv[2])
# print(WindowSize, BatchDuration)

conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)

ssc = StreamingContext(sc, BatchDuration)  # passing batch duration
ssc.checkpoint("/checkpoint_BIGDATA")

socket_stream = ssc.socketTextStream("localhost", 9009)  # stream the lines
lines = socket_stream.window(WindowSize)

cols = lines.flatMap(lambda line: [line.split(";")])  # split csv line into cols
# count = cols.reduceByKey(lambda x, y: x + y)
# cols.pprint()
hashtags = cols.flatMap(lambda col: col[7].split(","))  # split hashtag col into hashtags
hashtag_pairs = hashtags.map(lambda hashtag: (hashtag, 1))  # make (hashtag, 1) tuple
def __init__(self):
    self.ssc = StreamingContext(sc, 1)
def hasht(x):
    # parts = x.split(',')
    parts = filter(None, x.split(','))
    for i in parts:
        return i


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: pagerank <Window Size> <Batch Duration>", file=sys.stderr)
        sys.exit(-1)

    window_size = int(sys.argv[1])
    batch_durn = int(sys.argv[2])

    conf = SparkConf()
    conf.setAppName("BigData")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, batch_durn)
    ssc.checkpoint("~/checkpoint_BIGDATA")

    dataStream = ssc.socketTextStream("localhost", 9009)
    # dataStream.pprint()
    tweet = dataStream.map(lambda x: tmp(x))
    # tweet.pprint()
    tweet = tweet.map(lambda x: hasht(x)).filter(lambda x: x is not None)
    # tweet.pprint()

    totalcount = tweet.countByValueAndWindow(window_size, 1)
    # totalcount.pprint()

    # To perform an operation on each RDD
    totalcount.foreachRDD(process_rdd)

    ssc.start()
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

conf = SparkConf()
conf.set("spark.master", "yarn")
conf.set("spark.app.name", "streamingapp")
sc = SparkContext(conf=conf)
streamc = StreamingContext(sc, batchDuration=15)

r1 = sc.textFile("s3://datasets-spark-learning/flat_files/au-500.csv")
ds1 = streamc.textFileStream(
    "s3://datasets-spark-learning/flat_files/csvfiles/")
ds2 = ds1.transform(lambda rdd: rdd.union(r1).map(lambda x: x + "spark"))
ds2.pprint()

streamc.start()
streamc.awaitTermination()
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext


def updateFunc(new_values, last_sum):
    return sum(new_values) + (last_sum or 0)


sc = SparkContext(appName="PyStreamNWC", master="local[*]")
ssc = StreamingContext(sc, 5)
ssc.checkpoint("checkpoint")

lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))

counts = lines.flatMap(lambda line: line.split(" ")) \
              .map(lambda word: (word, 1)) \
              .updateStateByKey(updateFunc) \
              .transform(lambda x: x.sortByKey())

counts.pprint()

ssc.start()
ssc.awaitTermination()
"rank": rank, "origin": origin, "destination": dest, "airlineid": airlineid, "airline": airline_lookup.value[str(airlineid)], "arrdelay": arrdelay }) # Use LOWER characters carriersByPath.saveToCassandra("capstone", "carriersbypath") #main function if __name__ == "__main__": # Configure Spark. Create a new context or restore from checkpoint ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, functionToCreateContext) # get this spark context sc = ssc.sparkContext # http://stackoverflow.com/questions/24686474/shipping-python-modules-in-pyspark-to-other-nodes sc.addPyFile("common.py") # Create a Transformed DStream. Read Kafka from first offset # creating a stream # :param ssc: StreamingContext object # :param zkQuorum: Zookeeper quorum (hostname:port,hostname:port,..). # :param groupId: The group id for this consumer. # :param topics: Dict of (topic_name -> numPartitions) to consume. # Each partition is consumed in its own thread. # :param kafkaParams: Additional params for Kafka
def updateFunction(newValues, runningCount):
    current = (sum(newValues), len(newValues))
    if not runningCount:
        runningCount = current
    else:
        runningCount = (runningCount[0] + current[0], runningCount[1] + current[1])
    return runningCount


if __name__ == '__main__':
    # set up
    sc = SparkContext(appName="q22")
    ssc = StreamingContext(sc, TimeOut)
    brokers = BootStarpServers
    topic = TopicName
    sc.setLogLevel("WARN")
    ssc.checkpoint("/tmp/q22")
    kvs = KafkaUtils.createDirectStream(ssc, [topic], KafkaParams)

    # key logic
    def processRDD(rdd):
        print("start processing rdd...")
        rdd.foreachPartition(save_to_dynamoDB)
        print("rdd processed...")
        print("-----------------------------------------------")

    def save_to_dynamoDB(partition):
from pyspark.streaming.kafka import KafkaUtils

os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"
os.environ['PYSPARK_SUBMIT_ARGS'] = \
    '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.3.1 ' \
    'pyspark-shell'

spark = SparkSession \
    .builder \
    .appName("word_count") \
    .master("local[*]") \
    .getOrCreate()
sc = spark.sparkContext
ssc = StreamingContext(sparkContext=sc, batchDuration=1)

# the topic to subscribe to
topic_to_sub = ["test"]
# the Kafka broker addresses, comma-separated if there are several
bootstrap_servers = "localhost:9092"
# kafka config info
kafka_params = {"metadata.broker.list": bootstrap_servers}
# initialize a stream that consumes data from Kafka
kafka_stream = KafkaUtils.createDirectStream(ssc=ssc,
                                             topics=topic_to_sub,
                                             kafkaParams=kafka_params)
kafka_stream.pprint()

r = redis.Redis("127.0.0.1")
    stream = KafkaUtils.createDirectStream(ssc, TOPICS, kafkaParams, offsets)
    stream.foreachRDD(process)
    ssc.checkpoint(CHECKPOINT)
    return ssc


parser = argparse.ArgumentParser()
parser.add_argument('topic', help="Kafka topic")
args = parser.parse_args()

if args.topic is None:
    parser.error("A Kafka topic must be specified!")
    sys.exit(1)

# Process data every 10 seconds
PERIOD = 10
BROKERS = 'localhost:9092'
TOPICS = [args.topic]
GROUP_ID = 'group.1'
APP_NAME = 'TwitterStreamML'
CHECKPOINT = '/tmp/%s' % APP_NAME
STREAM_CONTEXT_TIMEOUT = 70

if __name__ == "__main__":
    context = StreamingContext.getOrCreate(CHECKPOINT, functionToCreateContext)
    context.start()
    context.awaitTermination(timeout=STREAM_CONTEXT_TIMEOUT)
    context.stop()
def streaming(sc, reload: int = 5):
    from pyspark.streaming import StreamingContext
    ssc = StreamingContext(sc, reload)
    return ssc
Counts words in UTF8 encoded, '\n' delimited text received from the network every second.

Usage: network_wordcount.py <hostname> <port>

To run this on your local machine, you need to first run a NetCat server
    `$ netcat -l -p 9999`
and then run the example
    `$ $SPARK_HOME/bin/spark-submit network_wordcount.py localhost 9999`
"""
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: network_wordcount.py <hostname> <port>", file=sys.stderr)
        exit(-1)

    sc = SparkContext("local[2]", "NetworkWordCount")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 1)

    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()
    (9453, 3586, 1200548), (-6172, 7805, 264695806), (-75, -98, 177749096), (-1481, -9319, 515114864),
    (1538, -4513, 459014124), (-427, 3347, 105394153), (9563, 6773, 346529937), (-9007, 5215, 833170048),
    (-5316, -3153, 701832096), (-7881, 6554, 211851653), (8047, 5316, 653508160), (3615, -969, 408839209),
    (1678, -5874, 964232482), (-9603, -7771, 612835737), (-2613, -7682, 999683604), (-3867, -206, 73595183),
    (-4841, -1371, 4259718), (-2310, 3912, 775274868), (7567, 9614, 646354995), (-8238, 8253, 844226086),
    (4501, -1611, 498009778), (9240, 2000, 694905063), (7650, 4727, 68326721), (6351, 1386, 280839009),
    (-6909, 3520, 957821259), (-1581, -8095, 885523760), (1090, 5516, 254267011), (4288, -7581, 325047909),
    (-4262, 7348, 3784554), (-7613, -3920, 724353002), (-384, -2708, 395489622), (-8840, 4115, 303185341),
    (6212, -1195, 991066480), (1213, 4812, 498566989), (-640, -7705, 182088090), (-4553, 5934, 452918094),
    (2513, 6315, 355348464), (3426, 1234, 304757776)]

ssc = StreamingContext(sc, batch_dur)
data = ssc.socketTextStream("localhost", port) \
    .map(lambda x: json.loads(x)["city"]) \
    .transform(lambda x: x.distinct()) \
    .window(window_length, sliding_interval) \
    .transform(lambda x: x.distinct()) \
    .map(hashtobit) \
    .flatMap(lambda x: x) \
    .groupByKey() \
    .foreachRDD(combine)  # count()
# inputStream.pprint(10)

with open(sys.argv[2], 'w') as file:
    file.write("Time,Ground Truth,Estimation\n")
# print(hash_tables)

ssc.start()
ssc.awaitTermination()
def forf(x):
    for i in x:
        yield (i, 1)


def rddprint(rdd):
    print(",".join(rdd.take(5)))


conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)

ssc = StreamingContext(sc, 1)
ssc.checkpoint("/checkpoint_BIGDATA")

# Try in outpu1
inputStream = ssc.socketTextStream("localhost", 9009)
dataStream = inputStream.window(int(sys.argv[1]), int(sys.argv[2]))

tweet = dataStream.map(tmp)
septweet = tweet.flatMap(forf)
count = septweet.reduceByKey(lambda x, y: x + y)
sortcount = count.transform(
    lambda rdd: rdd.sortBy(lambda a: a[0], ascending=True))
sortcount1 = sortcount.transform(
    lambda rdd: rdd.sortBy(lambda a: a[1], ascending=False))
tweet1 = sortcount1.filter(lambda w: w[0] != '')
# tweet1.pprint()
res = tweet1.map(lambda a: a[0])
        # return to the pool for future reuse
        # ConnectionPool.returnConnection(connection)


# To Run:
# sudo $SPARK_HOME/bin/spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 kafka-spark-test.py

if __name__ == "__main__":
    # To run on cluster:
    # conf = SparkConf().setAppName("Venmo-Graph-Analytics-Dev").setMaster("spark://ip-172-31-0-135:7077")
    # sc = SparkContext(conf=conf)

    # To run locally:
    sc = SparkContext(appName="Venmo-Graph-Analytics-Dev")

    # Set up resources
    ssc = StreamingContext(sc, 1)  # Set Spark Streaming context

    # brokers = "ec2-50-112-19-115.us-west-2.compute.amazonaws.com:9092,ec2-52-33-162-7.us-west-2.compute.amazonaws.com:9092,ec2-52-89-43-209.us-west-2.compute.amazonaws.com:9092"
    brokers = "ec2-52-25-139-222.us-west-2.compute.amazonaws.com:9092"
    topic = 'Venmo-Transactions-Dev'

    kafka_stream = KafkaUtils.createDirectStream(
        ssc, [topic], {"metadata.broker.list": brokers})

    transaction = kafka_stream.map(lambda kafka_response: json.loads(kafka_response[1])) \
        .map(lambda json_body: extract_data(json_body)) \
        .foreachRDD(lambda rdd: rdd.foreachPartition(send_partition))
    # transaction.pprint()

    ssc.start()
    ssc.awaitTermination()
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext, Row


def getSqlContextInstance(sparkContext):
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']


if __name__ == "__main__":
    sc = SparkContext()
    ssc = StreamingContext(sc, 5)

    # Create a socket stream on target ip:port and count the
    # words in the input stream of \n delimited text (e.g. generated by 'nc')
    lines = ssc.socketTextStream("localhost", 9999)
    words = lines.flatMap(lambda line: line.split(" "))

    # Convert RDDs of the words DStream to DataFrame and run a SQL query
    def process(time, rdd):
        print("========= %s =========" % str(time))
        try:
            # Get the singleton instance of SQLContext
            sqlContext = getSqlContextInstance(rdd.context)

            # Convert RDD[String] to RDD[Row] to DataFrame
            rowRdd = rdd.map(lambda w: Row(word=w))
# Spark Streaming vs. Structured Streaming: https://dzone.com/articles/spark-streaming-vs-structured-streaming
"""
Core components:
    pyspark.streaming.StreamingContext  Main entry point for Spark Streaming functionality.
    pyspark.streaming.DStream           A Discretized Stream (DStream), the basic abstraction in Spark Streaming.
"""
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    sc = SparkContext(appName="Streaming")
    ssc = StreamingContext(sc, 1)

    lines = ssc.textFileStream("03_pyspark.streaming.py")  # returns a DStream
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda x: (x, 1)) \
                  .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()