def test_kinesis_stream_api(self):
    # Don't start the StreamingContext because we cannot test it in Jenkins
    KinesisUtils.createStream(
        self.ssc,
        "myAppNam",
        "mySparkStream",
        "https://kinesis.us-west-2.amazonaws.com",
        "us-west-2",
        InitialPositionInStream.LATEST,
        2,
        MetricsLevel.DETAILED,
        StorageLevel.MEMORY_AND_DISK_2,
    )
    KinesisUtils.createStream(
        self.ssc,
        "myAppNam",
        "mySparkStream",
        "https://kinesis.us-west-2.amazonaws.com",
        "us-west-2",
        InitialPositionInStream.LATEST,
        2,
        MetricsLevel.DETAILED,
        StorageLevel.MEMORY_AND_DISK_2,
        "awsAccessKey",
        "awsSecretKey",
    )
def test_kinesis_stream_api(self):
    # Don't start the StreamingContext because we cannot test it in Jenkins
    kinesisStream1 = KinesisUtils.createStream(
        self.ssc, "myAppNam", "mySparkStream",
        "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
        InitialPositionInStream.LATEST, 2, StorageLevel.MEMORY_AND_DISK_2)
    kinesisStream2 = KinesisUtils.createStream(
        self.ssc, "myAppNam", "mySparkStream",
        "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
        InitialPositionInStream.LATEST, 2, StorageLevel.MEMORY_AND_DISK_2,
        "awsAccessKey", "awsSecretKey")
def consume_records(interval=1, StreamName=None, region_name='us-west-2', Bucket=None):
    """Create a local StreamingContext with two working threads and a batch interval."""
    assert StreamName is not None
    endpoint = 'https://kinesis.{}.amazonaws.com/'.format(region_name)
    # client = boto3.client('s3')
    # client.upload_file('kinesis_event_consumer.py', Bucket, 'kinesis_event_consumer.py')
    # print('file uploaded')
    sc, stream_context = initialize_context(interval=interval)
    sc.setLogLevel("ERROR")

    print('create stream')
    stream = KinesisUtils.createStream(
        stream_context, 'EventLKinesisConsumer', StreamName, endpoint,
        region_name, InitialPositionInStream.TRIM_HORIZON, interval)  # or LATEST

    # counts number of events
    event_counts = aggregate_by_event_type(stream)
    global_counts = update_global_event_counts(event_counts)
    global_counts.pprint()

    # Sends data to S3
    global_counts.foreachRDD(lambda rdd: send_record(rdd, Bucket))

    stream_context.start()
    print('stream started')
    stream_context.awaitTermination()
    stream_context.stop()
    sc.stop()
def consume_records(interval=1, StreamName=None, region_name='us-west-2', Bucket=None):
    """Create a local StreamingContext with two working threads and a batch interval."""
    assert StreamName is not None
    endpoint = 'https://kinesis.{}.amazonaws.com/'.format(region_name)
    sc, stream_context = initialize_context(interval=interval)
    sc.setLogLevel("INFO")

    stream = KinesisUtils.createStream(
        stream_context, 'EventLKinesisConsumer', StreamName, endpoint,
        region_name, InitialPositionInStream.LATEST, interval)

    # counts number of events
    event_counts = aggregate_by_event_type(stream)
    global_counts = update_global_event_counts(event_counts)
    global_counts.pprint()

    # Sends data to S3
    global_counts.foreachRDD(lambda rdd: send_record(rdd, Bucket))

    stream_context.start()
    stream_context.awaitTermination()
def test_kinesis_stream(self):
    if not are_kinesis_tests_enabled:
        sys.stderr.write(
            "Skipped test_kinesis_stream (enable by setting environment variable %s=1"
            % kinesis_test_environ_var
        )
        return

    import random

    kinesisAppName = "KinesisStreamTests-%d" % abs(random.randint(0, 10000000))
    kinesisTestUtilsClz = (
        self.sc._jvm.java.lang.Thread.currentThread()
        .getContextClassLoader()
        .loadClass("org.apache.spark.streaming.kinesis.KinesisTestUtils")
    )
    kinesisTestUtils = kinesisTestUtilsClz.newInstance()
    try:
        kinesisTestUtils.createStream()
        aWSCredentials = kinesisTestUtils.getAWSCredentials()
        stream = KinesisUtils.createStream(
            self.ssc,
            kinesisAppName,
            kinesisTestUtils.streamName(),
            kinesisTestUtils.endpointUrl(),
            kinesisTestUtils.regionName(),
            InitialPositionInStream.LATEST,
            10,
            StorageLevel.MEMORY_ONLY,
            aWSCredentials.getAWSAccessKeyId(),
            aWSCredentials.getAWSSecretKey(),
        )

        outputBuffer = []

        def get_output(_, rdd):
            for e in rdd.collect():
                outputBuffer.append(e)

        stream.foreachRDD(get_output)
        self.ssc.start()

        testData = [i for i in range(1, 11)]
        expectedOutput = set([str(i) for i in testData])
        start_time = time.time()
        while time.time() - start_time < 120:
            kinesisTestUtils.pushData(testData)
            if expectedOutput == set(outputBuffer):
                break
            time.sleep(10)
        self.assertEqual(expectedOutput, set(outputBuffer))
    except:
        import traceback

        traceback.print_exc()
        raise
    finally:
        self.ssc.stop(False)
        kinesisTestUtils.deleteStream()
        kinesisTestUtils.deleteDynamoDBTable(kinesisAppName)
def main(appName, streamName, endpointUrl, regionName):
    sc = SparkContext(appName="BestApp")
    ssc = StreamingContext(sc, 10)

    data = KinesisUtils.createStream(
        ssc, appName, streamName, endpointUrl, regionName,
        InitialPositionInStream.LATEST, 10)
    result = data.window(60, 20).foreachRDD(computeGridVal)

    ssc.start()
    ssc.awaitTermination()
def run(self, appName, streamName, endpointUrl, region_name, anomaly_stream_name):
    sc = SparkContext(appName="PythonStreamingKinesisAnomalyDetection")
    print("Initialised SC")

    # TODO: log warn and above only
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.WARN)

    ssc = StreamingContext(sc, 1)
    dstreamRecords = KinesisUtils.createStream(
        ssc, appName, streamName, endpointUrl, region_name,
        InitialPositionInStream.LATEST, 2)

    CloudTrailLogProcessor(
        anomaly_stream_name=anomaly_stream_name,
        region=region_name)\
        .process(sc, ssc, dstreamRecords)

    ssc.start()
    ssc.awaitTermination()
def test_kinesis_stream(self):
    import random

    kinesisAppName = "KinesisStreamTests-%d" % abs(random.randint(0, 10000000))
    kinesisTestUtils = self.ssc._jvm.org.apache.spark.streaming.kinesis.KinesisTestUtils(2)
    try:
        kinesisTestUtils.createStream()
        aWSCredentials = kinesisTestUtils.getAWSCredentials()
        stream = KinesisUtils.createStream(
            self.ssc,
            kinesisAppName,
            kinesisTestUtils.streamName(),
            kinesisTestUtils.endpointUrl(),
            kinesisTestUtils.regionName(),
            InitialPositionInStream.LATEST,
            10,
            MetricsLevel.DETAILED,
            StorageLevel.MEMORY_ONLY,
            aWSCredentials.getAWSAccessKeyId(),
            aWSCredentials.getAWSSecretKey(),
        )

        outputBuffer = []

        def get_output(_, rdd):
            for e in rdd.collect():
                outputBuffer.append(e)

        stream.foreachRDD(get_output)
        self.ssc.start()

        testData = [i for i in range(1, 11)]
        expectedOutput = set([str(i) for i in testData])
        start_time = time.time()
        while time.time() - start_time < 120:
            kinesisTestUtils.pushData(testData)
            if expectedOutput == set(outputBuffer):
                break
            time.sleep(10)
        self.assertEqual(expectedOutput, set(outputBuffer))
    except BaseException:
        import traceback

        traceback.print_exc()
        raise
    finally:
        self.ssc.stop(False)
        kinesisTestUtils.deleteStream()
        kinesisTestUtils.deleteDynamoDBTable(kinesisAppName)
def test_kinesis_stream(self):
    if not are_kinesis_tests_enabled:
        sys.stderr.write(
            "Skipped test_kinesis_stream (enable by setting environment variable %s=1"
            % kinesis_test_environ_var)
        return

    import random
    kinesisAppName = ("KinesisStreamTests-%d" % abs(random.randint(0, 10000000)))
    kinesisTestUtilsClz = \
        self.sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
            .loadClass("org.apache.spark.streaming.kinesis.KinesisTestUtils")
    kinesisTestUtils = kinesisTestUtilsClz.newInstance()
    try:
        kinesisTestUtils.createStream()
        aWSCredentials = kinesisTestUtils.getAWSCredentials()
        stream = KinesisUtils.createStream(
            self.ssc, kinesisAppName, kinesisTestUtils.streamName(),
            kinesisTestUtils.endpointUrl(), kinesisTestUtils.regionName(),
            InitialPositionInStream.LATEST, 10, StorageLevel.MEMORY_ONLY,
            aWSCredentials.getAWSAccessKeyId(),
            aWSCredentials.getAWSSecretKey())

        outputBuffer = []

        def get_output(_, rdd):
            for e in rdd.collect():
                outputBuffer.append(e)

        stream.foreachRDD(get_output)
        self.ssc.start()

        testData = [i for i in range(1, 11)]
        expectedOutput = set([str(i) for i in testData])
        start_time = time.time()
        while time.time() - start_time < 120:
            kinesisTestUtils.pushData(testData)
            if expectedOutput == set(outputBuffer):
                break
            time.sleep(10)
        self.assertEqual(expectedOutput, set(outputBuffer))
    except:
        import traceback
        traceback.print_exc()
        raise
    finally:
        self.ssc.stop(False)
        kinesisTestUtils.deleteStream()
        kinesisTestUtils.deleteDynamoDBTable(kinesisAppName)
def creatingfunc():
    # create streaming context
    ssc = StreamingContext(sc, batchIntervalSeconds)
    LogToKinesis("creatingfunc", "StreamingContext", str(dir(ssc)))
    ssc.remember(10 * batchIntervalSeconds)

    # setup streams
    try:
        # paxRecords = ssc.textFileStream(SOURCE).map(ParsePassengerRecord)  # parse and enrich pax data
        kinesisStream = KinesisUtils.createStream(
            ssc, KINESIS_APPNAME, KINESIS_STREAM, KINESIS_ENDPOINT_URL,
            KINESIS_REGION, InitialPositionInStream.TRIM_HORIZON, 10,
            StorageLevel.MEMORY_AND_DISK_2, ACCESS_KEY, SECRET_KEY)
        LogToKinesis("kinesisStream", "KinesisUtils.createStream", str(dir(kinesisStream)))
        # track total boarding and alighting per train/ownmoduleno
        # Note: rdd returned by updateStateByKey is (ownmoduleno, (alight, board))
        # for easy conversion to dataframe we map this rdd to (ownmoduleno, alight, board).
        # (Not sure why the following did not work: map(lambda k, v: (k, v[0], v[1])))
        """
        noOfPassengersOwnModuleToday = paxRecords.map(lambda record: (record[OWN_MODULE_NO], (record[TOTAL_ALIGHTING], record[TOTAL_BOARDING]))) \
            .updateStateByKey(updatePassengerCount) \
            .map(lambda v: (v[0], v[1][0], v[1][1]))
        paxRecordsWindowStationLine = paxRecords.window(1800, 20)  # compute aggregates on a 30 min window updated every 20 sec
        paxRecordsTable = paxRecords.window(900, 900)  # save to permanent storage every 15 min (how large/small amounts of data is optimal to save at a time?)
        LogToKinesis("creatingfunc", "Streams set up OK")
        """
    except Exception as e:
        LogToKinesis("creatingfunc", "EXCEPTION", str(e))

    # output streams
    try:
        # paxRecords.foreachRDD(processPax)
        # noOfPassengersOwnModuleToday.foreachRDD(processOwnModuleState)  # send sum of alightings and boardings and pax present onboard for each train to Kinesis
        # paxRecordsWindowStationLine.foreachRDD(processStationLineWindow)  # send aggregates to Kinesis periodically, i.e. last 30 mins updated every 20 secs
        # paxRecordsTable.foreachRDD(processTable)  # save to permanent table periodically
        kinesisStream.foreachRDD(processKinesisPax)
    except Exception as e:
        LogToKinesis("mainLoop", "EXCEPTION", str(e))

    ssc.checkpoint(CHECKPOINTDIR)
    return ssc
def run():
    APP_NAME = 'kinesis-stream-test'
    STREAM_NAME = 'MY-TEST-STREAM'
    ENDPOINT_URL = 'https://kinesis.us-east-1.amazonaws.com'
    REGION = 'us-east-1'

    # The time interval to get a new RDD in seconds
    batchInterval = 5
    kinesisCheckpointInterval = batchInterval

    sc = SparkContext(appName=APP_NAME)
    sc.setLogLevel('ERROR')
    ssc = StreamingContext(sc, batchInterval)

    stream = KinesisUtils.createStream(
        ssc=ssc,
        kinesisAppName=APP_NAME,
        streamName=STREAM_NAME,
        endpointUrl=ENDPOINT_URL,
        regionName=REGION,
        initialPositionInStream=InitialPositionInStream.LATEST,
        checkpointInterval=kinesisCheckpointInterval,
        storageLevel=StorageLevel.MEMORY_AND_DISK_2,
    )

    def get_output(_, rdd):
        if len(rdd.take(1)) == 0:
            return
        print('New RDD is coming ...')
        data = rdd.collect()
        for e in data:
            print(e)
        print(f'Data entry count = {len(data)}')

    stream.foreachRDD(get_output)

    ssc.start()
    ssc.awaitTermination()
def consume_records(interval=1, StreamName=None, region_name='us-west-2', port=9876):
    """Create a local StreamingContext with two working threads and a batch interval."""
    assert StreamName is not None
    endpoint = 'https://kinesis.{}.amazonaws.com/'.format(region_name)
    sc, stream_context = initialize_context(interval=interval)
    sc.setLogLevel("INFO")

    kinesis_stream = KinesisUtils.createStream(
        stream_context, 'EventLKinesisConsumer', StreamName, endpoint,
        region_name, InitialPositionInStream.LATEST, interval)
    tcp_stream = stream_context.socketTextStream('localhost', port)

    join_aggregation(kinesis_stream, tcp_stream)

    stream_context.start()
    stream_context.awaitTermination()
def creatingfunc():
    # create streaming context
    ssc = StreamingContext(sc, batchIntervalSeconds)
    LogToKinesis("creatingfunc", "StreamingContext", str(dir(ssc)))
    ssc.remember(10*batchIntervalSeconds)

    # setup streams
    try:
        #paxRecords = ssc.textFileStream(SOURCE).map(ParsePassengerRecord)  # parse and enrich pax data
        kinesisStream = KinesisUtils.createStream(ssc, KINESIS_APPNAME, KINESIS_STREAM, KINESIS_ENDPOINT_URL,
                                                  KINESIS_REGION, InitialPositionInStream.TRIM_HORIZON, 10,
                                                  StorageLevel.MEMORY_AND_DISK_2, ACCESS_KEY, SECRET_KEY)
        LogToKinesis("kinesisStream", "KinesisUtils.createStream", str(dir(kinesisStream)))
        # track total boarding and alighting per train/ownmoduleno
        # Note: rdd returned by updateStateByKey is (ownmoduleno, (alight, board))
        # for easy conversion to dataframe we map this rdd to (ownmoduleno, alight, board).
        # (Not sure why the following did not work: map(lambda k,v: (k,v[0],v[1])))
        """
        noOfPassengersOwnModuleToday = paxRecords.map(lambda record: (record[OWN_MODULE_NO],(record[TOTAL_ALIGHTING], record[TOTAL_BOARDING]))) \
            .updateStateByKey(updatePassengerCount) \
            .map(lambda v: (v[0],v[1][0],v[1][1]))
        paxRecordsWindowStationLine = paxRecords.window(1800,20)  # compute aggregates on a 30 min window updated every 20 sec
        paxRecordsTable = paxRecords.window(900,900)  # save to permanent storage every 15 min (how large/small amounts of data is optimal to save at a time?)
        LogToKinesis("creatingfunc", "Streams set up OK")
        """
    except Exception as e:
        LogToKinesis("creatingfunc", "EXCEPTION", str(e))

    # output streams
    try:
        #paxRecords.foreachRDD(processPax)
        #noOfPassengersOwnModuleToday.foreachRDD(processOwnModuleState)  # send sum of alightings and boardings and pax present onboard for each train to Kinesis
        #paxRecordsWindowStationLine.foreachRDD(processStationLineWindow)  # send aggregates to Kinesis periodically, i.e. last 30 mins updated every 20 secs
        #paxRecordsTable.foreachRDD(processTable)  # save to permanent table periodically
        kinesisStream.foreachRDD(processKinesisPax)
    except Exception as e:
        LogToKinesis("mainLoop", "EXCEPTION", str(e))

    ssc.checkpoint(CHECKPOINTDIR)
    return ssc
def test_kinesis_stream(self):
    import random
    kinesisAppName = ("KinesisStreamTests-%d" % abs(random.randint(0, 10000000)))
    kinesisTestUtils = self.ssc._jvm.org.apache.spark.streaming.kinesis.KinesisTestUtils(2)
    try:
        kinesisTestUtils.createStream()
        aWSCredentials = kinesisTestUtils.getAWSCredentials()
        stream = KinesisUtils.createStream(
            self.ssc, kinesisAppName, kinesisTestUtils.streamName(),
            kinesisTestUtils.endpointUrl(), kinesisTestUtils.regionName(),
            InitialPositionInStream.LATEST, 10, StorageLevel.MEMORY_ONLY,
            aWSCredentials.getAWSAccessKeyId(),
            aWSCredentials.getAWSSecretKey())

        outputBuffer = []

        def get_output(_, rdd):
            for e in rdd.collect():
                outputBuffer.append(e)

        stream.foreachRDD(get_output)
        self.ssc.start()

        testData = [i for i in range(1, 11)]
        expectedOutput = set([str(i) for i in testData])
        start_time = time.time()
        while time.time() - start_time < 120:
            kinesisTestUtils.pushData(testData)
            if expectedOutput == set(outputBuffer):
                break
            time.sleep(10)
        self.assertEqual(expectedOutput, set(outputBuffer))
    except:
        import traceback
        traceback.print_exc()
        raise
    finally:
        self.ssc.stop(False)
        kinesisTestUtils.deleteStream()
        kinesisTestUtils.deleteDynamoDBTable(kinesisAppName)
sc = SparkContext()

# Connect to the hive context of our spark context.
sqlContext = HiveContext(sc)

# Define an external hive table from the PARQUET files stored in S3 to be used to retrieve the schema of the data.
# The schema will be used to parse the messages coming from the Kinesis stream and thus must match it.
sqlContext.sql(
    "CREATE EXTERNAL TABLE IF NOT EXISTS yellow_trips_schema( pickup_timestamp BIGINT, dropoff_timestamp BIGINT, vendor_id STRING, pickup_datetime TIMESTAMP, dropoff_datetime TIMESTAMP, pickup_longitude FLOAT, pickup_latitude FLOAT, dropoff_longitude FLOAT, dropoff_latitude FLOAT, passenger_count INT, trip_distance FLOAT, payment_type STRING, fare_amount FLOAT, extra FLOAT, mta_tax FLOAT, tip_amount FLOAT, tolls_amount FLOAT, total_amount FLOAT, store_and_fwd_flag STRING) STORED AS PARQUET "
    + "LOCATION 's3://<YOUR_BUCKET_NAME>/kinesis-parquet/'")

ssc = StreamingContext(sc, 1)

# Create an RDD of a single row just to get the schema. No data will actually be read except for the schema.
table = sqlContext.sql("select * from yellow_trips_schema limit 1")

# Connect to the Kinesis stream - create an RDD of stream messages
lines = KinesisUtils.createStream(
    ssc, appName, kinesisStreamName,
    'https://kinesis.us-east-1.amazonaws.com', 'us-east-1',
    InitialPositionInStream.LATEST, 2)

# Iterate over messages as they arrive.
lines.foreachRDD(write_lines)

# Since we are using a streaming context we need to tell the streaming context to start polling for new stream events.
ssc.start()
# The line below will keep the job running until it is explicitly stopped.
ssc.awaitTermination()
print("--------------------------------------------------------") for record in iter: temperature = json.loads(record)["value"] if(temperature > 25): sendEmail() print("--------------------------------------------------------") sc = SparkContext() ssc = StreamingContext(sc, 1) streamName = 'lynf-datastream' appName = 'lynf_data' endpointUrl = '<kinesis endpointUrl>' regionName = 'region_name' dstream = KinesisUtils.createStream(ssc, appName, streamName, endpointUrl, regionName, InitialPositionInStream.TRIM_HORIZON, 5) # py_rdd = dstream.map(lambda x: json.loads(x)) dstream.foreachRDD(lambda rdd: rdd.foreachPartition(sendPartition)) # py_rdd.pprint(10) # py_rdd.saveAsTextFiles("s3n://maweijun-test4/lynf_data/output.txt") ssc.start() ssc.awaitTermination() # ssc.stop() ------------------------------------------------------------------------------------------ # 提交任务 pyspark: spark-submit --packages org.apache.spark:spark-streaming-kinesis-asl_2.11:2.4.2 spark-streaming.py
client = boto3.client('kinesis')

sc = SparkContext()
ssc = StreamingContext(sc, 10)
sqlc = SQLContext(sc)

appName = "Lufthansa_1"
streamName = "Lufthansa"
endpointUrl = "https://kinesis.us-east-2.amazonaws.com"
regionName = "us-east-2"
awsAccessKeyId = "AKIAS5TGRVYITEV4Z4MH"
awsSecretKey = "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"

lines = KinesisUtils.createStream(ssc, appName, streamName, endpointUrl, regionName,
                                  InitialPositionInStream.LATEST, 2,
                                  StorageLevel.MEMORY_AND_DISK_2,
                                  awsAccessKeyId, awsSecretKey)


def transformer(rdd):
    my_obj = json.loads(rdd)
    return (my_obj["Departure"]["AirportCode"],
            my_obj["Departure"]["ScheduledTimeLocal"]["DateTime"],
            my_obj["Departure"]["ScheduledTimeUTC"]["DateTime"],
            my_obj["Departure"]["TimeStatus"]["Code"],
            my_obj["Arrival"]["AirportCode"],
            my_obj["Arrival"]["ScheduledTimeLocal"]["DateTime"],
            my_obj["Arrival"]["ScheduledTimeUTC"]["DateTime"],
            my_obj["Arrival"]["TimeStatus"]["Code"],
            my_obj["OperatingCarrier"]["AirlineID"],
            my_obj["OperatingCarrier"]["FlightNumber"],
aws_region = 'us-west-2'  # replace w/ AWS region used for Kinesis stream
kinesis_stream = 'spark_streaming_kinesis_demo'  # replace with your Kinesis stream name
kinesis_endpoint = 'https://kinesis.' + aws_region + '.amazonaws.com'  # the public endpoint of the AWS region this is executed from
kinesis_app_name = 'alex_test_app'  # app name used to track progress through the Kinesis stream
kinesis_initial_position = InitialPositionInStream.LATEST  # InitialPositionInStream.TRIM_HORIZON | InitialPositionInStream.LATEST
kinesis_checkpoint_interval = 10  # define how long to checkpoint when processing through the Kinesis stream
spark_batch_interval = 10  # how many seconds before pulling the next batch of data from the Kinesis stream

# configure spark elements
spark_context = SparkContext(appName=kinesis_app_name)
# spark_streaming_context = StreamingContext(sc, 1)  # sc valid for running in pyspark interactive mode
spark_streaming_context = StreamingContext(spark_context, spark_batch_interval)

kinesis_stream = KinesisUtils.createStream(
    spark_streaming_context,
    kinesis_app_name,
    kinesis_stream,
    kinesis_endpoint,
    aws_region,
    kinesis_initial_position,
    kinesis_checkpoint_interval
)  # previous example had ', StorageLevel.MEMORY_AND_DISK_2' at the end of the call

# take kinesis stream JSON data and convert to CSV
# just realized we're still dealing with dstreams, not RDDs, so naming is inaccurate
py_dict_rdd = kinesis_stream.map(lambda x: json.loads(x))
# need to convert int (time_stamp & random_int) to string
csv_rdd = py_dict_rdd.map(lambda x: x['user_name'] + ',' + str(
    datetime.datetime.utcfromtimestamp(x['time_stamp'])) + ',' + x[
    'data_string'] + ',' + str(x['random_int']))

# save that rdd to S3
commit_to_s3 = csv_rdd.saveAsTextFiles(
    's3://' + s3_target_bucket_name + '/spark_streaming_processing/' +
    datetime.datetime.isoformat(datetime.datetime.now()).replace(':', '_'))
# commit_to_s3 = kinesis_stream.saveAsTextFiles('s3://mattsona-public/' + datetime.datetime.isoformat(datetime.datetime.now()).replace(':','_'))
See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more
details on the Kinesis Spark Streaming integration.
"""
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

if __name__ == "__main__":
    if len(sys.argv) != 5:
        print(
            "Usage: kinesis_wordcount_asl.py <app-name> <stream-name> <endpoint-url> <region-name>",
            file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonStreamingKinesisWordCountAsl")
    ssc = StreamingContext(sc, 1)
    appName, streamName, endpointUrl, regionName = sys.argv[1:]
    lines = KinesisUtils.createStream(
        ssc, appName, streamName, endpointUrl, regionName,
        InitialPositionInStream.TRIM_HORIZON, 2)
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()
sc.addPyFile(CODE_PATH + '/constants.py')

sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", S3ACCESSID)
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", S3SECRETKEY)

sqlContext = SQLContext(sc)
registerUDF(sqlContext)

printOnConsole('Streaming started')

kinesisStream = [
    KinesisUtils.createStream(ssc, APPLICATION_NAME, STREAM_NAME, ENDPOINT,
                              REGION_NAME, INITIAL_POS, CHECKPOINT_INTERVAL,
                              awsAccessKeyId=AWSACCESSID, awsSecretKey=AWSSECRETKEY,
                              storageLevel=STORAGE_LEVEL)
    for _ in range(NUM_STREAMS)
]
unifiedStream = ssc.union(*kinesisStream)
print('Started running')

# unifiedStream.reduceByKey(lambda x, y: x + y)
# unifiedStream.count().pprint()
unifiedStream.foreachRDD(processRdd)
    return final_path


def handle_rdd(rdd):
    print("---------> Processing new RDD")
    rdd_count = rdd.count()
    print('---------> Count of Initial RDD {}'.format(rdd_count))
    if rdd_count > 0:
        lang = 'en'
        rdd_transformed = rdd.map(lambda e: process_event(e))
        print('---------> Count of Transformed RDD {}'.format(rdd_transformed.count()))
        rdd_filtered = rdd_transformed.filter(lambda e: e['lang'] == lang)  # just a simple example to filter the RDD
        print('---------> Count of Filtered RDD {}'.format(rdd_filtered.count()))
        rdd.saveAsTextFile(build_path(lang=lang))


dstream = KinesisUtils.createStream(streaming_ctx, app_name, kinesis_stream_name,
                                    kinesis_endpoint_url, region_name,
                                    InitialPositionInStream.LATEST, windows_size_secs,
                                    StorageLevel.MEMORY_AND_DISK_2)

dstream.foreachRDD(handle_rdd)

streaming_ctx.start()
streaming_ctx.awaitTermination()
# streaming_ctx.stop()
conf.set("spark.mongodb.output.uri", consumer_conf["MONGO_CONNECTION_STRING"]) spark_session = SparkSession.builder.config(conf=conf).getOrCreate() spark_context = spark_session.sparkContext ## Streaming context spark_streaming_context = StreamingContext(spark_context, spark_batch_interval) sql_context = SQLContext(spark_context) #gsdmm = spark_context.broadcast(model) ## Create Kinesis Stream kinesis_stream = KinesisUtils.createStream( spark_streaming_context, kinesis_app_name, kinesis_stream, kinesis_endpoint, aws_region, kinesis_initial_position, kinesis_checkpoint_interval) ## Convert strings to objects myrdd = kinesis_stream.map(convert_json) ## Process entry data point myrdd.foreachRDD(process) ## Start process and awaits spark_streaming_context.start() spark_streaming_context.awaitTermination() spark_streaming_context.stop() except Exception as e: print(e) pass
import time

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

appName = "PythonKinesisApp"
sc = SparkContext(appName=appName)
ssc = StreamingContext(sc, 1)

streamName = 'DemoStream'
endpointUrl = 'https://kinesis.us-east-1.amazonaws.com'
regionName = 'us-east-1'
AWS_ACCESS_KEY_ID = ''
SECRET_ACCESS_KEY = ''
checkpointInterval = 5

kinesisstream = KinesisUtils.createStream(ssc, appName, streamName, endpointUrl, regionName,
                                          InitialPositionInStream.LATEST, checkpointInterval,
                                          awsAccessKeyId=AWS_ACCESS_KEY_ID,
                                          awsSecretKey=SECRET_ACCESS_KEY)

lines = kinesisstream.map(lambda x: x[1])
counts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
counts.pprint()

ssc.start()
time.sleep(600)  # Run stream for 10 minutes in case the producer is never detected
# ssc.awaitTermination()
ssc.stop(stopSparkContext=True, stopGraceFully=True)

# ## References
# 1. https://spark.apache.org/docs/latest/streaming-kinesis-integration.html
# 2. https://spark.apache.org/docs/latest/streaming-programming-guide.html#performance-tuning
    except:
        pass


if __name__ == "__main__":
    if len(sys.argv) != 5:
        print(
            "Usage: kinesis_wordcount_asl.py <app-name> <stream-name> <endpoint-url> <region-name>",
            file=sys.stderr)
        sys.exit(-1)

    num_streams = 4
    sc = SparkContext(appName="Spark Streaming App")
    ssc = StreamingContext(sc, 60)
    appName, streamName, endpointUrl, regionName = sys.argv[1:]

    kinesis_streams = [
        KinesisUtils.createStream(ssc, appName, streamName, endpointUrl, regionName,
                                  InitialPositionInStream.LATEST, 10)
        for _ in range(num_streams)
    ]
    unioned_streams = ssc.union(*kinesis_streams)

    # Split the spark context lines by the newline delimiter
    lines = unioned_streams.flatMap(lambda x: x.split("\n"))
    # For each dstream RDD, apply the processing
    lines.foreachRDD(process)

    ssc.start()
    ssc.awaitTermination()
        else:
            process_dataframe_global(lines, connect, spark, schema)
else:
    conf = SparkConf().setAppName(args.app_name)
    sc = SparkContext(conf=conf)
    spark = SparkSession.builder \
        .config(conf=conf) \
        .getOrCreate()
    ssc = StreamingContext(sc, args.batch_duration)
    sql = SQLContext(sc)

    lines = KinesisUtils.createStream(
        ssc, args.app_name, args.stream_name_kinesis, args.endpoint_url_kinesis,
        args.region_name, InitialPositionInStream.LATEST,
        awsAccessKeyId=args.aws_access_key_id,
        awsSecretKey=args.aws_secret_access_key,
        checkpointInterval=args.checkpoint_interval)
    lines.pprint()

    if args.type == "rdd":
        process_rdd(lines, connect, spark, schema)
    else:
        if args.all_or_batch == "batch":
            process_dataframe(lines, connect, sql, schema)
        else:
            process_dataframe_global(lines, connect, sql, schema)

    ssc.start()
    ssc.awaitTermination()
See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more
details on the Kinesis Spark Streaming integration.
"""
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

if __name__ == "__main__":
    if len(sys.argv) != 5:
        print(
            "Usage: kinesis_wordcount_asl.py <app-name> <stream-name> <endpoint-url> <region-name>",
            file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonStreamingKinesisWordCountAsl")
    ssc = StreamingContext(sc, 1)
    appName, streamName, endpointUrl, regionName = sys.argv[1:]
    lines = KinesisUtils.createStream(ssc, appName, streamName, endpointUrl, regionName,
                                      InitialPositionInStream.LATEST, 2)
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()
s3_target_bucket_name = 'mattsona-spark-demo'  # replace with your bucket name for target data
aws_region = 'us-west-2'  # replace w/ AWS region used for Kinesis stream
kinesis_stream = 'spark_streaming_kinesis_demo'  # replace with your Kinesis stream name
kinesis_endpoint = 'https://kinesis.' + aws_region + '.amazonaws.com'  # the public endpoint of the AWS region this is executed from
kinesis_app_name = 'alex_test_app'  # app name used to track progress through the Kinesis stream
kinesis_initial_position = InitialPositionInStream.LATEST  # InitialPositionInStream.TRIM_HORIZON | InitialPositionInStream.LATEST
kinesis_checkpoint_interval = 10  # define how long to checkpoint when processing through the Kinesis stream
spark_batch_interval = 10  # how many seconds before pulling the next batch of data from the Kinesis stream

# configure spark elements
spark_context = SparkContext(appName=kinesis_app_name)
# spark_streaming_context = StreamingContext(sc, 1)  # sc valid for running in pyspark interactive mode
spark_streaming_context = StreamingContext(spark_context, spark_batch_interval)

kinesis_stream = KinesisUtils.createStream(
    spark_streaming_context, kinesis_app_name, kinesis_stream, kinesis_endpoint,
    aws_region, kinesis_initial_position,
    kinesis_checkpoint_interval)  # previous example had ', StorageLevel.MEMORY_AND_DISK_2' at the end of the call

# take kinesis stream JSON data and convert to CSV
# just realized we're still dealing with dstreams, not RDDs, so naming is inaccurate
py_dict_rdd = kinesis_stream.map(lambda x: json.loads(x))
# need to convert int (time_stamp & random_int) to string
csv_rdd = py_dict_rdd.map(lambda x: x['user_name'] + ',' +
                          str(datetime.datetime.utcfromtimestamp(x['time_stamp'])) + ',' +
                          x['data_string'] + ',' + str(x['random_int']))

# save that rdd to S3
commit_to_s3 = csv_rdd.saveAsTextFiles('s3://' + s3_target_bucket_name + '/spark_streaming_processing/' +
                                       datetime.datetime.isoformat(datetime.datetime.now()).replace(':', '_'))
# commit_to_s3 = kinesis_stream.saveAsTextFiles('s3://mattsona-public/' + datetime.datetime.isoformat(datetime.datetime.now()).replace(':','_'))

spark_streaming_context.start()
spark_streaming_context.awaitTermination()
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, SPARK_STREAM_BATCH)

sc.addPyFile(CODE_PATH + '/pyspark_csv.py')
sc.addPyFile(CODE_PATH + '/constants.py')

sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", S3ACCESSID)
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", S3SECRETKEY)

sqlContext = SQLContext(sc)
registerUDF(sqlContext)

printOnConsole('Streaming started')

kinesisStream = [KinesisUtils.createStream(ssc, APPLICATION_NAME, STREAM_NAME, ENDPOINT,
                                           REGION_NAME, INITIAL_POS, CHECKPOINT_INTERVAL,
                                           awsAccessKeyId=AWSACCESSID, awsSecretKey=AWSSECRETKEY,
                                           storageLevel=STORAGE_LEVEL)
                 for _ in range(NUM_STREAMS)]

unifiedStream = ssc.union(*kinesisStream)
print('Started running')

# unifiedStream.reduceByKey(lambda x, y: x + y)
# unifiedStream.count().pprint()
unifiedStream.foreachRDD(processRdd)

ssc.start()
ssc.awaitTermination()

printOnConsole('Streaming suspended')
    return israel_negative_sentences


if __name__ == '__main__':
    if len(sys.argv) != 10:
        print(
            "Usage: <app-name> <stream-name> <endpoint-url> <region-name> <aws-result-bucket> <aws-access-key> <aws-secret-key>"
        )
        sys.exit(-1)

    app_Name, streamName, end_point_url, region_name, aws_result_bucket, \
        kinesis_key, kinesis_secret, bucket_key, bucket_secret = sys.argv[1:]

    sparkContext = SparkContext(appName=app_Name)
    streamingContext = StreamingContext(sparkContext, 2)

    dstream = KinesisUtils.createStream(streamingContext, app_Name, streamName,
                                        end_point_url, region_name,
                                        InitialPositionInStream.TRIM_HORIZON, 10,
                                        awsAccessKeyId=kinesis_key,
                                        awsSecretKey=kinesis_secret)

    dstream\
        .flatMap(tokenize_text)\
        .map(analyze_sentence)\
        .foreachRDD(lambda x: upload_records_step(x, aws_result_bucket, region_name, bucket_key, bucket_secret))

    streamingContext.start()
    streamingContext.awaitTermination()
from pyspark.sql import SparkSession
from pyspark.streaming.context import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

spark = SparkSession.builder.appName("test").master("local[*]").getOrCreate()
ssc = StreamingContext(spark.sparkContext, 10)

lines = KinesisUtils.createStream(
    ssc, "test", "test_s", "https://kinesis.eu-north-1.amazonaws.com", "eu-north-1",
    InitialPositionInStream.LATEST,
    awsAccessKeyId="AKIAJ5V6NEAI3YNTWGDA",
    awsSecretKey="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",
    checkpointInterval=2)
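The snippet above only builds the DStream; nothing consumes it and the streaming context is never started, so no records would ever be pulled. A minimal continuation sketch (an assumption, not part of the original source) mirroring the output pattern used by the other examples in this collection:

# Assumed continuation, not from the original snippet: attach an output
# operation and start the streaming context so the receiver actually runs.
lines.pprint()
ssc.start()
ssc.awaitTermination()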
See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more
details on the Kinesis Spark Streaming integration.
"""
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

if __name__ == "__main__":
    if len(sys.argv) != 5:
        print(
            "Usage: kinesis_wordcount_asl.py <app-name> <stream-name> <endpoint-url> <region-name>",
            file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonStreamingKinesisWordCountAsl")
    ssc = StreamingContext(sc, 1)
    appName, streamName, endpointUrl, regionName = sys.argv[1:]
    lines = KinesisUtils.createStream(
        ssc, appName, streamName, endpointUrl, regionName,
        InitialPositionInStream.LATEST, 2)
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

interval = 1
spark_context = SparkContext(appName='base.py')
stream_context = StreamingContext(spark_context, interval)

StreamName = 'test'
region_name = 'us-west-2'
endpoint = 'https://kinesis.{}.amazonaws.com/'.format(region_name)

stream = KinesisUtils.createStream(stream_context, 'EventLKinesisConsumer', StreamName,
                                   endpoint, region_name,
                                   InitialPositionInStream.LATEST, interval)
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

sc = SparkContext(appName="SparkKinesisApp")
ssc = StreamingContext(sc, 1)

lines = KinesisUtils.createStream(ssc, "SparkKinesisApp", "myStream",
                                  "https://kinesis.us-east-1.amazonaws.com", "us-east-1",
                                  InitialPositionInStream.LATEST, 2)

# lines.saveAsTextFiles('/home/zh/streaming_logsout.txt')
lines.pprint()

counts = lines.flatMap(lambda line: line.split(" ")).map(
    lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
counts.pprint()

ssc.start()
ssc.awaitTermination()