def main():
    conf = SparkConf().setMaster("local[2]").setAppName("Streamer")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)   # Create a streaming context with batch interval of 10 sec
    ssc.checkpoint("checkpoint")
    geolocator = Nominatim()
    stream(ssc,geolocator,100) 
Example #2
def start_spark(timeout=None, max_items_per_rdd_sent=None):
    sc = SparkContext("local[4]", "twitter.trending")
    ssc = StreamingContext(sc, 5)

    ssc.checkpoint('hdfs://localhost:9000/user/spark/checkpoint/')

    kafka_params = {
        'zookeeper.connect': config.get('zookeeper', 'host'),
        'group.id': config.get('kafka', 'group_id'),
        'metadata.broker.list': config.get('kafka', 'hosts')
    }

    ksc = KafkaUtils.createDirectStream(ssc,
                                        [config.get('kafka', 'topic')],
                                        kafka_params)

    hashtag_counts = get_word_counts(ksc)
    filtered_tweet_count = filter_tweets(hashtag_counts)
    send_dstream_data(filtered_tweet_count, max_items_per_rdd_sent)
    ssc.start()
    if timeout:
        ssc.awaitTermination(timeout)
        ssc.stop(stopSparkContext=True, stopGraceFully=True)
    else:
        ssc.awaitTermination()
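
get_word_counts, filter_tweets and send_dstream_data are defined elsewhere in that project; a rough sketch of the first two, assuming the Kafka records are (key, tweet text) pairs and that a running hashtag count is what is wanted:

def get_word_counts(kafka_stream):
    # Kafka records arrive as (key, value) pairs; keep a running count per hashtag.
    return kafka_stream.map(lambda kv: kv[1]) \
                       .flatMap(lambda text: text.split()) \
                       .filter(lambda word: word.startswith('#')) \
                       .map(lambda hashtag: (hashtag, 1)) \
                       .updateStateByKey(lambda new, total: sum(new) + (total or 0))


def filter_tweets(hashtag_counts):
    # Drop empty hashtags and anything seen only once (both thresholds are assumptions).
    return hashtag_counts.filter(lambda kv: kv[0] != '#' and kv[1] > 1)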
Example #3
def createStreamingContext():

    # Create a local StreamingContext with two working thread and batch interval of 1 second
    sc = SparkContext("spark://%s:7077" % MASTER_NAME, appName="GlutenTweet", pyFiles=PYFILES)
    ssc = StreamingContext(sc, 2)

    # Create a DStream of raw data
    raw = ssc.socketTextStream(MASTER_IP, 9999)

    # Convert into models
    tweets = raw.map(lambda r: Tweet(raw_json=r))

    # Store models
    tweets.foreachRDD(storeTweetsRDD)

    # Sliding window analysis
    window = tweets.window(20*60, 30)
    hashtagCounts = analysisHahtagCount(window)
    streamTop(hashtagCounts).pprint()

    # Keyword extraction - note tweets is immutable
    tweetsKeyword = tweets.map(lambda t: keywordExtraction(t))

    # Update models
    tweetsKeyword.foreachRDD(updateTweetsRDD)

    # Sliding window analysis
    window2 = tweetsKeyword.window(20*60, 30)
    keywordCounts = analysisKeywordCount(window2)
    streamTop(keywordCounts).pprint()

    ssc.checkpoint(CHECKPOINT_DIR)
    return ssc
def main():
    conf = SparkConf().setMaster("local[2]").setAppName("Streamer")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)   # Create a streaming context with batch interval of 10 sec
    ssc.checkpoint("checkpoint")
    pwords = load_wordlist("positive.txt")
    nwords = load_wordlist("negative.txt")
    counts = stream(ssc, pwords, nwords, 100)
    make_plot(counts)
Example #5
def functionToCreateContext():
    sc = SparkContext(appName="StreamingExampleWithKafka")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")
    opts = {"metadata.broker.list": "node1.example.com:6667,node2.example.com:6667"}
    kvs = KafkaUtils.createDirectStream(ssc, ["mytopic"], opts)
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).updateStateByKey(updateFunction)
    counts.pprint()
    return ssc
def main():
    conf = SparkConf()
    conf.setAppName("TopAirports")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "0")
    conf.set("spark.dynamicAllocation.enabled", "true")
    sc = SparkContext(conf = conf)
    ssc = StreamingContext(sc, 1) # Stream every 1 second
    ssc.checkpoint("checkpoint")

    # Clear the cassandra table
    init_cassandra().execute('TRUNCATE {}'.format(top_airports_table))

    stream_kafka(ssc)

def createContext():
    uBATCH_INTERVAL = 10
    sc = SparkContext(SPARK_MASTER, appName="StreamingKafka")
    sc.broadcast(batchUserPostDict)
    sc.broadcast(batchPostUserDict)
    #sc = SparkContext("local[*]", appName="StreamingKafka")
    # streaming batch interval of 10 sec first; reduce later to 1 sec or lower
    ssc = StreamingContext(sc, uBATCH_INTERVAL)
    ssc.checkpoint(CHECKPOINT_DIR)   # set checkpoint directory in HDFS
    #ssc.checkpoint(10 * uBATCH_INTERVAL)
    return ssc

ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, createContext)
def createContext():

        conf = SparkConf().setMaster('spark://{}:7077'.format(MASTER_URL)).set('spark.executor.memory', '2g')
        sc = SparkContext(conf=conf)

        ssc = StreamingContext(sc, STREAMING_INTERVAL)
        lines = ssc.textFileStream('hdfs://{}/data/on_time/streaming/'.format(MASTER_URL))

        ssc.checkpoint(CHECKPOINT_DIR)

        # main split-combine-apply logic put here
        pairs = lines.map(lambda x: x.split(",")).map(lambda x: (x[8], 1))
        runningCounts = pairs.updateStateByKey(updateFunction)

        sortedCounts = runningCounts.transform(lambda rdd: rdd.sortBy(lambda (airport, freq): freq, ascending=False))
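
updateFunction is used with updateStateByKey above but never shown; for the (airport, 1) pairs it only has to fold each batch's ones into the running count. A minimal sketch, matching the updateFunc definitions that appear in later examples:

def updateFunction(newValues, runningCount):
    # updateStateByKey calls this per key with the batch's new values and the
    # previous state; the state is None the first time a key is seen.
    return sum(newValues) + (runningCount or 0)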
Example #9
def createStreamingContext():
    conf = SparkConf().setMaster("local[2]").setAppName("amqp_temperature")
    conf.set("spark.streaming.receiver.writeAheadLog.enable", "true")

    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("/tmp/spark-streaming-amqp")

    receiveStream = AMQPUtils.createStream(ssc, "localhost", 5672, "temperature")

    temperature = receiveStream.map(getTemperature)
    max = temperature.reduceByWindow(getMax, None, 5, 5)

    max.pprint()

    return ssc
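
getTemperature and getMax are not shown here; a possible sketch, assuming each AMQP message body is a JSON string such as {"temperature": 21.5}:

import json

def getTemperature(message):
    # Assumed message format: a JSON body with a numeric "temperature" field.
    return json.loads(message)["temperature"]

def getMax(a, b):
    # Reduce function for reduceByWindow: keep the larger of two readings.
    return a if a > b else b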
Example #10
def main():
    global ssc

    conf = SparkConf()
    conf.setAppName("TopAirports")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "0")
    conf.set('spark.streaming.stopGracefullyOnShutdown', True)

    sc = SparkContext(conf=conf)

    ssc = StreamingContext(sc, 1)  # Stream every 1 second
    ssc.checkpoint("/tmp/checkpoint")

    signal.signal(signal.SIGINT, stop_streaming)

    stream_kafka()
Example #11
def functionToCreateContext():
    sc = SparkContext("local[*]", "streaming_part")
    sc.setLogLevel("ERROR")
    
    ssc = StreamingContext(sc, 5)
    
    data_from_ticket_mechine = ssc.socketTextStream("localhost", 9999)
    data_from_camera_mechine = ssc.socketTextStream("localhost", 9998)
    
    
    #meat
    data_from_ticket_mechine.map(ticket_mechine_pre_process).updateStateByKey(updateFunction).foreachRDD(ticket_mechine_RDD_handler)
    data_from_camera_mechine.map(camera_mechine_pre_process).updateStateByKey(updateFunction).foreachRDD(camera_mechine_RDD_handler)
    
    ssc.checkpoint(checkpointDirectory)   # set checkpoint directory
    return ssc
def functionToCreateContext():
  # spark context config
  sc = SparkContext(appName="StreamingExampleWithKafka")
  ssc = StreamingContext(sc, 10)
  ssc.checkpoint("checkpoint")
  
  # kafka
  opts = {"metadata.broker.list": "node1.example.com:6667,node2.example.com:6667"}
  kvs = KafkaUtils.createDirectStream(ssc, ["mytopic"], opts)
  # processing
  lines = kvs.map(lambda x: x[1])
  counts = lines.flatMap(lambda line: line.split(" ")) \
   .map(lambda word: (word, 1)) \
   .updateStateByKey(updateFunction) \
   .map(toStringList) \
   .foreachRDD(lambda rdd: rdd.saveAsNewAPIHadoopDataset(conf=conf, keyConverter=keyConv, valueConverter=valueConv))
  return ssc
Example #13
def functionToCreateContext():
    # new context
    conf = SparkConf()
    conf = conf.setAppName(APP_NAME)
    sc   = SparkContext(conf=conf)
    
    # http://stackoverflow.com/questions/24686474/shipping-python-modules-in-pyspark-to-other-nodes
    sc.addPyFile("common.py")
    
    # As argument Spark Context and batch retention
    ssc = StreamingContext(sc, 10)
    
    # set checkpoint directory
    ssc.checkpoint(CHECKPOINT_DIR)
    
    # return streaming spark context
    return ssc
def createContext(brokers, topic, checkpointDir):
    # If you do not see this printed, that means the StreamingContext has been loaded
    # from the new checkpoint

    sc = SparkContext(appName="PythonStreamingRecoverableNetworkWordCount")
    ssc = StreamingContext(sc, 1)

    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    lines = kvs.map(lambda x: x[1])
    wordCounts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a+b)

    #wordCounts.foreachRDD(echo)
    wordCounts.pprint()
    ssc.checkpoint(checkpointDir)
    return ssc
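
A factory like this is normally handed to StreamingContext.getOrCreate, so a restarted driver recovers the job graph from the checkpoint instead of rebuilding it; a sketch of such a driver (the argument parsing is an assumption):

if __name__ == "__main__":
    # Assumes sys, SparkContext and StreamingContext are already imported above.
    brokers, topic, checkpointDir = sys.argv[1:4]
    ssc = StreamingContext.getOrCreate(checkpointDir,
                                       lambda: createContext(brokers, topic, checkpointDir))
    ssc.start()
    ssc.awaitTermination()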
def createContext():

        conf = SparkConf().setMaster('spark://{}:7077'.format(MASTER_URL)).set('spark.executor.memory', '2g')
        sc = SparkContext(conf=conf)

        ssc = StreamingContext(sc, STREAMING_INTERVAL)
        lines = ssc.textFileStream('hdfs://{}/data/on_time/streaming/'.format(MASTER_URL))

        ssc.checkpoint(CHECKPOINT_DIR)

        # main split-combine-apply logic put here
        # filter out header and other invalid rows
        rdd = lines.map(lambda line: line.split(',')).filter(lambda words: len(words) > 56)

        # extract first field (for filtering header), Carrier, Orig, Dest, and delay fields
        rdd2 = rdd.map(lambda x: (x[0], x[8], x[11], x[18], x[52], x[53], x[54], x[55], x[56])) \
                  .map(lambda line: [str(w.replace('"', '')) for w in line]) \
                  .filter(lambda row: row[0] != 'Year' and any(row[4:]))
        rdd2.pprint()

        # sum up delay fields for each row
        sum_delay_rdd = rdd2.map(sum_delay)
        sum_delay_rdd.pprint()

        # sum up delay for each (orig, dest, carrier) pair
        combined_rdd = sum_delay_rdd.updateStateByKey(updateFunction)
        combined_rdd.pprint()

        # calculate avg delay
        avg_rdd = combined_rdd.transform(lambda rdd: rdd.map(lambda (x, y): ((x[0], x[1]), (y[0] / float(y[1]), x[2]))))
        avg_rdd.pprint()

        # group by (orig, dest)
        avg_rdd_by_route = avg_rdd.groupByKey()

        # sort by on-time performance for each (orig, dest) route and take top 10
        route_sorted_carrier = avg_rdd_by_route.mapValues(lambda x: sorted(list(x))[:10])
        aa = route_sorted_carrier.flatMapValues(lambda x: x)

        aa.pprint()
        aa.foreachRDD(process)

        return ssc
def creatingfunc():
  # create streaming context
  ssc = StreamingContext(sc, batchIntervalSeconds)
  LogToKinesis("creatingfunc", "StreamingContext", str(dir(ssc)))
  ssc.remember(10*batchIntervalSeconds)
  
  # setup streams
  try: 
    #paxRecords = ssc.textFileStream(SOURCE).map(ParsePassengerRecord)  # parse and enrich pax data
    kinesisStream = KinesisUtils.createStream(ssc, KINESIS_APPNAME, KINESIS_STREAM, KINESIS_ENDPOINT_URL, KINESIS_REGION, InitialPositionInStream.TRIM_HORIZON, 10, StorageLevel.MEMORY_AND_DISK_2, ACCESS_KEY, SECRET_KEY)
    LogToKinesis("kinesisStream", "KinesisUtils.createStream", str(dir(kinesisStream)))
    
    # track total boarding and alighting per train/ownmoduleno
    # Note: rdd returned by updateStateByKey is (ownmoduleno, (alight, board))
    # for easy conversion to dataframe we map this rdd to (ownmoduleno, alight, board). (Not sure why the following did not work: map(lambda k,v: (k,v[0],v[1])) )
    """
    noOfPassengersOwnModuleToday = paxRecords.map(lambda record: (record[OWN_MODULE_NO],(record[TOTAL_ALIGHTING], record[TOTAL_BOARDING])))  \
                              .updateStateByKey(updatePassengerCount) \
                              .map(lambda v: (v[0],v[1][0],v[1][1]))  
        
    paxRecordsWindowStationLine = paxRecords.window(1800,20)  # compute aggregates on a 30 min window updated every 20 sec
    paxRecordsTable = paxRecords.window(900,900) # save to permanent storage every 15 min (how large/small amounts of data is optimal to save at a time?)
    LogToKinesis("creatingfunc", "Streams set up OK")
    """
  except Exception as e:
    LogToKinesis("creatingfunc", "EXCEPTION", str(e))
 
  # output streams
  try: 
    #paxRecords.foreachRDD(processPax)
    #noOfPassengersOwnModuleToday.foreachRDD(processOwnModuleState) # send sum of alightings and boardings and pax present onboard for each train to Kinesis
    #paxRecordsWindowStationLine.foreachRDD(processStationLineWindow) #send aggregates to Kinesis periodically, i.e. last 30 mins updated every 20 secs
    #paxRecordsTable.foreachRDD(processTable) #save to permanent table periodically
    kinesisStream.foreachRDD(processKinesisPax)
  except Exception as e:
    LogToKinesis("mainLoop", "EXCEPTION", str(e))

  ssc.checkpoint(CHECKPOINTDIR)
  return ssc
Example #17
    response = requests.post(url, data=post_data)


if __name__ == "__main__":
    # os.environ["PYSPARK_PYTHON"] = "python3"
    # os.environ["PYSPARK_DRIVER_PYTHON"] = "python3"
    os.environ[
        'PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2 pyspark-shell'

    # create spark configuration
    conf = SparkConf()
    conf.setAppName("TwitterSentiment")
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)
    # sc.setLogLevel("ERROR")
    # streaming data will be divided into batches every 10s
    ssc = StreamingContext(sc, 10)
    # dataStream = ssc.socketTextStream(HOST, PORT).window(windowDuration=10, slideDuration=10)
    dataStream = KafkaUtils.createStream(ssc, ZOOKEEPER, 'spark-streaming', {'china': 1}) \
        .window(windowDuration=10, slideDuration=10)

    dataStream.pprint()
    (dataStream.map(lambda line: line[1].lower()).filter(
        lambda word: len(word) > 0).map(lambda word:
                                        (word, ))  # map to a tuple (word,)
     .foreachRDD(predict))

    ssc.checkpoint("checkpoints_sentiment")
    ssc.start()
    ssc.awaitTermination()
Example #18
	rediscon=redis.StrictRedis(host='ec2-52-40-47-83.us-west-2.compute.amazonaws.com', port=6379, db=0,password='')
	pipe = rediscon.pipeline()
	for i in s:
		key='stream'
		value=i[1]
		val=str(i[0])+','+str(value[0])+','+str(value[1])
		pipe.lpush(key,val)
	pipe.execute()


appName='Stream_APP'
master='spark://ec2-50-112-193-115.us-west-2.compute.amazonaws.com:7077'
conf = SparkConf().setAppName(appName).setMaster(master)
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 1)
ssc.checkpoint("hdfs://ec2-50-112-193-115.us-west-2.compute.amazonaws.com:9000/user/spark_checkpoint")
brokers="52.41.140.111:9092,52.41.90.5:9092,52.41.120.152:9092"
kvs = KafkaUtils.createDirectStream(ssc, ["stream_test"], {"metadata.broker.list": brokers})

user_metric = kvs.map(split_features)

user_count=kvs.map(split_count)

count_stream =  user_count.reduceByKeyAndWindow(lambda x, y: x + y, 3, 1)

sum_stream = user_metric.reduceByKeyAndWindow(window_sum_hr_speed, 3, 1)

joined_stream=sum_stream.join(count_stream)

smoothened_stream= joined_stream.map(computations)
# Import libs
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Begin
if __name__ == "__main__":
        sc = SparkContext(appName="StreamingErrorCount");
        # 2 is the batch interval : 2 seconds
        ssc = StreamingContext(sc, 2)

        # Checkpoint for backups
        ssc.checkpoint("file:///tmp/spark")

        # Define the socket where the system will listen
        # lines is not a single RDD but a DStream: a sequence of RDDs that is constantly updated
        lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))



        # Update function
        def countWords(newValues, lastSum):
            if lastSum is None :
                lastSum = 0
            return sum(newValues, lastSum)

        word_counts = lines.flatMap(lambda line: line.split(" "))\
                    .map(lambda word : (word, 1))\
                    .updateStateByKey(countWords)

        ## Display the counts
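        # (Assumed completion: the original snippet is cut off here; printing the
        # counts and starting the context is the usual next step.)
        word_counts.pprint()

        ssc.start()
        ssc.awaitTermination()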
Example #20
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SQLContext
from pyspark.sql.functions import lit
from pyspark.sql import SparkSession

import sys
import json

configuration = SparkConf()
configuration.setAppName('Project')

spark_context = SparkContext(conf=configuration)
spark = SparkSession(spark_context)
streaming_context = StreamingContext(spark_context, 4)

streaming_context.checkpoint('Project Checkpoint')
input_stream = streaming_context.socketTextStream('localhost', 6100)

##########################################################

chemistry = dict()

sql = SQLContext(spark_context)

##########################################################

## Player Profile

player_profile = {}

# Open a csv reader called DictReader
Example #21
# 3. spark-submit  --jars spark-streaming-kafka-assembly_2.10-1.4.1.jar ./alerts/pairs_corr.py vsu-01:2181

# http://stackoverflow.com/questions/3425439/why-does-corrcoef-return-a-matrix
#

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print(
            "Usage: pairs_corr_redis.py <broker_list ex: vsu-01:2181> <queue_name - for saving the correlations series>"
        )
        exit(-1)

    app_name = "IbMarketDataStream"
    sc = SparkContext(appName=app_name)  #, pyFiles = ['./cep/redisQueue.py'])
    ssc = StreamingContext(sc, 2)
    ssc.checkpoint('./checkpoint')

    brokers, qname = sys.argv[1:]

    #
    # demonstrate how to use broadcast variable
    #

    Q = sc.broadcast({
        'qname': qname,
        'namespace': 'mdq',
        'host': 'localhost',
        'port': 6379,
        'db': 3
    })
Example #22
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    hostname = "localhost"
    port_number = 8888

    sc = SparkContext(appName="ErrorCounter")
    ssc = StreamingContext(sparkContext=sc, batchDuration=2)
    ''' batchDuration=2 --> all messages received within a 2s window form one RDD within the DStream. '''

    ''' Create a new directory; it will be used as the checkpoint storage location.'''
    ssc.checkpoint("/home/wesley/Documents/spark/checkpoints")

    ''' Lines is a DStream of RDDs, it's NOT a static collection of RDDs as it's constantly being updated'''
    lines = ssc.socketTextStream(hostname=hostname, port=port_number)

    error_count = lines.flatMap(lambda line: line.split(" ")) \
        .filter(lambda word: "ERROR" in word) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda sum_occurrences, next_occurrence: sum_occurrences + next_occurrence)

    '''  Print out the result.'''
    error_count.pprint()

    '''# Start listening for streaming data.'''
    ssc.start()
    '''  Wait infinitely for streaming data unless you explicitly terminate the application.'''
    ssc.awaitTermination()
Example #23
    # get hostname and port of data source from application arguments
    hostname = sys.argv[1]
    port = int(sys.argv[2])
     
    # Create a new SparkContext
    sc = SparkContext()

    # Set log level to ERROR to avoid distracting extra output
    sc.setLogLevel("ERROR")

    # Create and configure a new Streaming Context 
    # with a 1 second batch duration
    ssc = StreamingContext(sc,1)

    # Enable checkpointing (required for window operations)
    ssc.checkpoint("logcheckpt")

    # Create a DStream of log data from the server and port specified    
    logStream = ssc.socketTextStream(hostname,port)

    # Every two seconds, display the total number of requests over the 
    # last 5 seconds
    # countStream = logStream.countByWindow(5,2)
    # countStream.pprint()
    
    # ---------------------
    # Bonus: Display the top 5 users every second
    
    # Count requests by user ID for every batch
    userreqStream = logStream \
        .map(lambda line: (line.split(' ')[2],1)) \
            
            r2.sort(key = lambda x: x[1],reverse=True)
            
            if(len(r2)>2):
                print("%s,%s,%s,%s,%s"%(r2[0][0],r2[1][0],r2[2][0],r2[3][0],r2[4][0]))
        except:
            pass

if __name__ == "__main__":

    conf=SparkConf()
    conf.setAppName("BigData")
    sc=SparkContext(conf=conf)

    ssc=StreamingContext(sc,int(sys.argv[2])) # 2nd argument is the batch duration
    ssc.checkpoint("/checkpoint_BIGDATA") # checkpoint is for recovering lost data

    dataStream=ssc.socketTextStream("localhost",9009)  
    
    tweet=dataStream.map(lambda w:(w.split(';')[7]))
    #tweet.pprint()
    hashtag = tweet.flatMap(lambda w :compute(w))
    
    h = hashtag.window(int(sys.argv[1]),1)
    
    count=hashtag.reduceByKeyAndWindow(lambda x, y: x + y,lambda x,y:x-y,int(sys.argv[1]),1)
    #count.pprint()

    #To Perform operation on each RDD
    count.foreachRDD(process_rdd)
    
Example #25
    return (msg.topic, data.PulseId())
    #return data.PulseId()


tlist = []
numTopic = 2
topicHead = 'SparkTest-'
for i in range(numTopic):
    tlist.append(topicHead + str(i))

sc = SparkContext(appName="mytstApp")
sc.setLogLevel("ERROR")  # reduce shell log output
ssc = StreamingContext(sc, 1)
#tlist = ['Spark_1','Spark_2']
checkpoint_dir = './Checkpoint/spark'
ssc.checkpoint(checkpoint_dir)

kafka_params = {
    "bootstrap.servers": "localhost:9092",
    "group.id": "myUserGroup",
    "enable.auto.commit": "false",
    "auto.offset.reset": "smallest"
}
dstream = [KafkaUtils.createDirectStream(ssc, [tlist[0]], kafka_params,\
      keyDecoder=spot_decoder,\
      valueDecoder=spot_decoder,\
            messageHandler=setHandler )\
           for i in range(len(tlist))
           ]
countList = []
Example #26
    fin = interval * duracion
    return (fin - duracion, fin)


def fUpdate(newValues, history):
    return set((history or [])).union(newValues)


conf = SparkConf().setMaster("local[2]").setAppName("ContarCoordenadas")
sc = SparkContext(conf=conf)
sc.setLogLevel("OFF")
ssc = StreamingContext(sc, 5)

initial = sc.parallelize([])
stream = ssc.socketTextStream("localhost", 7777)
ssc.checkpoint(persistent + "counts")

counts = stream.map(lambda line: line.split(";")).map(
    lambda x: (get_interval(int(x[3])), x[0]))

history = counts.updateStateByKey(fUpdate, initialRDD=initial)

# https://spark.apache.org/docs/latest/streaming-programming-guide.html#dataframe-and-sql-operations
history.foreachRDD(
    lambda time, rdd: print("  duracion {} -- (intervalo - autos) {}".format(
        duracion,
        rdd.mapValues(lambda a: len(a)).sortByKey(False).map(lambda a: (
            as_tuple_range(a[0]), a[1])).collect(),
    )))

ssc.start()
Example #27
                x += ','
        print(x[:-1])
        x = ""
    except:
        e = sys.exc_info()[0]


window_size = int(sys.argv[1])
batch_size = int(sys.argv[2])

conf = SparkConf()
conf.setAppName("FifaApp")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, batch_size)
ssc.checkpoint("/usr/local/hadoop/checkpoint_FifaApp")
dataStream = ssc.socketTextStream("localhost", 9009)

#tweet = dataStream.window(window_size, 1)
#tweet = dataStream.map(lambda w : w.split(';')[7])

tweet = dataStream.map(lambda w: w.split(';')[7])
tweet = tweet.flatMap(lambda x: x.split(','))
tweet = tweet.map(lambda y: (y, 1))
#tagsTotal = tweet.reduceByKey(aggregate_tags_count)
#tagsTotal = tweet.reduceByKeyAndWindow(aggregate_tags_count,None,window_size,1)
tagsTotal = tweet.updateStateByKey(aggregate_tags_count)
tagsTotal.foreachRDD(process_rdd)
#tagsTotal.pprint()

ssc.start()
Example #28
    else:
        return 0


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print(
            "Usage: kafka-stream-find-word-example.py <broker_list> <topic> <word>",
            file=sys.stderr)
        exit(-1)

    file_path = "/path/to/output_file/file_name"
    checkpointDirectoryLocation = "/path/to/checkpoint/"

    sc = SparkContext(appName="StreamingKafkaExample")

    ssc = StreamingContext(sc, 5)  # read every 5 sec
    ssc.checkpoint(checkpointDirectoryLocation)  # required for Window function

    brokers, topic, word = sys.argv[1:]
    kvs = KafkaUtils.createDirectStream(ssc, [topic],
                                        {"metadata.broker.list": brokers})
    lines = kvs.map(lambda x: x[1])

    ct = lines.map(lambda line: check_for_word(line, word, file_path))\
        .reduceByWindow(lambda a, b: a+b, lambda a, b: a-b, 30, 30)
    ct.pprint()

    ssc.start()
    ssc.awaitTermination()
from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if __name__ == "__main__":
    # Create the Spark context
    sc = SparkContext(appName="PythonStreamingApp")
    # Necessary log4j logging level settings are done
    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.WARN)
    # Create the Spark Streaming Context with 10 seconds batch interval
    ssc = StreamingContext(sc, 10)
    # Set the checkpoint directory so the data needed for recovery after a crash is saved
    ssc.checkpoint("/tmp")
    # The quorum of Zookeeper hosts
    zooKeeperQuorum = "localhost"
    # Message group name
    messageGroup = "sfb-consumer-group"
    # Kafka topics list separated by comma if there are multiple topics to be listened on
    topics = "sfb"
    # Number of threads per topic
    numThreads = 1
    # Create a Kafka DStream
    kafkaStream = KafkaUtils.createStream(ssc, zooKeeperQuorum, messageGroup,
                                          {topics: numThreads})
    # Create the Kafka stream
    appLogLines = kafkaStream.map(lambda x: x[1])
    # Count each log message line containing the word ERROR
    errorLines = appLogLines.filter(lambda appLogLine: "ERROR" in appLogLine)
Example #30
    return flipped_sorted.map(lambda (x, y): (y, x))


stream = KafkaUtils.createDirectStream(
    ssc, [config.TOPIC], {"metadata.broker.list": config.KAFKA_SERVER})

lines = stream.map(lambda x: x[1])
# lines.pprint()

# Reduce by author's timezone and effective user mentions over a sliding window.
tweet_locations = lines.map(extract_location).filter(
    lambda loc: loc != 'None').map(lambda x: (x, 1))
tweet_mentions = lines.flatMap(extract_mentions).filter(lambda user: user[0])

locations_agg = tweet_locations \
                .reduceByKeyAndWindow(lambda a, b: (a + b), lambda x, y: x - y, config.WINDOW_SIZE, config.SLIDE_INTERVAL) \
                .transform(lambda rdd: sort_by_value(rdd))

mentions_agg = tweet_mentions \
                .reduceByKeyAndWindow(lambda a, b: (a + b), lambda x, y: x - y, config.WINDOW_SIZE, config.SLIDE_INTERVAL) \
                .transform(lambda rdd: sort_by_value(rdd))

locations_agg.pprint()
mentions_agg.pprint()

# Metadata checkpointing is primarily needed for recovery from driver failures,
# whereas data or RDD checkpointing is necessary even for basic functioning if stateful transformations are used.
# http://spark.apache.org/docs/latest/streaming-programming-guide.html#checkpointing
ssc.checkpoint(config.SPARK_CHECKPOINT_DIR)
ssc.start()
ssc.awaitTermination()
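
Only the last line of sort_by_value survives in the fragment above; a sketch of the whole helper consistent with that line (flip to (count, key), sort descending, flip back):

def sort_by_value(rdd):
    # Sort (key, count) pairs by count in descending order.
    flipped_sorted = rdd.map(lambda kv: (kv[1], kv[0])).sortByKey(ascending=False)
    return flipped_sorted.map(lambda cv: (cv[1], cv[0]))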
    for record in taken[:num]:
    	print(record)
    	result.append(record)

    ws = create_connection(url)
    ws.send(json.dumps(result))
    ws.close()

    if len(taken) > num:
        print("...")
    print("")

def updateFunc(new_values, last_sum):
        return sum(new_values) + (last_sum or 0)

sc = SparkContext(appName="PythonTwitterStreaming")
ssc = StreamingContext(sc, 1)

tweets = ssc.socketTextStream('localhost', 9999)
ssc.checkpoint("./checkpoint-tweet")

running_counts = tweets.flatMap(lambda line: line.split(" "))\
                          .map(lambda word: (word, 1))\
                          .updateStateByKey(updateFunc).transform(lambda rdd: rdd.sortBy(lambda x: x[1],False))


running_counts.foreachRDD(takeAndPrint)

ssc.start()
ssc.awaitTermination()
Example #32
        print(hashtag_counts_df)
    except:
        e = sys.exc_info()[0]
        print("Error: %s" % e)


def tmp(x):
    return (x.split(';')[0], 1)


conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)

ssc = StreamingContext(sc, 2)
ssc.checkpoint("/home/anjali/Desktop/sem5/")

dataStream = ssc.socketTextStream("localhost", 9009)
# dataStream.pprint()
tweet = dataStream.map(tmp)
# OR
tweet = dataStream.map(lambda w: (w.split(';')[0], 1))
count = tweet.reduceByKey(lambda x, y: x + y)
#count.pprint()
#TO maintain state
totalcount = tweet.updateStateByKey(aggregate_tweets_count)
#totalcount.pprint()

#To Perform operation on each RDD
totalcount.foreachRDD(process_rdd)
totalcount.pprint()
Example #33
Description: 
'''
import sys

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: WindowedSocketStreaming.py <hostname> <port>", file=sys.stderr)
        exit(-1)
    conf = SparkConf().setAppName('Windowed Streaming').setMaster("local[3]")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)

    ssc.checkpoint("file:///mnt/data1/workspace/data_analysis_mining/Python_Spark/spark_tutorial/data/socket/stateful")

    def update_func(new_value, last_num):
        return sum(new_value) + (last_num or 0)
    
    initialState = sc.parallelize([('hello', 1), ('world', 1)])
    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
    running_counts = lines.flatMap(lambda line: line.split(' '))\
                  .map(lambda word: (word, 1))\
                  .updateStateByKey(update_func, 
                                    initialRDD=initialState)

    running_counts.saveAsTextFiles("file:///mnt/data1/workspace/data_analysis_mining/Python_Spark/spark_tutorial/data/socket/output")                     
    running_counts.pprint()
    
    ssc.start()
Example #34
######
###### Main script #######
######

signal.signal(signal.SIGINT, signal_handler)

dynamo = dynamodb2.connect_to_region(AWS_REGION)
out_table = Table(DB_TABLE, connection=dynamo)

config = SparkConf()
config.set('spark.streaming.stopGracefullyOnShutdown', True)
#config.set('spark.yarn.executor.memoryOverhead', '2g')

sc = SparkContext(appName='g2ex2', conf=config, pyFiles=['flight.py'])
ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp/g2ex2')

lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))

filtered = lines.map(lambda line: line.split(","))\
                .map(lambda fields: Flight(fields))\
                .filter(lambda fl: fl.Cancelled == 0)\
                .map(lambda fl: ((fl.Origin, fl.Dest), (fl.DepDelay, 1)))\
                .updateStateByKey(updateFunction)

filtered.foreachRDD(lambda rdd: rdd.foreachPartition(save_partition))

# start streaming process
ssc.start()

try:
        i = i + 1
    if hashh != "":
        print("%s" % (hashh))


if len(sys.argv) != 3:
    print("Should enter file, Window Size, Batch Duration", file=sys.stderr)
    sys.exit(-1)
wind_size = int(sys.argv[1])
batch_duration = int(sys.argv[2])
conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)

ssc = StreamingContext(sc, batch_duration)
ssc.checkpoint("/home/hduser/checkpoint_BIGDATA")

dataStream = ssc.socketTextStream("localhost", 9009)

tweet = dataStream.map(lambda w: (w.split(';')[7]))

hashtag = tweet.flatMap(lambda w: (w.split(',')))
hasht = hashtag.map(lambda w: (w, 1))
counts = hasht.filter(lambda x: x[0] != '')

totalcount = counts.reduceByKeyAndWindow(
    lambda a, b: a + b, wind_size,
    batch_duration).transform(lambda rdd: rdd.sortBy(lambda y: (-y[1], y[0])))
#print(totalcount)
totalcount.foreachRDD(process_rdd)
Example #36
import sys
import json
import redis

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

def publishToRedis(tup):
    tweet = tup
    pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)
    r = redis.StrictRedis(connection_pool=pool)
    r.publish("twitterchannel", tweet)

if __name__ == '__main__':
    sc = SparkContext(appName="PythonTwitterStreaming")
    ssc = StreamingContext(sc, 1)
    tweetStream = KafkaUtils.createStream(ssc, 'localhost:2181', "kafka-stream-redis", {'tweets': 1})
    tweets = tweetStream.map(lambda x: x[1])
    ssc.checkpoint("./checkpoint-tweet")
    tweets.foreachRDD(lambda rdd: rdd.foreach(publishToRedis))
    ssc.start()
    ssc.awaitTermination()
Example #37
wordPairs = words.map(lambda word: (word, 1))
hashtagPairs = hashtags.map(lambda hashtag: (hashtag, 1))

# Complete a wordcount using a key and 20 minute window
wordCounts = wordPairs \
    .reduceByKeyAndWindow(lambda x, y: int(x) + int(y), lambda x, y: int(x) - int(y), 1200,
                          10)  # Last 20 minutes, updates every 10 seconds
hashtagCount = hashtagPairs \
    .reduceByKeyAndWindow(lambda x, y: int(x) + int(y), lambda x, y: int(x) - int(y), 1200,
                          10)  # Last 20 minutes, updates every 10 seconds

# Sort the words and hashtags in descending order
sortedWordCount = wordCounts.transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))
sortedHashtagCount = hashtagCount.transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))

# Send word and hashtag counts to the api
sortedWordCount.foreachRDD(lambda rdd: send_data_to_api(rdd.collect(), 'http://localhost:5000/incomingWordCount', 'count'))
sortedHashtagCount.foreachRDD(lambda rdd: send_data_to_api(rdd.collect(), 'http://localhost:5000/incomingHashtagCount', 'count'))

# Store filtered tweets to the database and send them to the api
tweet_objects.foreachRDD(lambda rdd: store_and_send_tweet(rdd))

# Save counts to file
sortedWordCount.saveAsTextFiles("./spark_data/word_counts/{}".format(str(datetime.now()) + ".json"))
sortedHashtagCount.saveAsTextFiles("./spark_data/hashtag_counts/{}".format(str(datetime.now()) + ".json"))

# Starts the streaming context
ssc.checkpoint("./spark_data/checkpoints/")
ssc.start()
ssc.awaitTermination()
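
send_data_to_api and store_and_send_tweet are not included in the fragment; a minimal sketch of the former, assuming the endpoint accepts a JSON payload keyed by the given field name (the URL handling and payload shape are assumptions):

import json
import requests

def send_data_to_api(records, url, field_name):
    # POST the collected (word, count) pairs to the API as JSON; the payload
    # shape is an assumption based on how the endpoints are named above.
    payload = {field_name: [{"label": k, "value": v} for k, v in records]}
    try:
        requests.post(url, data=json.dumps(payload),
                      headers={"Content-Type": "application/json"})
    except requests.exceptions.RequestException as e:
        print("Failed to send data to %s: %s" % (url, e))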
Example #38
if __name__ == "__main__":

    stopwords = read_stopwords("./stopwords.txt")

    ss = SparkSession.builder \
        .appName("Spark Structured Streaming from Twitter") \
        .getOrCreate()

    sc = ss.sparkContext

    ssc = StreamingContext(sc, 2)

    # setting a checkpoint to allow RDD recovery
    ssc.checkpoint(
        f"hdfs://{os.environ['HDFS_HOST_NAME']}:{os.environ['HDFS_HOST_PORT']}/checkpoint"
    )

    # read data from port 9009
    dataStream = ssc.socketTextStream(os.environ["TWITTER_CLIENT"],
                                      os.environ["TWITTER_PORT"])

    # split each tweet into words
    words = dataStream.flatMap(lambda line: line.lower().split(" "))

    # filter the words to get only hashtags, then map each hashtag to be a pair of (hashtag,1)
    hashtags = words.filter(lambda w: '#' in w).map(lambda x: (x, 1))
    # adding the count of each hashtag to its last count
    tags_totals = hashtags.updateStateByKey(aggregate_tags_count)
    # do processing for each RDD generated in each interval
    tags_totals.foreachRDD(process_hashtags_rdd)
Example #39
def input_preporcess(line):
    fields = line.split(",")
    return ((str(fields[ORIGIN_COL]), 1),
            (str(fields[DEST_COL]), 1)) if fields[DEST_COL] != "" else []


def updateFunction(newValues, runningCount):
    return sum(newValues) + (runningCount or 0)


if __name__ == '__main__':
    # set up
    sc = SparkContext(appName="q11")
    ssc = StreamingContext(sc, TimeOut)
    brokers = BootStarpServers
    topic = TopicName
    sc.setLogLevel("WARN")
    ssc.checkpoint("/tmp/q11")

    kvs = KafkaUtils.createDirectStream(ssc, [topic], KafkaParams)

    # key logic
    lines = kvs.map(lambda x: x[1])
    rst = lines.flatMap(input_preporcess).filter(
        lambda x: x != None).updateStateByKey(updateFunction)
    rst.foreachRDD(output)

    # start program
    ssc.start()
    ssc.awaitTermination()
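
output is referenced in foreachRDD above but not defined; a hypothetical sketch that prints the ten busiest airports of each batch (the real job may write to storage instead):

def output(rdd):
    # Hypothetical sink: print the ten airports with the highest running counts.
    for airport, count in rdd.sortBy(lambda kv: -kv[1]).take(10):
        print("%s\t%d" % (airport, count))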
Example #40
##Developer should start REPL using:
#pyspark --master local[2]

#3a
from pyspark.streaming import StreamingContext

#3b
ssc = StreamingContext(sc, 2)

#3c
inputDS = ssc.socketTextStream("sandbox", 9999)

#3d
ssc.checkpoint("hdfs:///user/root/checkpointDir")

#3e
windowDS = inputDS.window(10,2).flatMap(lambda line:line.split(" ")) \
.map(lambda word: (word,1)).reduceByKey(lambda a,b: a+b)

#3f
windowDS.pprint()

#3g
sc.setLogLevel("ERROR")

#3h
ssc.start()
Example #41
TCP_IP = 'localhost'
TCP_PORT = 9006

# Pyspark
# create spark configuration
config = SparkConf()
config.setAppName('TwitterApp')
config.setMaster('local[2]')
config.set("spark.network.timeout","4200s")
config.set("spark.executor.heartbeatInterval","4000s")
# create spark context with the above configuration
spark_con = SparkContext(conf=config)

# create the Streaming Context from spark context with interval size 2 seconds
sparkstreamconf = StreamingContext(spark_con, 4)
sparkstreamconf.checkpoint("checkpoint_TwitterApp")

elastic_search = Elasticsearch([{'host': 'localhost', 'port': 9200}])





def filter_emoji(text_json):
    
    text = text_json['text']
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  
        u"\U0001F300-\U0001F5FF"  
        u"\U0001F680-\U0001F6FF"  
        u"\U0001F1E0-\U0001F1FF"  
Example #42
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="AirportRank")
ssc = StreamingContext(sc, 10)
ssc.checkpoint("checkpoint")

# lines = ssc.textFileStream("/user/otp")
kvs = KafkaUtils.createDirectStream(ssc, ["flights"], {"metadata.broker.list": "hdp-master:9092"})


def print_top_list(rdd):
    for (count, word) in rdd.take(10):
        print("%s: %i" % (word, count))


#  print '[%s]' % ', '.join(map(str, rdd.take(10)))


def updateFunc(new_values, last_sum):
    return sum(new_values) + (last_sum or 0)


lines = kvs.map(lambda x: x[1])
running_counts = lines.flatMap(lambda line: line.split(",")[4:6]).map(lambda apt: (apt, 1)).updateStateByKey(updateFunc)

# reduceByKey(lambda x, y: x + y)
top = running_counts.map(lambda x: (x[1], x[0])).transform(lambda rdd: rdd.sortByKey(False))
top.foreachRDD(print_top_list)
# top.pprint()
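
The fragment ends before the context is started; the usual closing lines, shown here as an assumed completion:

# Assumed completion: start the streaming job and block until it is stopped.
ssc.start()
ssc.awaitTermination()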
        temp_ave = sum(i) / len(i)
        ave_list.append(temp_ave)
    # print(len(ave_list))
    # print(sorted(ave_list))
    return sorted(ave_list)[-2]


# port_num = 9999
port_num = int(sys.argv[1])
# output_name = "task2.csv"
output_name = sys.argv[2]
output = open(output_name, 'w')
output.write("Time, Ground Truth, Estimation\n")
output.flush()
os.fsync(output.fileno())

sc = SparkContext("local[2]", "hw6")
ssc = StreamingContext(sc, 5)
ssc.checkpoint('checkpoint')

initialStateRDD = sc.parallelize([('temp_key', {'city': []})])
lines = ssc.socketTextStream("localhost", port_num)
key_city = lines.map(lambda x: (
    'temp_key', int(binascii.hexlify(json.loads(x)['city'].encode('utf8')), 16
                    ))).updateStateByKey(updateFunc,
                                         initialRDD=initialStateRDD)
key_city.pprint()
ssc.start()
ssc.awaitTermination()

# int(binascii.hexlify(new_value[1].encode('utf8')), 16)
Example #44
import sys
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.streaming import StreamingContext

APP_NAME = "Group2_2_Top10DestinationsOntimeDepartureFromX"
STREAMING_INTERVAL = 1

master = str(sys.argv[1]) # 'ec2-xx-xx-xx-xx.compute-1.amazonaws.com'
originAirport = str(sys.argv[2])
dataFilePathOnHdfs = "hdfs://{}/btsdata/aviation/ontime/".format(master)

conf = SparkConf().setAppName(APP_NAME).setMaster('spark://{}:7077'.format(master))
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, STREAMING_INTERVAL)
ssc.checkpoint('/tmp/ccc')

lines = ssc.textFileStream(dataFilePathOnHdfs)



# 2nd CLI argument (originAirport): 'SRQ', 'CMH', 'JFK', 'SEA', or 'BOS'
# map each line to (Carrier, Departure Delay), then combineByKey accumulates (sum, count) per carrier
res2_2 = lines.map(lambda line: line.split(",")) \
              .filter(lambda line: line[6] == originAirport) \
              .map(lambda line: (line[7], float(line[12]))) \
              .combineByKey(lambda x: (x, 1),
                            lambda x, y: (x[0] + y, x[1] + 1),
                            lambda x, y: (x[0] + y[0], x[1] + y[1])) \
              .map(lambda (key, (valueSum, count)): (key, valueSum / count)) \
              .transform(lambda rdd: rdd.sortByKey(ascending=True))

Example #45
        if process_state == 'DEAD_PROCESS': return PROC_STOPPED
    return PROC_UNKNOWN

def event_count_to_text((event,count)):
    if event == PROC_STARTED: return "Started: " + str(count)
    if event == PROC_STOPPED: return "Stopped: " + str(count)
    return ""

BATCH_DURATION = 10
WINDOW_DURATION = 60 
SLIDE_DURATION = 20

sc = SparkContext(appName='PythonStreamingQueueStream')
ssc = StreamingContext(sc, BATCH_DURATION)

ssc.checkpoint('ckpt')

ssc.socketTextStream("localhost", 9999)\
  .map(line_to_event)\
  .filter(lambda event: event == PROC_STARTED or event == PROC_STOPPED)\
  .map(lambda event: (event,1))\
  .reduceByKey(lambda count1,count2: count1+count2)\
  .pprint()
  # .reduceByKeyAndWindow(
  #     func=lambda count1, count2: count1 + count2,
  #     invFunc=lambda count1, count2: count1 - count2, 
  #     windowDuration=WINDOW_DURATION,
  #     slideDuration=SLIDE_DURATION)\
  # .map(event_count_to_text)\
  # .transform(lambda rdd: rdd.sortBy(lambda text: text))\
  # .saveAsTextFiles('process-stats')
Example #46
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":

    sc = SparkContext(appName="StreamingErrorCount")
    ssc = StreamingContext(sc, 1)
    
    ssc.checkpoint("hdfs:///user/hdp/streaming")
    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
    counts = lines.flatMap(lambda line: line.split(" "))\
                  .filter(lambda word:"ERROR" in word)\
                  .map(lambda word: (word, 1))\
                  .reduceByKey(lambda a, b: a+b)
    
    counts.pprint()
    ssc.start()
    ssc.awaitTermination()
Example #47
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SQLContext, Row

title = "Iteration #3"

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: " + sys.argv[0] + " <zk> <topic> </checkpoint/dir>", file=sys.stderr)
        exit(-1)

    zkQuorum, topic, checkpointDirectory = sys.argv[1:]

    sc = SparkContext(appName=title)
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint(checkpointDirectory)

    kafkaStream = KafkaUtils.createStream(ssc, zkQuorum, "iter-consumer", {topic: 1})

    event = kafkaStream.map(lambda x: x[1])
    lines = event.flatMap(lambda line: line.split("\n"))
    fields = lines.map(lambda f: f.split(","))
    hexandcall = fields.map(lambda f: (f[4], f[10]), 1)
    #callsign = fields.map(lambda c: (c[10]))
    #notnulls = hexandcall.filter(lambda n: (n != ''))
    #joined = notnulls.join(hexandcall)
    #callsigns = notnulls.map(lambda c: (c, 1))
    #hexidents = hexident.map(lambda h: (h, 1))
    #counts = callsigns.reduceByKeyAndWindow(lambda a, b: a+b, lambda a, b: a-b, 600, 10)
    counts = hexandcall.reduceByKeyAndWindow(lambda a, b: a+b, lambda a, b: a-b, 600, 10)
    counts.pprint()
#!/usr/bin/python
# -*- coding: utf-8 -*-
__author__ = "wxmimperio"

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    conf = SparkConf().setMaster("local[2]").setAppName("window_streaming_demo")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 3)
    # set the checkpoint directory on HDFS
    ssc.checkpoint("hdfs://localhost:9000/checkpiont/streaming_cp_log")

    lines = ssc.socketTextStream("spark-master", 9999)
    wordCounts = (
        lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).reduceByKey(lambda x, y: (x + y))
    )

    # every 3 seconds, compute counts over the previous 6 seconds of data
    windows = wordCounts.reduceByKeyAndWindow(lambda x, y: x + y, lambda x, y: x - y, 6, 3)
    windows.pprint()

    ssc.start()
    ssc.awaitTermination()
    if len(group) > 10:
        group = group[:10]
    return group


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: script.py <zk> <topic>", file=sys.stderr)
        exit(-1)

    zkQuorum, topic = sys.argv[1:]

    sc = SparkContext(appName="KafkaSparkStreaming")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("/tmp/q22")  # mandatory for updateStateByKey

    ks = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer",
                                 {topic: 21})

    st1 = ks.map(lambda x: x[1].split(','))
    st2 = st1.flatMap(lambda x: [x[3], x[4]])
    st3 = st2.map(lambda x: (x, 1))
    st4 = st3.reduceByKey(lambda x, y: x + y)
    st5 = st4.map(lambda x: (x[0], x[1])).updateStateByKey(updateFunction)
    st6 = st5.map(lambda (key, value): (True, [(key, value)]))
    st7 = st6.reduceByKey(gettop10)
    st7.pprint()

    ssc.start()
    ssc.awaitTermination()
import sys

from pyspark import SparkContext  # allow us to work with spark
from pyspark.streaming import StreamingContext  # allow to work with streams in spark

if __name__ == "__main__":
    sc = SparkContext("local[2]", "StreamingCount")
    sc.setLogLevel("WARN")

    ssc = StreamingContext(
        sc, 2
    )  # 2 is a batchInterval prop of the DStream created by this StreamingContext

    ssc.checkpoint('file:///tmp/spark')

    lines = ssc.socketTextStream(sys.argv[1],
                                 int(sys.argv[2]))  # hostname and port

    counts = lines.flatMap(lambda line: line.split(" "))\
        .filter(lambda word: "ERROR" in word)\
        .map(lambda word: (word, 1))\
        .reduceByKeyAndWindow(lambda a, b: a + b, lambda a, b: a - b, 20, 2)

    counts.pprint()

    ssc.start()
    ssc.awaitTermination()

# ncat -lk 9999
# spark-submit .\reduce_by_key_and_window.py localhost 9999
Example #51
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SQLContext, Row

title = "Iteration #3"

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: " + sys.argv[0] + " <zk> <topic> </checkpoint/dir>",
              file=sys.stderr)
        exit(-1)

    zkQuorum, topic, checkpointDirectory = sys.argv[1:]

    sc = SparkContext(appName=title)
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint(checkpointDirectory)

    kafkaStream = KafkaUtils.createStream(ssc, zkQuorum, "iter-consumer",
                                          {topic: 1})

    event = kafkaStream.map(lambda x: x[1])
    lines = event.flatMap(lambda line: line.split("\n"))
    fields = lines.map(lambda f: f.split(","))
    hexandcall = fields.map(lambda f: (f[4], f[10]), 1)
    #callsign = fields.map(lambda c: (c[10]))
    #notnulls = hexandcall.filter(lambda n: (n != ''))
    #joined = notnulls.join(hexandcall)
    #callsigns = notnulls.map(lambda c: (c, 1))
    #hexidents = hexident.map(lambda h: (h, 1))
    #counts = callsigns.reduceByKeyAndWindow(lambda a, b: a+b, lambda a, b: a-b, 600, 10)
    counts = hexandcall.reduceByKeyAndWindow(lambda a, b: a + b,
Example #52
            new_vals1 + last_vals1)


######
###### Main script #######
######

signal.signal(signal.SIGINT, signal_handler)

config = SparkConf()
config.set('spark.streaming.stopGracefullyOnShutdown', True)
#config.set('spark.yarn.executor.memoryOverhead', '2g')

sc = SparkContext(appName='g1ex3', conf=config, pyFiles=['flight.py'])
ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp/g1ex3')

lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))

filtered = lines.map(lambda line: line.split(","))\
                .map(lambda fields: Flight(fields))\
                .filter(lambda fl: fl.Cancelled == 0)\
                .map(lambda fl: (fl.DayOfWeek, (fl.ArrDelay, 1)))\
                .updateStateByKey(updateFunction)

filtered.foreachRDD(lambda rdd: print_rdd(rdd))

# start streaming process
ssc.start()

try:
Example #53
        print word

if __name__ == '__main__':
    
    l = StdOutListener()
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)

    stream = tweepy.Stream(auth, l)
    stream.filter(track=['cricket'], async=True)
    timeInterval = int(sys.argv[1])
    windowInterval = int(sys.argv[2])

    if windowInterval > timeInterval:
        print "Error: Window interval should be greater than time interval"
        sys.exit(1)

    time.sleep(timeInterval)

    ssc = StreamingContext(sc, 1)
    stream = ssc.queueStream(l.rdds)
    ssc.checkpoint("/Users/highlight/sparkvagrant")
    stream = stream.map(lambda line: (line, 1))

    stream = stream.reduceByKeyAndWindow(lambda x, y: x + y, lambda x, y: x - y, windowInterval, windowInterval, 20)
    
    stream.foreachRDD(get_output)

    ssc.start()
    ssc.awaitTermination()
os.environ["PYSPARK_PYTHON"]='/usr/bin/python3'
os.environ["PYSPARK_DRIVER_PYTHON"]='/usr/bin/python3'
def output(rdd):
	mlist = []
	for i in rdd.take(5):
		if (i != []):
			mlist.append(i)
	alist = [x[0] for x in mlist if x[0] != '']
	#blist = [(x[0],x[1]) for x in mlist]
	if (alist != []):
		print(",".join(alist))
	#print(blist)
conf=SparkConf()
conf.setAppName("BigData")
sc=SparkContext(conf=conf)
sc.setLogLevel("ERROR")
batch_interval = int(sys.argv[2])
window_size = int(sys.argv[1])
ssc=StreamingContext(sc,batch_interval)
ssc.checkpoint("/home/hadoop/checkpoint_BIGDATA424")

dataStream=ssc.socketTextStream("localhost",9009)
tweet=dataStream.map(lambda x: x.split(';')[7]).flatMap(lambda x: x.split(','))
tweet = tweet.countByValueAndWindow(window_size,batch_interval)
sortedtweets = tweet.transform(lambda rdd: rdd.sortBy(lambda x : (-x[1],x[0]),ascending=True))
sortedtweets.foreachRDD(lambda rdd: output(rdd))

ssc.start()
ssc.awaitTermination(100)
ssc.stop()
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
if __name__ == "__main__":
    # Create a SparkSession
    sc = SparkContext(appName="Sai-Streaming")
    ssc = StreamingContext(sc, 2)
    ssc.checkpoint("checkpoint")

    lines = ssc.socketTextStream('192.168.56.1', 23456)
    #lines = ssc.socketTextStream("localhost", 9999)
    lines.pprint()

    def countWords(newValues, lastSum):
        if lastSum is None:
            lastSum = 0
        return sum(newValues, lastSum)

    word_counts = lines.flatMap(lambda line: line.split(" "))\
        .filter(lambda w: w.startswith("#"))\
        .map(lambda word: (word,1))\
        .updateStateByKey(countWords)

    word_counts.pprint()

    ssc.start()
    ssc.awaitTermination()

#for starting this go to Terminal
#spark-submit "C:\SaiStudy - LEarn It All - Version9\SaiStudy - Spark_STream_Count.py" 192.168.56.1 9999
#nc -Lp 9999
def process_rdd(rdd):  
    sort = rdd.sortBy(lambda x: (-x[1],x[0])).filter(lambda y: y[0] !='').collect()
    if(sort!=[]):
    #print(sorted_list[:5])
        print(sort[0][0],sort[1][0],sort[2][0],sort[3][0],sort[4][0],sep=",")


def out(l):
  o=l.split(";")[7]
  if(',' not in o):
    return [o]
  return o.split(",")
  

conf=SparkConf()
conf.setAppName("BD3")
sc=SparkContext(conf=conf)

ssc=StreamingContext(sc,int(sys.argv[2]))
ssc.checkpoint("~/checkpoint_BIGDATA2")
dataStream=ssc.socketTextStream("localhost",9009)

hashtags=dataStream.window(int(sys.argv[1]),1)
all_hashtags=hashtags.flatMap(out)
result=all_hashtags.map(lambda h : (h,1))
final_result=result.reduceByKey(lambda x,y:int(x)+int(y))
final_result.foreachRDD(process_rdd)
ssc.start()
ssc.awaitTermination(25)
ssc.stop()
Example #57
from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if __name__ == "__main__":
    # Create the Spark context
    sc = SparkContext(appName="DataIngestionApp")
    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.WARN)
    # Create the Spark Streaming Context with 10 seconds batch interval
    ssc = StreamingContext(sc, 10)
    # Check point directory setting
    ssc.checkpoint("/tmp")
    # Zookeeper host
    zooKeeperQuorum="localhost"
    # Kafka message group
    messageGroup="sfb-consumer-group"
    # Kafka topic where the program is listening for the data
    # Reader TODO: Here only one topic is included; it can take a comma-separated string containing the list of topics.
    # Reader TODO: When using multiple topics, use your own logic to extract the right message and persist to its data store
    topics = "message"
    numThreads = 1    
    # Create a Kafka DStream
    kafkaStream = KafkaUtils.createStream(ssc, zooKeeperQuorum, messageGroup, {topics: numThreads})
    messageLines = kafkaStream.map(lambda x: x[1])
    # This is where the messages are printed to the console. Instead of this, implement your own persistence logic
    messageLines.pprint()
    # Start the streaming
		counter+=1
	if(len(string) > 0):
		print(string)
def tmp(x):
	for i in (x.split(';')[7]).split(','):
		return (i,1)

windowDuration = int(sys.argv[1])
slideInterval = int(sys.argv[2])

conf=SparkConf()
conf.setAppName("BigData")
sc=SparkContext(conf=conf)

ssc=StreamingContext(sc,1)
ssc.checkpoint("/checkpoint_BIGDATA")

dataStream=ssc.socketTextStream("localhost",9009)
temp = dataStream.window(windowDuration,slideInterval)
# dataStream.pprint()
tweet=temp.flatMap(lambda x:((x.split(';')[7]).split(",")))
tweet = tweet.map(lambda x:(x,1))
# OR
#tweet=dataStream.map(lambda w:(w.split(';')[0],1))
count=tweet.reduceByKey(lambda x,y:x+y)
#count.pprint()

#TO maintain state
totalcount=tweet.updateStateByKey(aggregate_tweets_count)
# totalcount.pprint()
#To Perform operation on each RDD
    rdd.foreachPartition(partitionOfRecordsFun)


if __name__ == "__main__":
    # checkpoint_path = "hdfs://spark-master:9000/checkpiont/streaming_cp_log"
    checkpoint_path = "tachyon-ft://spark-master:19998/checkpoint/streaming_log"
    kafka_topic_list = ["realdata_receive"]
    broker_list_dit = {"metadata.broker.list": "192.168.108.222:9092"}

    conf = SparkConf().setAppName("streaming_kafka_send")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 5)

    # setDefaultEncoding()
    # ssc = initStreamingContext("streaming_kafka_deltaT", "local[2]", 7)
    ssc.checkpoint(checkpoint_path)

    kvs = KafkaUtils.createDirectStream(ssc, kafka_topic_list, broker_list_dit)
    deltaT = kvs.flatMap(lambda lines: toJson(lines)).map(lambda x: (x["oid"], x)). \
        updateStateByKey(updateFun).foreachRDD(foreachPartitionFun)

    # ensureOffset(kvs=kvs)

    offsetRanges = []

    def storeOffsetRange(rdd):
        global offsetRanges
        offsetRanges = rdd.offsetRanges()
        return rdd

    def printOffsetRange(rdd):
# # do processing for each RDD generated in each interval
# tags_totals.foreachRDD(su.process_rdd)
# # start the streaming computation
# ssc.start()
# # wait for the streaming to finish
# ssc.awaitTermination()

conf = SparkConf()
conf.setAppName("TwitterStreamApp")
# create spark context with the above configuration
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
# create the Streaming Context from the above spark context with interval size 2 seconds
ssc = StreamingContext(sc, 2)
# setting a checkpoint to allow RDD recovery
ssc.checkpoint("checkpoint_TwitterApp")
# read data from port 9009
dataStream = ssc.socketTextStream("localhost", 9009)

# split each tweet into words
words = dataStream.flatMap(lambda line: line.split(" "))
# filter the words to get only hashtags, then map each hashtag to be a pair of (hashtag,1)
hashtags = words.filter(lambda w: '#' in w).map(lambda x: (x, 1))
# adding the count of each hashtag to its last count
tags_totals = hashtags.updateStateByKey(su.aggregate_tags_count)
# do processing for each RDD generated in each interval
tags_totals.foreachRDD(su.process_rdd)
# start the streaming computation
ssc.start()
# wait for the streaming to finish
ssc.awaitTermination()
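
aggregate_tags_count and process_rdd come from the su helper module imported by this script; a minimal sketch of what such helpers typically look like, assuming process_rdd only needs to print the top ten hashtags of each batch:

def aggregate_tags_count(new_values, total_sum):
    # Running total per hashtag for updateStateByKey.
    return sum(new_values) + (total_sum or 0)

def process_rdd(time, rdd):
    # foreachRDD may pass (time, rdd); print the current top ten hashtags.
    print("----------- %s -----------" % str(time))
    try:
        for hashtag, count in rdd.sortBy(lambda kv: -kv[1]).take(10):
            print("%s: %d" % (hashtag, count))
    except Exception as e:
        print("Error: %s" % e)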