def load_flume(self, ssc: StreamingContext) -> DStream:
    # Stream that pulls inputs from Flume (the host name may need to change)
    print("LOADING FLUME")
    input_stream = FlumeUtils.createStream(ssc, self.__flume_host, self.__flume_port)
    d_stream = input_stream.map(self.__parse_json).transform(
        lambda rdd: self.__convert_service_format(rdd))
    return d_stream
def _startContext(self, n, compressed):
    # Start the StreamingContext and also collect the result
    dstream = FlumeUtils.createStream(self.ssc, "localhost", self._utils.getTestPort(),
                                      enableDecompression=compressed)
    result = []

    def get_output(_, rdd):
        for event in rdd.collect():
            if len(result) < n:
                result.append(event)

    dstream.foreachRDD(get_output)
    self.ssc.start()
    return result
def main():
    sc = SparkContext(appName="News_Steam_Analysis")

    # Create the Flume stream. The batch interval (300 s here) determines
    # how often the top stories are recomputed.
    ssc = StreamingContext(sc, 300)
    flume_strm = FlumeUtils.createStream(ssc, "localhost", 9999, bodyDecoder=lambda x: x)

    # bodyDecoder=lambda x: x keeps the raw event body; parse it as JSON per event
    lines = flume_strm.map(lambda kv: json.loads(kv[1]))
    lines.foreachRDD(get_trending_news)

    ssc.start()
    ssc.awaitTermination()
def extractURLRequest(line):
    # "pattern" is a compiled log-parsing regex defined earlier in the script (not shown here)
    exp = pattern.match(line)
    if exp:
        request = exp.groupdict()["request"]
        if request:
            requestFields = request.split()
            if len(requestFields) > 1:
                return requestFields[1]


if __name__ == "__main__":
    sc = SparkContext(appName="StreamingFlumeLogAggregator")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, 1)

    # DStream object named flumeStream, fed by events pushed from Flume
    flumeStream = FlumeUtils.createStream(ssc, "localhost", 9092)
    lines = flumeStream.map(lambda x: x[1])
    urls = lines.map(extractURLRequest)

    # Reduce by URL over a 5-minute window sliding every second
    urlCounts = urls.map(lambda x: (x, 1)).reduceByKeyAndWindow(
        lambda x, y: x + y, lambda x, y: x - y, 300, 1)

    # Sort and print the results
    sortedResults = urlCounts.transform(
        lambda rdd: rdd.sortBy(lambda x: x[1], False))
    sortedResults.pprint()

    ssc.checkpoint("/home/maria_dev/checkpoint")
    ssc.start()
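Spark Streaming only accepts the inverse-reduce form of reduceByKeyAndWindow used above (adding batches that enter the window and subtracting batches that leave it) when checkpointing is enabled, which is why ssc.checkpoint() appears before ssc.start(). A minimal self-contained sketch of that pattern, with the host, port, and checkpoint path as placeholder assumptions:

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.flume import FlumeUtils

sc = SparkContext(appName="WindowedFlumeCounts")
ssc = StreamingContext(sc, 1)                    # 1-second batches
ssc.checkpoint("/tmp/flume-checkpoint")          # required for the inverse-reduce form

events = FlumeUtils.createStream(ssc, "localhost", 9092)   # placeholder host/port
pairs = events.map(lambda e: (e[1], 1))          # key each event body with a count of 1

# Add counts entering the 5-minute window, subtract counts leaving it, every second
windowed = pairs.reduceByKeyAndWindow(
    lambda a, b: a + b,    # "add" newly arrived counts
    lambda a, b: a - b,    # "subtract" counts sliding out of the window
    windowDuration=300,
    slideDuration=1)
windowed.pprint()

ssc.start()
ssc.awaitTermination()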
from __future__ import print_function

import sys
import uuid

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.flume import FlumeUtils

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: flume_wordcount.py <hostname> <port>", file=sys.stderr)
        sys.exit(-1)

    uid = str(uuid.uuid4())
    sc = SparkContext(appName="PythonStreamingFlumeWordCount")
    ssc = StreamingContext(sc, 1)

    hostname, port = sys.argv[1:]
    kvs = FlumeUtils.createStream(ssc, hostname, int(port))
    lines = kvs.map(lambda x: x[1])

    # Keep only "sales" lines, key them by this run's UUID, reduce per batch,
    # and write each batch to HDFS as text files.
    lines.filter(lambda line: "sales" in line.lower()) \
         .map(lambda word: (uid, word)) \
         .reduceByKey(lambda a, b: a + b) \
         .saveAsTextFiles('hdfs://0.0.0.0:8020/weblogs/sales_', 'txt')

    ssc.start()
    ssc.awaitTermination()
    sys.stdout.flush()
    productSum = NewProductSum(pair[0])
    for it in pair[1]:
        productSum['revenue'] += Decimal(it['revenue'])
        if it['type'] == 'view':
            productSum['views'] += 1
        else:
            productSum['purchases'] += 1
    return result


def ProcessInput(rdd):
    rdd.groupBy(lambda event: event['product_id']).map(Test).foreachPartition(WriteData)


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: flume_wordcount.py <hostname> <port>", file=sys.stderr)
        sys.exit(-1)

    sparkContext = SparkContext(appName="SparkFlumeStreaming")
    sparkContext.setLogLevel('ERROR')
    streamingContext = StreamingContext(sparkContext, 1)

    hostname, port = sys.argv[1:]
    print('Start listening at {}:{}'.format(hostname, port))

    stream = FlumeUtils.createStream(streamingContext, hostname, int(port))
    stream.map(lambda x: x[0]).window(60, 60).foreachRDD(ProcessInput)

    streamingContext.start()
    streamingContext.awaitTermination()
def extractURLRequest(line):
    # "pattern" is a compiled log-parsing regex defined earlier in the script (not shown here)
    exp = pattern.match(line)
    if exp:
        request = exp.groupdict()["request"]
        if request:
            requestFields = request.split()
            if len(requestFields) > 1:
                return requestFields[1]


if __name__ == "__main__":
    # Set up the SparkContext, log level, and a StreamingContext with a 1-second batch interval
    sc = SparkContext(appName="StreamingFlumeLogAggregator")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, 1)

    # Use the Flume utility library: push model from Flume to Spark
    flumeStream = FlumeUtils.createStream(ssc, "192.168.1.59", 9092)

    # Map operations: extract the event body, then the requested URL
    lines = flumeStream.map(lambda x: x[1])
    urls = lines.map(extractURLRequest)

    # Reduce by URL over a 5-minute window sliding every second
    urlCounts = urls.map(lambda x: (x, 1)).reduceByKeyAndWindow(
        lambda x, y: x + y, lambda x, y: x - y, 300, 1)

    # Sort and print the results
    sortedResults = urlCounts.transform(
        lambda rdd: rdd.sortBy(lambda x: x[1], False))
    sortedResults.pprint()

    # Create checkpoint directory
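The excerpt above stops at the checkpoint comment; presumably the script goes on to set the checkpoint directory and start the context. A hedged sketch of that continuation (the directory path is an assumption, not taken from the source):

ssc.checkpoint("/tmp/checkpoint")   # assumed path; the original value is not shown
ssc.start()
ssc.awaitTermination()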
def salvaResultado(rdd):
    # Put the entries into the format (request_time, responder, response_time)
    linhas = rdd.map(lambda linha: (converte_data_redis(linha[6][1:len(linha[6]) - 1]),
                                    linha[8].split("/")[1],
                                    linha[9].split("/")[3]))
    for log in linhas.collect():
        salva_tempo_mais_recente(log[0])
        salva_req_redis(1, log[0])
        salva_t_srv_queue_redis(log[2], log[0])


# Create a local StreamingContext with two worker threads and a 20-second batch interval
sc = SparkContext("local[2]", "acessos")
ssc = StreamingContext(sc, 20)

stream_flume_logs = FlumeUtils.createStream(ssc, "10.125.8.253", 44444)

# Take each log line and keep only the haproxy entries
linha_log = stream_flume_logs.map(lambda a: a[1]).filter(lambda a: "haproxy" in a)
# words = linha_log.flatMap(lambda line: line.split(" "))
words = linha_log.map(lambda line: line.split())
# words.pprint()

# Process the data and save the results to the database
words.foreachRDD(salvaResultado)

ssc.start()             # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.flume import FlumeUtils

# Local SparkContext and StreamingContext (batch interval of 1 second)
sc = SparkContext(master="local[*]",
                  appName="Flume-DStream-StdOut",
                  conf=SparkConf().set("spark.jars.packages",
                                       "org.apache.spark:spark-streaming-flume_2.11:2.4.7"))
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 1)

# 1. Input data: create a DStream from Apache Flume
stream = FlumeUtils.createStream(ssc, "localhost", 4444)

# 2. Data processing: extract the event body (element [1] of each (headers, body) pair)
lines = stream.map(lambda x: x[1])

# 3. Output data: show the result in the console
lines.pprint()

ssc.start()
ssc.awaitTermination()
def load_flume(self, ssc: StreamingContext) -> DStream:
    stream = FlumeUtils.createStream(ssc, self.__flume_host, self.__flume_port)
    # map applies an operation to each element in the stream, whereas
    # transform applies an operation at the RDD level
    return stream.map(self.__parse_json) \
        .transform(lambda rdd: self.__convert_service_format(rdd))
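To make the map/transform distinction concrete, here is a small self-contained sketch; json.loads and repartition merely stand in for the private __parse_json and __convert_service_format helpers, and the host and port are placeholders:

import json

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.flume import FlumeUtils

sc = SparkContext(appName="MapVsTransform")
ssc = StreamingContext(sc, 1)
stream = FlumeUtils.createStream(ssc, "localhost", 4444)   # placeholder host/port

# map: applied once per element; each Flume event arrives as a (headers, body) tuple
parsed = stream.map(lambda event: json.loads(event[1]))

# transform: applied once per batch and handed the whole RDD,
# so RDD-level operations (repartitioning, joins, custom conversions) are possible
converted = parsed.transform(lambda rdd: rdd.repartition(4))

converted.pprint()
ssc.start()
ssc.awaitTermination()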
import numpy as np  # needed for np.savetxt below

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.flume import FlumeUtils


def salvaResultado(rdd):
    # a = rdd.foreach(lambda dado: dado.sortByKey())
    a = rdd.sortByKey().collect()
    np.savetxt('/home/marceloca/teste', a, fmt='%s')
    # rdd.foreach(lambda dado: np.savetxt('/home/marceloca/teste', dado, fmt='%s'))


# Create a local StreamingContext with two worker threads and a 10-second batch interval
sc = SparkContext("local[2]", "acessos")
ssc = StreamingContext(sc, 10)

stream_flume_logs = FlumeUtils.createStream(ssc, "192.168.0.13", 44444)

# Dict used to convert month abbreviations into numbers when parsing dates
cal = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
       'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08', 'Sep': '09',
       'Oct': '10', 'Nov': '11', 'Dec': '12'}

# Take each log line
linha_log = stream_flume_logs.map(lambda a: a[1])
# words = linha_log.flatMap(lambda line: line.split(" "))
words = linha_log.map(lambda line: line.split(" "))

# Extract the date of each access
datas = words.map(lambda data: str(cal[data[0]]) + str(data[1]))
print("Usage: flume_wordcount.py <hostname> <port>", file=sys.stderr) exit(-1) sc = SparkContext(appName="PythonStreamingFlumeWordCount") ssc = StreamingContext(sc, 1) # Create regular expression pattern to parse log messages' mainbody. PATTERN = '^(\S*\s\S*\s\S*)(.*)' def parseLogLine(logline): match = re.search(PATTERN, logline) return (Row( date_time=match.group(1), mainbody=match.group(2), ), 1) # Pairwise count each lines,then print out. hostname, port = sys.argv[1:] kvs = FlumeUtils.createStream(ssc, hostname, int(port)) lines = kvs.map(lambda x: x[1]) Errorcounts = (lines.map(parseLogLine) .filter(lambda s: s[1] == 1) .map(lambda s: s[0].mainbody) .filter(lambda s: "ERROR" in s) .map(lambda log: (log, 1)) .reduceByKey(lambda a, b: a + b)) Warningcounts = (lines.map(parseLogLine) .filter(lambda s: s[1] == 1) .map(lambda s: s[0].mainbody) .filter(lambda s: "WARNING" in s) .map(lambda log: (log, 1)) .reduceByKey(lambda a, b: a + b)) Errorcounts.pprint() Warningcounts.pprint()
# An earlier variant that parsed each event body as JSON:
#
# from pyspark.streaming import StreamingContext
# from pyspark import SparkContext
# from pyspark.streaming.flume import FlumeUtils
#
# sc = SparkContext()
# ssc = StreamingContext(sc, 10)
# flumeStream = FlumeUtils.createStream(ssc, "localhost", 6669)
# result = flumeStream.map(lambda x: json.loads(x[1]))
# result.pprint()
# ssc.start()
# ssc.awaitTermination()

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.flume import FlumeUtils

sc = SparkContext(appName="PythonStreamingFlumeWordCount")
ssc = StreamingContext(sc, 10)

kvs = FlumeUtils.createStream(ssc, "localhost", 6669)
lines = kvs.map(lambda x: x[1])
counts = lines.flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b)
counts.pprint()

ssc.start()
ssc.awaitTermination()
# https://issues.apache.org/jira/browse/PARQUET-222 - Parquet writer memory allocation
def process_proxy(time, rdd):
    output_rdd = rdd.map(lambda x: str(time) + ' ' + x[0]['host'] + ' ' + x[1]) \
        .filter(lambda x: '-net-bc' in x).map(parse) \
        .filter(lambda x: isinstance(x, Row)).repartition(10)
    return output_rdd


'''Main function'''
if __name__ == '__main__':
    appConfig = conf.Config()
    sc = SparkContext(conf=appConfig.setSparkConf())
    ssc = StreamingContext(sc, 600)
    logParser = Parser(type='flume')

    flumeStream = FlumeUtils.createStream(ssc, '10.129.4.182', 5141)
    flumeStream1 = FlumeUtils.createStream(ssc, '10.129.4.175', 5141)
    flumeStream2 = FlumeUtils.createStream(ssc, '10.129.4.174', 5141)
    flumeStream3 = FlumeUtils.createStream(ssc, '10.129.4.178', 5141)
    unionStream = flumeStream.union(flumeStream1).union(flumeStream2).union(flumeStream3)

    # fwDStream = flumeStream.transform(process_fw)
    proxyDStream = unionStream.transform(process_proxy)

    # fwDStream.foreachRDD(save_fw)
    proxyDStream.foreachRDD(save_proxy)
    # proxyDStream.saveAsTextFiles("sg_")

    '''
    genericRDD = rdd.filter(lambda x: any(y in x[0]['host'] for y in ['msr-off-fw', '-net-bc']) == False)
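As a side note, chained DStream.union calls can be collapsed with StreamingContext.union; a short sketch reusing the ssc and receiver addresses from the snippet above:

# Combine the four Flume receivers in a single call instead of chained unions
streams = [FlumeUtils.createStream(ssc, host, 5141)
           for host in ('10.129.4.182', '10.129.4.175', '10.129.4.174', '10.129.4.178')]
unionStream = ssc.union(*streams)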
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import pyspark
from pyspark.streaming.flume import FlumeUtils

import pyspark_init as pi

ssc = pi.streaming_init('streaming_flume1', 'local[2]', 3)
host = 'localhost'
port = 44444

dsm = FlumeUtils.createStream(ssc, host, port, pyspark.StorageLevel.MEMORY_AND_DISK_SER_2)
dsm.count().map(lambda x: 'Received ' + str(x) + ' Flume events!').pprint()

ssc.start()
ssc.awaitTerminationOrTimeout(120)
ssc.stop()
# -*- coding: UTF-8 -*-
# Spark Streaming + Flume
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.flume import FlumeUtils

sc = SparkContext("yarn", "FlumeWordCount")
# Batch interval of 2 seconds
ssc = StreamingContext(sc, 2)

# Receive Flume events from the given IP & port
lines = FlumeUtils.createStream(ssc, "1.1.1.1", 12345)
lines1 = lines.map(lambda x: x[1])

# Split the strings received in each 2-second batch
words = lines1.flatMap(lambda line: line.split(" "))

# Word count
pairs = words.map(lambda word: (word, 1))
wordcounts = pairs.reduceByKey(lambda x, y: x + y)

# Write output files to HDFS, named /tmp/flume-<timestamp>
wordcounts.saveAsTextFiles("/tmp/flume")
# Inspect the contents
wordcounts.pprint()

# Start Spark Streaming
ssc.start()
# Wait for the computation to terminate
ssc.awaitTermination()