Example #1
    def _writeAndVerify(self, ports):
        # Set up the streaming context and input streams
        ssc = StreamingContext(self.sc, self.duration)
        try:
            addresses = [("localhost", port) for port in ports]
            dstream = FlumeUtils.createPollingStream(
                ssc,
                addresses,
                maxBatchSize=self._utils.eventsPerBatch(),
                parallelism=5)
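            # maxBatchSize caps how many events are pulled from the Spark sink in a
            # single RPC call, and parallelism is the number of concurrent requests the
            # receiver makes to the sink (per the FlumeUtils.createPollingStream parameter docs).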
            outputBuffer = []

            def get_output(_, rdd):
                for e in rdd.collect():
                    outputBuffer.append(e)

            dstream.foreachRDD(get_output)
            ssc.start()
            self._utils.sendDatAndEnsureAllDataHasBeenReceived()

            self.wait_for(outputBuffer, self._utils.getTotalEvents())
            outputHeaders = [event[0] for event in outputBuffer]
            outputBodies = [event[1] for event in outputBuffer]
            self._utils.assertOutput(outputHeaders, outputBodies)
        finally:
            ssc.stop(False)
Example #2
    def _writeAndVerify(self, ports):
        # Set up the streaming context and input streams
        ssc = StreamingContext(self.sc, self.duration)
        try:
            addresses = [("localhost", port) for port in ports]
            dstream = FlumeUtils.createPollingStream(
                ssc,
                addresses,
                maxBatchSize=self._utils.eventsPerBatch(),
                parallelism=5)
            outputBuffer = []

            def get_output(_, rdd):
                for e in rdd.collect():
                    outputBuffer.append(e)

            dstream.foreachRDD(get_output)
            ssc.start()
            self._utils.sendDatAndEnsureAllDataHasBeenReceived()

            self.wait_for(outputBuffer, self._utils.getTotalEvents())
            outputHeaders = [event[0] for event in outputBuffer]
            outputBodies = [event[1] for event in outputBuffer]
            self._utils.assertOutput(outputHeaders, outputBodies)
        finally:
            ssc.stop(False)
Example #3
    def load_flume(self, ssc: StreamingContext) -> DStream:
        # stream that pulls inputs from Flume
        # maybe change host name
        print("LOADING FLUME")
        input_stream = FlumeUtils.createStream(ssc, self.__flume_host,
                                               self.__flume_port)
        d_stream = input_stream.map(self.__parse_json).transform(
            lambda rdd: self.__convert_service_format(rdd))
        return d_stream
Example #4
    def _startContext(self, n, compressed):
        # Start the StreamingContext and also collect the result
        dstream = FlumeUtils.createStream(self.ssc, "localhost", self._utils.getTestPort(),
                                          enableDecompression=compressed)
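        # enableDecompression asks the receiver's Netty server to decompress the
        # incoming stream (assumed to pair with a Flume Avro sink that has
        # compression enabled, e.g. compression-type = deflate).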
        result = []

        def get_output(_, rdd):
            for event in rdd.collect():
                if len(result) < n:
                    result.append(event)
        dstream.foreachRDD(get_output)
        self.ssc.start()
        return result
Example #5
    def _startContext(self, n, compressed):
        # Start the StreamingContext and also collect the result
        dstream = FlumeUtils.createStream(self.ssc, "localhost", self._utils.getTestPort(),
                                          enableDecompression=compressed)
        result = []

        def get_output(_, rdd):
            for event in rdd.collect():
                if len(result) < n:
                    result.append(event)
        dstream.foreachRDD(get_output)
        self.ssc.start()
        return result
Example #6
def main():
    sc = SparkContext(appName="News_Steam_Analysis")

    # Create the flume stream
    ssc = StreamingContext(
        sc, 300
    )  # Use the time here to decide what should be the interval for the top stories.
    flume_strm = FlumeUtils.createStream(ssc,
                                         "localhost",
                                         9999,
                                         bodyDecoder=lambda x: x)
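    # Note: bodyDecoder defaults to UTF-8 decoding of the event body; the identity
    # lambda above keeps the raw bytes so json.loads below receives them unchanged.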

    lines = flume_strm.map(lambda kv: json.loads(kv[1]))

    lines.foreachRDD(get_trending_news)

    ssc.start()
    ssc.awaitTermination()
Example #7
if __name__ == "__main__":
    # if len(sys.argv) != 3:
    #     print("Usage: kafka_wordcount.py <zk> <topic>", file=sys.stderr)
    #     exit(-1)

    PYSPARK_PYTHON = "C:\\Python27\\python.exe"  #多版本python情况下,需要配置这个变量指定使用哪个版本
    os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON

    sc = SparkContext(appName="wc002")
    sqlContext = SQLContext(sc)
    sc.setLogLevel("ERROR")

    ssc = StreamingContext(sc, 5)
    address = [("cdh5-slave2", 9999)]

    fps = FlumeUtils.createPollingStream(ssc, address)

    # ssc.checkpoint("hdfs://cdh-master:8020/checkpoint")     #提交任务的用户要有目录的读写权限!
    # lines = fps.map(lambda x: (x[1])).pprint()


    def p(x):
        print(type(x), x)

    def get_field_value(row):
        # count
        # uuid
        # title
        # reason_type
        # caseid
        # province
Example #8
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.flume import FlumeUtils

# Local SparkContext and StreamingContext (batch interval of 1 second)
sc = SparkContext(master="local[*]",
                  appName="Flume-DStream-StdOut",
                  conf=SparkConf()
                  .set("spark.jars.packages", "org.apache.spark:spark-streaming-flume_2.11:2.4.7"))
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 1)

# 1. Input data: create a DStream from Apache Flume
stream = FlumeUtils.createStream(ssc, "localhost", 4444)

# 2. Data processing: extract the event body (the second element of each (headers, body) pair)
lines = stream.map(lambda x: x[1])

# 3. Output data: show result in the console
lines.pprint()

ssc.start()
ssc.awaitTermination()
        cmd = "rm -rf /tmp/Flume_spark_Streaming_Test/*"
        os.system(cmd)

        from pyspark.streaming.flume import FlumeUtils

        port = 9999
        addresses = {(hostname, port)}

        # Pull-based Approach using a Custom Sink
        '''
        Flume pushes data into the sink, where it stays buffered. Spark Streaming uses a
        reliable Flume receiver (JAR: org.apache.spark.streaming.flume.sink.SparkSink) and
        transactions to pull data from the sink.
        '''
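        # For this pull-based mode the Flume agent is assumed to be configured with the
        # Spark sink listening on the same host/port, roughly:
        #   agent.sinks.spark.type = org.apache.spark.streaming.flume.sink.SparkSink
        #   agent.sinks.spark.hostname = <bind host>
        #   agent.sinks.spark.port = 9999
        #   agent.sinks.spark.channel = memoryChannel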
        fstream = FlumeUtils.createPollingStream(ssc, addresses)
        #fstream.pprint() # ({}, u'naresh,kumar,22')

        input_stream = fstream.map(lambda x: x[1])

    # Processing the DStream
    '''
    This step does the required processing/filtering on the main DStream and yields a
    tuple, a list, or the raw value (when input_stream is passed on without any map/filter).

    If it is unclear what type (list/tuple/raw) a DStream carries after a map/filter, just
    print type(xyz) inside the function that receives the DStream. The output appears in
    one of the executors' logs (but NOT on the console); from there you can tell the type
    and process the records accordingly.
    '''
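    # A minimal sketch of the type-inspection trick described above; show_type is an
    # illustrative helper, not part of the original snippet.
    def show_type(record):
        # runs on an executor, so the output lands in that executor's stdout,
        # not on the driver console
        print(type(record), record)

    input_stream.foreachRDD(lambda rdd: rdd.foreach(show_type))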

    # This sends records as Tuple - (u'naresh', u'kumar', u'21')
Example #10
def salvaResultado(rdd):
    # put the entries into the format
    # (request_time, responder, response_time)
    linhas = rdd.map(lambda linha: (converte_data_redis(linha[6][1:len(linha[6])-1]), linha[8].split("/")[1], linha[9].split("/")[3]))

    for log in linhas.collect():
      salva_tempo_mais_recente(log[0])
      salva_req_redis(1, log[0])
      salva_t_srv_queue_redis(log[2], log[0])

# Create a local StreamingContext with two worker threads and a 20-second batch interval

sc = SparkContext("local[2]", "acessos")
ssc = StreamingContext(sc, 20)
stream_flume_logs = FlumeUtils.createStream(ssc, "10.125.8.253", 44444)

# Grab each line of the log
linha_log = stream_flume_logs.map(lambda a: a[1]).filter(lambda a: "haproxy" in a)

#words = linha_log.flatMap(lambda line: line.split(" "))
words = linha_log.map(lambda line: line.split())
#words.pprint()

# Process the data and save it to the InfluxDB database
words.foreachRDD(salvaResultado)

ssc.start()             # Start the computation

ssc.awaitTermination()  # Wait for the computation to terminate
Example #11
    def load_flume(self, ssc: StreamingContext) -> DStream:
        stream = FlumeUtils.createStream(ssc, self.__flume_host,
                                         self.__flume_port)
        # Map applies an operation to each element in the stream, whereas
        # transform applies an operation on an RDD level
        return stream.map(self.__parse_json) \
            .transform(lambda rdd: self.__convert_service_format(rdd))
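    # A hedged illustration of the map-vs-transform distinction noted above (generic
    # names, not from the original class): both lines yield the same parsed DStream,
    # but transform also allows arbitrary RDD-level operations (sortBy, joins,
    # repartitioning) on each batch.
    #
    #   parsed = stream.map(json.loads)                              # per element
    #   parsed = stream.transform(lambda rdd: rdd.map(json.loads))   # per RDD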
Example #12
        request = exp.groupdict()["request"]
        if request:
            requestFields = request.split()
            if (len(requestFields) > 1):
                return requestFields[1]


if __name__ == "__main__":

    # set up the SparkContext, set the log level, and create the StreamingContext with a 1-second interval
    sc = SparkContext(appName="StreamingFlumeLogAggregator")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, 1)

    # use the Flume utils library; push model from Flume to Spark
    flumeStream = FlumeUtils.createStream(ssc, "192.168.1.59", 9092)

    #map operation
    lines = flumeStream.map(lambda x: x[1])
    urls = lines.map(extractURLRequest)

    # Reduce by URL over a 5-minute window sliding every second
    urlCounts = urls.map(lambda x: (x, 1)).reduceByKeyAndWindow(
        lambda x, y: x + y, lambda x, y: x - y, 300, 1)

    # Sort and print the results
    sortedResults = urlCounts.transform(
        lambda rdd: rdd.sortBy(lambda x: x[1], False))
    sortedResults.pprint()

    # create checkpoint directory
Example #13
    exp = pattern.match(line)
    if exp:
        request = exp.groupdict()["request"]
        if request:
            requestFields = request.split()
            if (len(requestFields) > 1):
                return requestFields[1]


if __name__ == "__main__":

    sc = SparkContext(appName="StreamingFlumeLogAggregator")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, 1)

    flumeStream = FlumeUtils.createStream(ssc, "localhost", 9092)

    lines = flumeStream.map(lambda x: x[1])
    urls = lines.map(extractURLRequest)

    # Reduce by URL over a 5-minute window sliding every second
    urlCounts = urls.map(lambda x: (x, 1)).reduceByKeyAndWindow(
        lambda x, y: x + y, lambda x, y: x - y, 300, 1)

    # Sort and print the results
    sortedResults = urlCounts.transform(
        lambda rdd: rdd.sortBy(lambda x: x[1], False))
    sortedResults.pprint()

    ssc.checkpoint("/home/maria_dev/checkpoint")
    ssc.start()
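    # Note on the windowed reduce above: supplying the inverse function (x - y) lets Spark
    # update the 5-minute window incrementally instead of recomputing it each second, and
    # this incremental form requires checkpointing, hence ssc.checkpoint(...) before
    # ssc.start(). With explicit keyword arguments from the PySpark API the call would be,
    # roughly:
    #
    #   urls.map(lambda x: (x, 1)).reduceByKeyAndWindow(
    #       func=lambda x, y: x + y, invFunc=lambda x, y: x - y,
    #       windowDuration=300, slideDuration=1)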
Example #14
    sys.stdout.flush()
    productSum = NewProductSum(pair[0])
    for it in pair[1]:
        productSum['revenue'] += Decimal(it['revenue'])
        if it['type'] == 'view': productSum['views'] += 1
        else: productSum['purchases'] += 1

    return result

def ProcessInput(rdd):
    rdd.groupBy(lambda record: record['product_id']).map(Test).foreachPartition(WriteData)


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: flume_wordcount.py <hostname> <port>", file = sys.stderr)
        sys.exit(-1)

    sparkContext = SparkContext(appName = "SparkFlumeStreaming")
    sparkContext.setLogLevel('ERROR')

    streamingContext = StreamingContext(sparkContext, 1)

    hostname, port = sys.argv[1:]
    print('Start listening at {}:{}'.format(hostname, port))
    stream = FlumeUtils.createStream(streamingContext, hostname, int(port))

    stream.map(lambda x: x[0]).window(60, 60).foreachRDD(ProcessInput)
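    # window(60, 60) makes the window length equal to the slide interval, so each
    # 60 seconds of Flume event headers is processed exactly once (a tumbling window).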

    streamingContext.start()
    streamingContext.awaitTermination()
Example #15
#https://issues.apache.org/jira/browse/PARQUET-222 - Parquet writer memory allocation
def process_proxy(time, rdd):
    output_rdd = rdd.map(lambda x: str(time) + ' ' + x[0]['host'] + ' ' + x[1]) \
        .filter(lambda x: '-net-bc' in x).map(parse) \
        .filter(lambda x: isinstance(x, Row)).repartition(10)
    return output_rdd


'''Main function'''
if __name__ == '__main__':
    appConfig = conf.Config()
    sc = SparkContext(conf=appConfig.setSparkConf())
    ssc = StreamingContext(sc, 600)
    logParser = Parser(type='flume')

    flumeStream = FlumeUtils.createStream(ssc, '10.129.4.182', 5141)
    flumeStream1 = FlumeUtils.createStream(ssc, '10.129.4.175', 5141)
    flumeStream2 = FlumeUtils.createStream(ssc, '10.129.4.174', 5141)
    flumeStream3 = FlumeUtils.createStream(ssc, '10.129.4.178', 5141)

    unionStream = flumeStream.union(flumeStream1).union(flumeStream2).union(flumeStream3)

    #fwDStream = flumeStream.transform(process_fw)
    proxyDStream = unionStream.transform(process_proxy)

    #fwDStream.foreachRDD(save_fw)
    proxyDStream.foreachRDD(save_proxy)
    #proxyDStream.saveAsTextFiles("sg_")

    '''
    genericRDD = rdd.filter(lambda x: any(y in x[0]['host'] for y in ['msr-off-fw', '-net-bc']) == False)
Example #16
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import pyspark_init as pi
from pyspark.streaming.flume import FlumeUtils
import pyspark

ssc = pi.streaming_init('streaming_flume1', 'local[2]', 3)
host = 'localhost'
port = 44444
dsm = FlumeUtils.createStream(ssc, host, port,
                              pyspark.StorageLevel.MEMORY_AND_DISK_SER_2)
dsm.count().map(lambda x: 'Received ' + str(x) + ' Flume events!!!!').pprint()
ssc.start()
ssc.awaitTerminationOrTimeout(120)
ssc.stop()
Example #17
# -*- coding: UTF-8 -*-
### Spark Streaming && Flume
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.flume import FlumeUtils

sc = SparkContext("yarn", "FlumeWordCount")

# batch interval of 2 seconds
ssc = StreamingContext(sc, 2)

# open the TCP socket (IP & port) and take the event body
lines = FlumeUtils.createStream(ssc, "1.1.1.1", 12345)
lines1 = lines.map(lambda x: x[1])

# split the strings received within each two-second batch
words = lines1.flatMap(lambda line: line.split(" "))

# word count
pairs = words.map(lambda word: (word, 1))
wordcounts = pairs.reduceByKey(lambda x, y: x + y)

# write output files to HDFS in the format /tmp/flume-<date>
wordcounts.saveAsTextFiles("/tmp/flume")

# check the file contents
wordcounts.pprint()

# start Spark Streaming
ssc.start()

# wait for the computation to terminate
ssc.awaitTermination()
Example #18
# from pyspark.streaming import StreamingContext
# from pyspark import SparkContext
# from pyspark.streaming.flume import FlumeUtils

# sc = SparkContext()
# ssc = StreamingContext(sc, 10)
# flumeStream = FlumeUtils.createStream(ssc, "localhost", 6669)

# result = flumeStream.map(lambda x: json.loads(x[1]))

# result.pprint()

# ssc.start()
# ssc.awaitTermination()

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.flume import FlumeUtils

sc = SparkContext(appName="PythonStreamingFlumeWordCount")
ssc = StreamingContext(sc, 10)

kvs = FlumeUtils.createStream(ssc, "localhost", int(6669))
lines = kvs.map(lambda x: x[1])
counts = lines.flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a+b)
counts.pprint()

ssc.start()
ssc.awaitTermination()
Example #19
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.flume import FlumeUtils
import uuid
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: flume_wordcount.py <hostname> <port>", file=sys.stderr)
        sys.exit(-1)
    uid = str(uuid.uuid4())
    sc = SparkContext(appName="PythonStreamingFlumeWordCount")
    ssc = StreamingContext(sc, 1)

    hostname, port = sys.argv[1:]
    kvs = FlumeUtils.createStream(ssc, hostname, int(port))
    lines = kvs.map(lambda x: x[1])
    counts = lines.filter(lambda line: "sales" in line.lower()) \
        .map(lambda word: (uid, word)) \
        .reduceByKey(lambda a, b: a + b) \
        .saveAsTextFiles('hdfs://0.0.0.0:8020/weblogs/sales_', 'txt')
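    # saveAsTextFiles writes one HDFS directory per batch, named
    # '<prefix>-<batch time in ms>.<suffix>', e.g. .../weblogs/sales_-1609459200000.txt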

    ssc.start()
    ssc.awaitTermination()
Example #20
        print("Usage: flume_wordcount.py <hostname> <port>", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="PythonStreamingFlumeWordCount")
    ssc = StreamingContext(sc, 1)
    # Create a regular expression pattern to parse each log message's main body.
    PATTERN = r'^(\S*\s\S*\s\S*)(.*)'

    def parseLogLine(logline):
        match = re.search(PATTERN, logline)
        return (Row(
            date_time=match.group(1),
            mainbody=match.group(2),
        ), 1)
    # Pairwise count each line, then print out.
    hostname, port = sys.argv[1:]
    kvs = FlumeUtils.createStream(ssc, hostname, int(port))
    lines = kvs.map(lambda x: x[1])
    Errorcounts = (lines.map(parseLogLine)
                   .filter(lambda s: s[1] == 1)
                   .map(lambda s: s[0].mainbody)
                   .filter(lambda s: "ERROR" in s)
                   .map(lambda log: (log, 1))
                   .reduceByKey(lambda a, b: a + b))
    Warningcounts = (lines.map(parseLogLine)
                     .filter(lambda s: s[1] == 1)
                     .map(lambda s: s[0].mainbody)
                     .filter(lambda s: "WARNING" in s)
                     .map(lambda log: (log, 1))
                     .reduceByKey(lambda a, b: a + b))
    Errorcounts.pprint()
    Warningcounts.pprint()
Example #21
    exp = pattern.match(line)
    if exp:
        request = exp.groupdict()["request"]
        if request:
            requestFields = request.split()
            if (len(requestFields) > 1):
                return requestFields[1]


if __name__ == "__main__":

    sc = SparkContext(appName="StreamingFlumeLogAggregator")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, 1)

    flumeStream = FlumeUtils.createStream(
        ssc, "localhost", 9092)  # DStream Object named flumeStream

    lines = flumeStream.map(lambda x: x[1])
    urls = lines.map(extractURLRequest)

    # Reduce by URL over a 5-minute window sliding every second
    urlCounts = urls.map(lambda x: (x, 1)).reduceByKeyAndWindow(
        lambda x, y: x + y, lambda x, y: x - y, 300, 1)

    # Sort and print the results
    sortedResults = urlCounts.transform(
        lambda rdd: rdd.sortBy(lambda x: x[1], False))
    sortedResults.pprint()

    ssc.checkpoint("/home/maria_dev/checkpoint")
    ssc.start()
Example #22
import numpy as np

from pyspark.streaming.flume import FlumeUtils
from pyspark import SparkContext
from pyspark.streaming import StreamingContext


def salvaResultado(rdd):
    #a=rdd.foreach(lambda dado: dado.sortByKey())
    a=rdd.sortByKey().collect()
    np.savetxt('/home/marceloca/teste',a, fmt='%s')
    #rdd.foreach(lambda dado: np.savetxt('/home/marceloca/teste', dado, fmt='%s'))

# Create a local StreamingContext with two worker threads and a 10-second batch interval

sc = SparkContext("local[2]", "acessos")
ssc = StreamingContext(sc, 10)
stream_flume_logs = FlumeUtils.createStream(ssc, "192.168.0.13", 44444) 

# Define a dict for date conversion

cal = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr' : '04', \
'May' : '05', 'Jun' : '06', 'Jul' : '07', 'Aug' : '08', 'Sep' : '09', \
'Oct' : '10', 'Nov' : '11', 'Dec' : '12' }


# Grab each line of the log
linha_log = stream_flume_logs.map(lambda a: a[1])
#words = linha_log.flatMap(lambda line: line.split(" "))
words = linha_log.map(lambda line: line.split(" "))

# Extract the date of each access
datas = words.map(lambda data: str(cal[data[0]]) + str(data[1])) 
Example #23
from operator import add
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.flume import FlumeUtils
import sys

host_name = sys.argv[1]
port      = int(sys.argv[2])

conf = SparkConf(). \
    setAppName("streaming_department_count"). \
    setMaster("yarn-client")

sc = SparkContext(conf= conf)
ssc = StreamingContext(sc, 30)

agents = [(host_name, port)]
polling_stream = FlumeUtils.createPollingStream(ssc, agents)
messages = polling_stream.map(lambda msg: msg[1])

department_msg = messages. \
    filter(lambda msg: msg.split(" ")[6].split("/")[1] == "department")

department_names = department_msg. \
    map(lambda msg: (msg.split(" ")[6].split("/")[2], 1))

department_count = department_names. \
    reduceByKey(add)

output_prefix = sys.argv[3]
department_count.saveAsTextFiles(output_prefix)

ssc.start()
Example #24
import os

os.environ[
    'PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-flume-sink_2.11:2.1.0,org.apache.spark:spark-streaming-flume_2.11:2.1.0 pyspark-shell'

from pyspark.streaming.flume import FlumeUtils
from pyspark.streaming import StreamingContext
from pyspark import SparkContext

sc = SparkContext(appName="PythonSparkStreamingFlume")
sc.setLogLevel("ERROR")

streamingContext = StreamingContext(sc, 5)
addresses = [("IPADDRESS", 2727)]
flumeStream = FlumeUtils.createPollingStream(streamingContext, addresses)

lines = flumeStream.map(lambda x: x[1].split(","))
lines.pprint()

streamingContext.start()
streamingContext.awaitTermination()