Example #1
from apache_beam.testing.test_pipeline import TestPipeline
from beam_nuggets.io import kafkaio


def test_ConsumeFromKafka(self):
    kafka_config = {"topic": "test_stream",
                    "bootstrap_servers": "localhost:9092",
                    "group_id": "test_group"}
    # Create a streaming Kafka consumer
    with TestPipeline() as p:
        p | "Consume kafka messages" >> kafkaio.KafkaConsume(kafka_config)
Example #2
def run(bootstrap_servers, topic, project, dataset, table, pipeline_args=None):
    # pipeline_args: extra command-line options forwarded to PipelineOptions below.
    kafka_config = {
        "topic": topic,
        "bootstrap_servers": bootstrap_servers,
        "group_id": "debezium_consumer_group"
    }

    # Map each output column to an extractor over the Debezium change event:
    # for delete operations ('d') the row data lives in payload.before,
    # otherwise in payload.after.
    mapping_schema = {
        "sku": lambda data: (data.payload.after.sku
                             if data.payload.op != 'd' else data.payload.before.sku),
        "name": lambda data: (data.payload.after.name
                              if data.payload.op != 'd' else data.payload.before.name),
        "price": lambda data: (data.payload.after.price
                               if data.payload.op != 'd' else data.payload.before.price),
        "quantity": lambda data: (data.payload.after.available
                                  if data.payload.op != 'd' else data.payload.before.available),
        "timestamp": lambda data: datetime.datetime.utcfromtimestamp(
            data.payload.ts_ms / 1000).isoformat(),
        "deleted": lambda data: data.payload.op == 'd'
    }

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(StandardOptions).streaming = True

    p = beam.Pipeline(options=pipeline_options)

    _ = (p | 'Reading messages' >> kafkaio.KafkaConsume(kafka_config)
         | 'Preparing data' >> beam.ParDo(TransformSchema(mapping_schema))
         | 'Writing data to BigQuery' >> beam.ParDo(
             WriteToBigQuery(dataset, project, table)))
    result = p.run()
    result.wait_until_finish()
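The TransformSchema and WriteToBigQuery DoFns used above are defined elsewhere in the source project and are not shown here. A minimal sketch of what a TransformSchema-style DoFn could look like, assuming the Kafka values are JSON-encoded Debezium change events and that the attribute access used in mapping_schema (e.g. data.payload.after.sku) is obtained by parsing into SimpleNamespace objects; both are assumptions, not the original implementation:

import json
from types import SimpleNamespace

import apache_beam as beam


class TransformSchema(beam.DoFn):
    """Sketch only: applies a column -> extractor mapping to each change event."""

    def __init__(self, schema):
        self._schema = schema

    def process(self, element):
        # KafkaConsume emits (key, value) tuples; the value is assumed to be a
        # JSON-encoded Debezium event.
        _, value = element
        event = json.loads(value, object_hook=lambda d: SimpleNamespace(**d))
        yield {column: extract(event) for column, extract in self._schema.items()}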
Example #3
def main():

    # TODO: Fix this wait
    # Sleep until HBase and Kafka are up
    time.sleep(60)

    useBeam = True

    # TODO: check first whether the Kafka topic exists
    # TODO: check the Kafka connection
    # TODO: include the Kafka group ID
    if useBeam:
        # TODO: check the HBase connection
        # Check whether the table exists and create it otherwise
        conn = hb.Connection(hbHost, hbPort)
        if hbTableName.encode('utf-8') not in conn.tables():
            conn.create_table(hbTableName, hbFamilies)
        conn.close()

        # Define the Kafka configuration
        kafka_config = {
            "topic": kafkaTopic,
            "bootstrap_servers": kafkaServers
        }  # Optionally: "group_id": kafkaGrId

        # Streaming pipeline
        with beam.Pipeline(options=PipelineOptions()) as p:
            # Three branches: metadata & subject, content & label, word count

            inputTuples = p | "Reading messages from Kafka" >> kafkaio.KafkaConsume(
                kafka_config)

            content = (inputTuples
                       | "Extract content" >> beam.Map(extract_mailContent))
            # TODO: filter mails with empty content, e.g.
            #   | "Filter empty content" >> beam.Filter(is_ContentNotEmpty)

            classifiedContent = content | "Classify as SPAM/HAM and store" >> beam.Map(
                classifyMail)

            wordC = (
                content | "Clean content" >> beam.Map(cleanContent)
                # TODO: word count using Beam windowing (see the sketch after this example)
                #| 'Fixed-size windows' >> beam.WindowInto()
                #| "Word" >> .....
                #| "Count" >> beam.combiners.Count.PerElement()
                | "Count and store" >> beam.Map(countWordsContent))

            metadata = (inputTuples
                        | "Extract metadata" >> beam.Map(extract_mailMetadata)
                        | "Extract subject and store" >>
                        beam.Map(extract_subjectMetadata))

            #| 'Writing to stdout' >> beam.Map(print))

    else:
        # Create a Kafka consumer
        consumer = KafkaConsumer(
            kafkaTopic, bootstrap_servers=kafkaServers)  # group_id=kafkaGrId

        # Receive and store Kafka data
        dataCollected = []
        for message in consumer:
            dataCollected.append((message.key, message.value))
            print(message.key)
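The word-count branch in the pipeline above is left as a TODO. A minimal sketch of how it could be completed with Beam windowing, assuming the window module is imported at module level (from apache_beam import window) and that cleanContent returns a plain text string per mail; the helper functions themselves are not shown in this example:

            # Sketch only: would replace the wordC branch inside the streaming pipeline.
            wordC = (
                content
                | "Clean content" >> beam.Map(cleanContent)
                | "Split into words" >> beam.FlatMap(lambda text: text.split())
                | "Fixed-size windows" >> beam.WindowInto(window.FixedWindows(60))
                | "Count per word" >> beam.combiners.Count.PerElement()
                # Downstream, countWordsContent would now receive (word, count)
                # pairs per 60-second window instead of whole documents.
                | "Count and store" >> beam.Map(countWordsContent))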
Example #4
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

from beam_nuggets.io import kafkaio

consumer_config = {"topic": "notifications",
                   "bootstrap_servers": "localhost:9092",
                   "group_id": "notification_consumer_group"}

with beam.Pipeline(options=PipelineOptions()) as p:
    notifications = p | "Reading messages from Kafka" >> kafkaio.KafkaConsume(
        consumer_config=consumer_config,
        value_decoder=bytes.decode,  # optional
    )
    notifications | 'Writing to stdout' >> beam.Map(print)
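For reference, the producer side of this consumer can be sketched with beam_nuggets' KafkaProduce transform (the same usage appears commented out in Example #5); the topic matches the consumer above, while the key and payload below are purely illustrative:

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

from beam_nuggets.io import kafkaio

with beam.Pipeline(options=PipelineOptions()) as p:
    (p
     | "Creating a test notification" >> beam.Create(
         [("notification_1", '{"user": "0001", "event": "signup"}')])  # illustrative record
     | "Pushing messages to Kafka" >> kafkaio.KafkaProduce(
         topic="notifications",
         servers="localhost:9092"))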
Example #5
# Process
from __future__ import print_function
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
import os
from apache_beam import window
from apache_beam.transforms.trigger import AfterWatermark, AfterProcessingTime, AccumulationMode, AfterCount
from beam_nuggets.io import kafkaio

#with beam.Pipeline(options=PipelineOptions()) as p:
#    notifications = (p
#                     | "Creating data" >> beam.Create([('dev_1', '{"device": "0001", status": "healthy"}')])
#                     | "Pushing messages to Kafka" >> kafkaio.KafkaProduce(
#                                                                            topic='ORIG',
#                                                                            servers="localhost:9092"
#                                                                        )
#                    )
#    notifications | 'Writing to stdout' >> beam.Map(print)

kafka_topic = "ORIG"
kafka_config = {
    "topic": kafka_topic,
    "bootstrap_servers": "localhost:9092",
    "group_id": "notification_consumer_group"
}

with beam.Pipeline(options=PipelineOptions()) as p:
    notifications = p | "Reading messages from Kafka" >> kafkaio.KafkaConsume(
        kafka_config)
    notifications | 'Writing to stdout' >> beam.Map(print)
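The window and trigger utilities imported above are not used in this example. A minimal sketch of how the consumed stream could instead be windowed with an early-firing trigger, assuming the Kafka message keys are meaningful grouping keys (an illustrative assumption):

with beam.Pipeline(options=PipelineOptions()) as p:
    notifications = p | "Reading messages from Kafka" >> kafkaio.KafkaConsume(
        kafka_config)
    counts = (
        notifications
        | "Take message keys" >> beam.Map(lambda kv: kv[0])
        | "10s windows with early firings" >> beam.WindowInto(
            window.FixedWindows(10),
            trigger=AfterWatermark(early=AfterProcessingTime(5)),
            accumulation_mode=AccumulationMode.DISCARDING)
        | "Count per key" >> beam.combiners.Count.PerElement())
    counts | 'Writing counts to stdout' >> beam.Map(print)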