from apache_beam.testing.test_pipeline import TestPipeline
from beam_nuggets.io import kafkaio


def test_ConsumeFromKafka(self):
    kafka_config = {
        "topic": "test_stream",
        "bootstrap_servers": "localhost:9092",
        "group_id": "test_group"
    }
    # Create a streaming Kafka consumer inside a test pipeline.
    with TestPipeline() as p:
        p | "Consume kafka messages" >> kafkaio.KafkaConsume(kafka_config)
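# Companion sketch (assumed, not from the original suite): publish a record to
# the same topic with beam_nuggets' KafkaProduce so the consume test above has
# data to read. The method name and the sample record are hypothetical.
import apache_beam as beam


def test_ProduceToKafka(self):
    with TestPipeline() as p:
        (p
         | "Create record" >> beam.Create([("dev_1", "test message")])
         | "Produce kafka messages" >> kafkaio.KafkaProduce(
             topic="test_stream", servers="localhost:9092"))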
import datetime

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from beam_nuggets.io import kafkaio


def run(bootstrap_servers, topic, project, dataset, table, pipeline_args=None):
    kafka_config = {
        "topic": topic,
        "bootstrap_servers": bootstrap_servers,
        "group_id": "debezium_consumer_group"
    }
    # Map Debezium change events onto BigQuery columns: use the "after" image
    # for inserts/updates and the "before" image for deletes.
    mapping_schema = {
        "sku": lambda data: data.payload.after.sku if data.payload.op != 'd' else data.payload.before.sku,
        "name": lambda data: data.payload.after.name if data.payload.op != 'd' else data.payload.before.name,
        "price": lambda data: data.payload.after.price if data.payload.op != 'd' else data.payload.before.price,
        "quantity": lambda data: data.payload.after.available if data.payload.op != 'd' else data.payload.before.available,
        "timestamp": lambda data: datetime.datetime.utcfromtimestamp(data.payload.ts_ms / 1000).isoformat(),
        "deleted": lambda data: data.payload.op == 'd'
    }
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=pipeline_options)
    _ = (p
         | 'Reading messages' >> kafkaio.KafkaConsume(kafka_config)
         | 'Preparing data' >> beam.ParDo(TransformSchema(mapping_schema))
         | 'Writing data to BigQuery' >> beam.ParDo(
             WriteToBigQuery(dataset, project, table)))
    result = p.run()
    result.wait_until_finish()
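# The pipeline above references TransformSchema and WriteToBigQuery without
# defining them. Below is a minimal sketch of what they could look like; the
# element shape, the JSON parsing, and the google-cloud-bigquery dependency
# are assumptions, not the original implementation.
import json
from types import SimpleNamespace


class TransformSchema(beam.DoFn):
    """Applies the per-column lambdas of a mapping schema to each record."""

    def __init__(self, schema):
        self.schema = schema

    def process(self, element):
        # KafkaConsume emits (key, value) tuples; the value is assumed to be
        # the Debezium JSON envelope. Parse it into attribute-style access so
        # expressions like data.payload.after.sku work.
        _, message = element
        data = json.loads(message, object_hook=lambda d: SimpleNamespace(**d))
        yield {column: extract(data) for column, extract in self.schema.items()}


class WriteToBigQuery(beam.DoFn):
    """Streams rows into BigQuery (assumed custom DoFn, not Beam's built-in)."""

    def __init__(self, dataset, project, table):
        self.dataset = dataset
        self.project = project
        self.table = table

    def start_bundle(self):
        from google.cloud import bigquery  # assumed dependency
        self.client = bigquery.Client(project=self.project)

    def process(self, row):
        table_id = f"{self.project}.{self.dataset}.{self.table}"
        errors = self.client.insert_rows_json(table_id, [row])
        if errors:
            raise RuntimeError(f"BigQuery insert failed: {errors}")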
import time

import apache_beam as beam
import happybase as hb
from apache_beam.options.pipeline_options import PipelineOptions
from beam_nuggets.io import kafkaio
from kafka import KafkaConsumer


# Module-level config (hbHost, hbPort, hbTableName, hbFamilies, kafkaTopic,
# kafkaServers, kafkaGrId) and the mail helpers are defined elsewhere.
def main():
    #TODO: fix this wait
    # Sleep until HBase and Kafka are up.
    time.sleep(60)
    useBeam = True
    #TODO: check first if the Kafka topic exists
    #TODO: check the Kafka connection
    #TODO: include the Kafka group ID
    if useBeam:
        #TODO: check the HBase connection
        # Check whether the table exists; create it otherwise.
        conn = hb.Connection(hbHost, hbPort)
        if hbTableName.encode('utf-8') not in conn.tables():
            conn.create_table(hbTableName, hbFamilies)
        conn.close()
        # Define the Kafka configuration.
        kafka_config = {
            "topic": kafkaTopic,
            "bootstrap_servers": kafkaServers
        }  # "group_id": kafkaGrId
        # Streaming pipelines.
        with beam.Pipeline(options=PipelineOptions()) as p:
            # Three pipelines: metadata & subject, content & label, word count.
            inputTuples = p | "Reading messages from Kafka" >> kafkaio.KafkaConsume(
                kafka_config)
            content = (inputTuples
                       | "Extract content" >> beam.Map(extract_mailContent))
            #TODO: filter empty content mails?
            #| "Filter empty content" >> beam.Filter(is_ContentNotEmpty)
            classifiedContent = content | "Classify as SPAM/HAM and store" >> beam.Map(
                classifyMail)
            wordC = (content
                     | "Clean content" >> beam.Map(cleanContent)
                     #TODO: word count exploiting Beam windowing
                     # (see the windowed sketch after this function)
                     #| 'Fixed-size windows' >> beam.WindowInto()
                     #| "Word" >> .....
                     #| "Count" >> beam.combiners.Count.PerElement()
                     | "Count and store" >> beam.Map(countWordsContent))
            metadata = (inputTuples
                        | "Extract metadata" >> beam.Map(extract_mailMetadata)
                        | "Extract subject and store" >> beam.Map(extract_subjectMetadata))
            #| 'Writing to stdout' >> beam.Map(print)
    else:
        # Create a plain Kafka consumer (no Beam).
        consumer = KafkaConsumer(
            kafkaTopic,
            bootstrap_servers=kafkaServers)  # group_id=kafkaGrId
        # Receive and store the Kafka data.
        dataCollected = []
        for message in consumer:
            dataCollected.append((message.key, message.value))
            print(message.key)
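# A sketch of the windowed word count the TODOs inside main() point at. The
# 60-second window and the assumption that cleanContent emits one plain-text
# string per mail are mine, not the original author's.
from apache_beam import window


def count_words_windowed(content):
    return (content
            | "Fixed-size windows" >> beam.WindowInto(window.FixedWindows(60))
            | "Words" >> beam.FlatMap(lambda text: text.split())
            | "Count" >> beam.combiners.Count.PerElement())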
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from beam_nuggets.io import kafkaio

consumer_config = {
    "topic": "notifications",
    "bootstrap_servers": "localhost:9092",
    "group_id": "notification_consumer_group"
}

with beam.Pipeline(options=PipelineOptions()) as p:
    notifications = p | "Reading messages from Kafka" >> kafkaio.KafkaConsume(
        consumer_config=consumer_config,
        value_decoder=bytes.decode,  # optional
    )
    notifications | 'Writing to stdout' >> beam.Map(print)
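# A companion producer for the consumer above (a sketch; the sample record is
# hypothetical). beam_nuggets' KafkaProduce takes (key, value) tuples.
with beam.Pipeline(options=PipelineOptions()) as p:
    (p
     | "Creating a notification" >> beam.Create(
         [("dev_1", '{"device": "0001", "status": "healthy"}')])
     | "Pushing messages to Kafka" >> kafkaio.KafkaProduce(
         topic="notifications", servers="localhost:9092"))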
# Process
from __future__ import print_function

import os

import apache_beam as beam
from apache_beam import window
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from apache_beam.transforms.trigger import (
    AfterWatermark, AfterProcessingTime, AccumulationMode, AfterCount)
from beam_nuggets.io import kafkaio

#with beam.Pipeline(options=PipelineOptions()) as p:
#    notifications = (p
#                     | "Creating data" >> beam.Create([('dev_1', '{"device": "0001", "status": "healthy"}')])
#                     | "Pushing messages to Kafka" >> kafkaio.KafkaProduce(
#                         topic='ORIG',
#                         servers="localhost:9092"))
#    notifications | 'Writing to stdout' >> beam.Map(print)

kafka_topic = "ORIG"
kafka_config = {
    "topic": kafka_topic,
    "bootstrap_servers": "localhost:9092",
    "group_id": "notification_consumer_group"
}

with beam.Pipeline(options=PipelineOptions()) as p:
    notifications = p | "Reading messages from Kafka" >> kafkaio.KafkaConsume(
        kafka_config)
    notifications | 'Writing to stdout' >> beam.Map(print)
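# The window/trigger imports above are unused in this snippet. One way they
# could be wired in (window size, trigger timing, and per-key counting are
# assumptions, not part of the original):
with beam.Pipeline(options=PipelineOptions()) as p:
    (p
     | "Reading messages from Kafka" >> kafkaio.KafkaConsume(kafka_config)
     | "Fixed windows" >> beam.WindowInto(
         window.FixedWindows(10),
         trigger=AfterWatermark(early=AfterProcessingTime(5)),
         accumulation_mode=AccumulationMode.DISCARDING)
     | "Count per key" >> beam.combiners.Count.PerKey()
     | "Writing to stdout" >> beam.Map(print))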