def build_read_pipeline(self, pipeline, max_num_records=None):
  kafka_records = (
      pipeline
      | 'ReadFromKafka' >> ReadFromKafka(
          consumer_config={
              'bootstrap.servers': self.bootstrap_servers,
              'auto.offset.reset': 'earliest'
          },
          topics=[self.topic],
          max_num_records=max_num_records,
          expansion_service=self.expansion_service))

  if max_num_records:
    return kafka_records

  return (
      kafka_records
      | 'Windowing' >> beam.WindowInto(
          beam.window.FixedWindows(300),
          trigger=beam.transforms.trigger.AfterProcessingTime(60),
          accumulation_mode=beam.transforms.trigger.AccumulationMode.DISCARDING)
      | 'DecodingValue' >> beam.Map(lambda elem: int(elem[1].decode()))
      | 'CombineGlobally' >> beam.CombineGlobally(sum).without_defaults()
      | 'SetSumCounter' >> beam.Map(self.sum_counter.inc))

def test_expand_kafka_read(self):
  # We expect to fail here because we do not have a Kafka cluster handy.
  # Nevertheless, we check that the transform is expanded by the
  # ExpansionService and that the pipeline fails during execution.
  with self.assertRaises(Exception) as ctx:
    with self.create_pipeline() as p:
      # pylint: disable=expression-not-assigned
      (
          p
          | ReadFromKafka(
              consumer_config={
                  'bootstrap.servers': 'notvalid1:7777, notvalid2:3531'
              },
              topics=['topic1', 'topic2'],
              key_deserializer='org.apache.kafka.'
              'common.serialization.'
              'ByteArrayDeserializer',
              value_deserializer='org.apache.kafka.'
              'common.serialization.'
              'LongDeserializer',
              expansion_service=self.get_expansion_service()))
  self.assertTrue(
      'No resolvable bootstrap urls given in bootstrap.servers' in str(
          ctx.exception),
      'Expected to fail due to invalid bootstrap.servers, but '
      'failed due to:\n%s' % str(ctx.exception))

def test_external_transforms(self):
  # TODO: Move the expansion service address into PipelineOptions.
  def get_expansion_service():
    return "localhost:" + str(self.expansion_port)

  with self.create_pipeline() as p:
    res = (
        p
        | GenerateSequence(
            start=1, stop=10, expansion_service=get_expansion_service()))

    assert_that(res, equal_to([i for i in range(1, 10)]))

  # We expect to fail here because we do not have a Kafka cluster handy.
  # Nevertheless, we check that the transform is expanded by the
  # ExpansionService and that the pipeline fails during execution.
  with self.assertRaises(Exception) as ctx:
    with self.create_pipeline() as p:
      # pylint: disable=expression-not-assigned
      (
          p
          | ReadFromKafka(
              consumer_config={
                  'bootstrap.servers': 'notvalid1:7777, notvalid2:3531'
              },
              topics=['topic1', 'topic2'],
              key_deserializer='org.apache.kafka.'
              'common.serialization.'
              'ByteArrayDeserializer',
              value_deserializer='org.apache.kafka.'
              'common.serialization.'
              'LongDeserializer',
              expansion_service=get_expansion_service()))
  self.assertTrue(
      'No resolvable bootstrap urls given in bootstrap.servers' in str(
          ctx.exception),
      'Expected to fail due to invalid bootstrap.servers, but '
      'failed due to:\n%s' % str(ctx.exception))

  # We just test the expansion but do not execute.
  # pylint: disable=expression-not-assigned
  (
      self.create_pipeline()
      | Impulse()
      | Map(lambda input: (1, input))
      | WriteToKafka(
          producer_config={
              'bootstrap.servers': 'localhost:9092, notvalid2:3531'
          },
          topic='topic1',
          key_serializer='org.apache.kafka.'
          'common.serialization.'
          'LongSerializer',
          value_serializer='org.apache.kafka.'
          'common.serialization.'
          'ByteArraySerializer',
          expansion_service=get_expansion_service()))

def main(argv=None):
  options = PipelineOptions(argv)
  kafka_options = options.view_as(KafkaReadOptions)
  p = Pipeline(options=options)
  (
      p
      | ReadFromKafka(
          consumer_config={
              'bootstrap.servers': kafka_options.bootstrap_servers
          },
          topics=[kafka_options.topic])
      | Map(lambda x: logging.info('kafka element: %s', x)))
  p.run()

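# KafkaReadOptions is referenced above but not defined in this snippet. A
# minimal sketch, assuming it is a plain PipelineOptions subclass that only
# exposes --bootstrap_servers and --topic; the flag names and defaults here
# are illustrative, not the actual definition.
class KafkaReadOptions(PipelineOptions):
  @classmethod
  def _add_argparse_args(cls, parser):
    parser.add_argument(
        '--bootstrap_servers',
        default='localhost:9092',
        help='Kafka bootstrap servers in host:port form.')
    parser.add_argument(
        '--topic', default='beam', help='Kafka topic to read from.')

# Example (hypothetical) invocation:
#   python read_from_kafka.py --bootstrap_servers=localhost:9092 --topic=beam
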
def build_read_pipeline(self, pipeline, max_num_records=None):
  kafka_records = (
      pipeline
      | 'ReadFromKafka' >> ReadFromKafka(
          consumer_config={
              'bootstrap.servers': self.bootstrap_servers,
              'auto.offset.reset': 'earliest'
          },
          topics=[self.topic],
          max_num_records=max_num_records,
          expansion_service=self.expansion_service))

  if max_num_records:
    return kafka_records

  return (
      kafka_records
      | 'CalculateSum' >> beam.ParDo(CollectingFn())
      | 'SetSumCounter' >> beam.Map(self.sum_counter.inc))

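# CollectingFn is used above but not defined here. A minimal sketch, assuming
# it only needs to turn each Kafka (key, value) record into the integer its
# value encodes so the downstream 'SetSumCounter' step can accumulate it; the
# real DoFn may instead buffer elements and emit partial sums.
class CollectingFn(beam.DoFn):
  def process(self, element):
    # element is the (key_bytes, value_bytes) tuple produced by ReadFromKafka.
    yield int(element[1].decode())
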
def run(bootstrap_servers, topic, pipeline_args):
  # bootstrap_servers = '123.45.67.89:123:9092'
  # topic = 'kafka_taxirides_realtime'
  # pipeline_args = ['--project', 'my-project',
  #                  '--runner', 'DataflowRunner',
  #                  '--temp_location', 'my-temp-location',
  #                  '--region', 'my-region',
  #                  '--num_workers', 'my-num-workers',
  #                  '--experiments', 'use_runner_v2']
  pipeline_options = PipelineOptions(
      pipeline_args, save_main_session=True, streaming=True)
  window_size = 15  # size of the window in seconds.

  def log_ride(ride_bytes):
    # Convert the bytes record from Kafka to a dictionary.
    import ast
    ride = ast.literal_eval(ride_bytes.decode("UTF-8"))
    logging.info(
        'Found ride at latitude %r and longitude %r with %r passengers',
        ride['latitude'],
        ride['longitude'],
        ride['passenger_count'])

  with beam.Pipeline(options=pipeline_options) as pipeline:
    _ = (
        pipeline
        | beam.io.ReadFromPubSub(
            topic='projects/pubsub-public-data/topics/taxirides-realtime').
        with_output_types(bytes)
        # Kafka write transforms expect KVs.
        | beam.Map(lambda x: (b'', x)).with_output_types(
            typing.Tuple[bytes, bytes])
        | beam.WindowInto(beam.window.FixedWindows(window_size))
        | WriteToKafka(
            producer_config={'bootstrap.servers': bootstrap_servers},
            topic=topic))

    _ = (
        pipeline
        | ReadFromKafka(
            consumer_config={'bootstrap.servers': bootstrap_servers},
            topics=[topic])
        | beam.FlatMap(lambda kv: log_ride(kv[1])))

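# A hedged sketch of a command-line entry point for run() above; the flag
# names are assumptions chosen to match the function signature, and any
# unrecognized flags are forwarded to PipelineOptions as pipeline_args.
if __name__ == '__main__':
  import argparse
  logging.getLogger().setLevel(logging.INFO)
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--bootstrap_servers',
      required=True,
      help='Kafka bootstrap servers in host:port form.')
  parser.add_argument(
      '--topic',
      default='kafka_taxirides_realtime',
      help='Kafka topic to write to and read back from.')
  known_args, pipeline_args = parser.parse_known_args()
  run(known_args.bootstrap_servers, known_args.topic, pipeline_args)
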
def toPositions(x, stamp=beam.DoFn.TimestampParam):
  return (
      x[0].decode("utf-8"),
      tuple(map(float, x[1].decode("utf-8").split(" "))) +
      (stamp.micros / 1000., ))


if __name__ == "__main__":
  if len(sys.argv) < 4:
    usage()
  bootstrapServer, inputTopic, outputTopic = sys.argv[1:4]
  with beam.Pipeline(options=PipelineOptions(["--streaming"] +
                                             sys.argv[4:])) as p:
    (
        p
        | ReadFromKafka(
            consumer_config={'bootstrap.servers': bootstrapServer},
            topics=[inputTopic],
            timestamp_policy=ReadFromKafka.create_time_policy,
            expansion_service=get_expansion_service())
        | "ToPositions" >> beam.Map(toPositions)
        | "SportTracker" >> SportTrackerCalc()
        | "ToKv" >> beam.Map(toKv)
        | "StoreOutput" >> WriteToKafka(
            producer_config={'bootstrap.servers': bootstrapServer},
            topic=outputTopic,
            key_serializer="org.apache.kafka.common.serialization."
            "StringSerializer",
            value_serializer="org.apache.kafka.common.serialization."
            "StringSerializer",
            expansion_service=get_expansion_service()))

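# The helpers used above (get_expansion_service, usage, toKv, SportTrackerCalc)
# are not shown in this snippet. A minimal sketch of get_expansion_service(),
# assuming an ExpansionService with the Kafka IO transforms is already running
# locally (the port is illustrative); in the actual module this helper would be
# defined before the __main__ block.
def get_expansion_service():
  return "localhost:8097"
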
      | beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
          allowed_lateness=window.Duration.of(0))
      | "MaxLength" >> beam.combiners.Top.Of(1, key=len).without_defaults()
      | "Flatten" >> beam.FlatMap(lambda x: x))


def toKv(s: str) -> beam.typehints.KV[bytes, bytes]:
  return ("".encode("utf-8"), s.encode("utf-8"))


if __name__ == "__main__":
  if len(sys.argv) < 4:
    usage()
  bootstrapServer, inputTopic, outputTopic = sys.argv[1:4]
  with beam.Pipeline(options=PipelineOptions(["--streaming"] +
                                             sys.argv[4:])) as p:
    (
        p
        | ReadFromKafka(
            consumer_config={'bootstrap.servers': bootstrapServer},
            topics=[inputTopic],
            expansion_service=get_expansion_service())
        | "ToLines" >> beam.Map(
            lambda x: "%s %s" % (x[0].decode("utf-8"), x[1].decode("utf-8")))
        | "ComputeLongestWord" >> ComputeLongestWord()
        | beam.Map(toKv)
        | "StoreOutput" >> WriteToKafka(
            producer_config={'bootstrap.servers': bootstrapServer},
            topic=outputTopic,
            expansion_service=get_expansion_service()))

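# usage() is called above but not shown. A minimal sketch, assuming it only
# reports the expected positional arguments and exits; in the actual module it
# would be defined before the __main__ block.
def usage():
  sys.stderr.write(
      "Usage: %s <bootstrapServer> <inputTopic> <outputTopic> "
      "[pipeline options]\n" % sys.argv[0])
  sys.exit(1)
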
def run(
    bootstrap_servers,
    topic,
    with_metadata,
    bq_dataset,
    bq_table_name,
    project,
    pipeline_options):
  # bootstrap_servers = '123.45.67.89:123:9092'
  # topic = 'kafka_taxirides_realtime'
  # pipeline_args = ['--project', 'my-project',
  #                  '--runner', 'DataflowRunner',
  #                  '--temp_location', 'my-temp-location',
  #                  '--region', 'my-region',
  #                  '--num_workers', 'my-num-workers',
  #                  '--experiments', 'use_runner_v2']
  window_size = 15  # size of the window in seconds.

  def log_ride(ride):
    if 'timestamp' in ride:
      logging.info(
          'Found ride at latitude %r and longitude %r with %r '
          'passengers at timestamp %r',
          ride['latitude'],
          ride['longitude'],
          ride['passenger_count'],
          ride['timestamp'])
    else:
      logging.info(
          'Found ride at latitude %r and longitude %r with %r passengers',
          ride['latitude'],
          ride['longitude'],
          ride['passenger_count'])

  def convert_kafka_record_to_dictionary(record):
    # The records have a 'value' attribute when --with_metadata is given.
    if hasattr(record, 'value'):
      ride_bytes = record.value
    elif isinstance(record, tuple):
      ride_bytes = record[1]
    else:
      raise RuntimeError('unknown record type: %s' % type(record))
    # Convert the bytes record from Kafka to a dictionary.
    import ast
    ride = ast.literal_eval(ride_bytes.decode("UTF-8"))
    output = {
        key: ride[key]
        for key in ['latitude', 'longitude', 'passenger_count']
    }
    if hasattr(record, 'timestamp'):
      # The timestamp is read from Kafka metadata.
      output['timestamp'] = record.timestamp
    return output

  with beam.Pipeline(options=pipeline_options) as pipeline:
    _ = (
        pipeline
        | beam.io.ReadFromPubSub(
            topic='projects/pubsub-public-data/topics/taxirides-realtime').
        with_output_types(bytes)
        # Kafka write transforms expect KVs.
        | beam.Map(lambda x: (b'', x)).with_output_types(
            typing.Tuple[bytes, bytes])
        | beam.WindowInto(beam.window.FixedWindows(window_size))
        | WriteToKafka(
            producer_config={'bootstrap.servers': bootstrap_servers},
            topic=topic))

    ride_col = (
        pipeline
        | ReadFromKafka(
            consumer_config={'bootstrap.servers': bootstrap_servers},
            topics=[topic],
            with_metadata=with_metadata)
        | beam.Map(lambda record: convert_kafka_record_to_dictionary(record)))

    if bq_dataset:
      schema = 'latitude:STRING,longitude:STRING,passenger_count:INTEGER'
      if with_metadata:
        schema += ',timestamp:STRING'
      _ = (
          ride_col
          | beam.io.WriteToBigQuery(
              bq_table_name, bq_dataset, project, schema))
    else:
      _ = ride_col | beam.FlatMap(lambda ride: log_ride(ride))

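# A hedged sketch of a command-line entry point for run() above; the flag
# names mirror the function parameters and are assumptions, and the GCP
# project is taken from the standard GoogleCloudOptions.
if __name__ == '__main__':
  import argparse
  from apache_beam.options.pipeline_options import GoogleCloudOptions
  logging.getLogger().setLevel(logging.INFO)
  parser = argparse.ArgumentParser()
  parser.add_argument('--bootstrap_servers', required=True)
  parser.add_argument('--topic', default='kafka_taxirides_realtime')
  parser.add_argument(
      '--with_metadata',
      default=False,
      action='store_true',
      help='Also read Kafka metadata (e.g. timestamps) with each record.')
  parser.add_argument(
      '--bq_dataset',
      default='',
      help='BigQuery dataset for output; rides are only logged if empty.')
  parser.add_argument('--bq_table_name', default='kafka_taxi')
  known_args, pipeline_args = parser.parse_known_args()
  pipeline_options = PipelineOptions(
      pipeline_args, save_main_session=True, streaming=True)
  project = pipeline_options.view_as(GoogleCloudOptions).project
  run(
      known_args.bootstrap_servers,
      known_args.topic,
      known_args.with_metadata,
      known_args.bq_dataset,
      known_args.bq_table_name,
      project,
      pipeline_options)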