Example #1
    def build_read_pipeline(self, pipeline, max_num_records=None):
        kafka_records = (pipeline
                         | 'ReadFromKafka' >> ReadFromKafka(
                             consumer_config={
                                 'bootstrap.servers': self.bootstrap_servers,
                                 'auto.offset.reset': 'earliest'
                             },
                             topics=[self.topic],
                             max_num_records=max_num_records,
                             expansion_service=self.expansion_service))

        if max_num_records:
            return kafka_records

        return (
            kafka_records
            | 'Windowing' >> beam.WindowInto(
                beam.window.FixedWindows(300),
                trigger=beam.transforms.trigger.AfterProcessingTime(60),
                accumulation_mode=beam.transforms.trigger.AccumulationMode.
                DISCARDING)
            | 'DecodingValue' >> beam.Map(lambda elem: int(elem[1].decode()))
            | 'CombineGlobally' >> beam.CombineGlobally(sum).without_defaults()
            | 'SetSumCounter' >> beam.Map(self.sum_counter.inc))
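The sum_counter used above is created elsewhere in the test. A minimal sketch of how such a counter could be defined with Beam's metrics API and queried after the run; the class, namespace, and metric name here are assumptions, not the original test's values:

from apache_beam.metrics.metric import Metrics, MetricsFilter

class XlangKafkaIOTest:  # hypothetical host class for build_read_pipeline
    def __init__(self):
        self.sum_counter = Metrics.counter('XlangKafkaIOTest', 'elements_sum')

# After the pipeline finishes, the counter can be read from the result:
#   result = pipeline.run()
#   result.wait_until_finish()
#   metrics = result.metrics().query(
#       MetricsFilter().with_name('elements_sum'))
#   print(metrics['counters'])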
Example #2
    def test_expand_kafka_read(self):
        # We expect to fail here because we do not have a Kafka cluster handy.
        # Nevertheless, we check that the transform is expanded by the
        # ExpansionService and that the pipeline fails during execution.
        with self.assertRaises(Exception) as ctx:
            with self.create_pipeline() as p:
                # pylint: disable=expression-not-assigned
                (p
                 | ReadFromKafka(
                     consumer_config={
                         'bootstrap.servers': 'notvalid1:7777, notvalid2:3531'
                     },
                     topics=['topic1', 'topic2'],
                     key_deserializer='org.apache.kafka.common.serialization.'
                     'ByteArrayDeserializer',
                     value_deserializer='org.apache.kafka.common.'
                     'serialization.LongDeserializer',
                     expansion_service=self.get_expansion_service()))
        self.assertTrue(
            'No resolvable bootstrap urls given in bootstrap.servers'
            in str(ctx.exception),
            'Expected to fail due to invalid bootstrap.servers, but '
            'failed due to:\n%s' % str(ctx.exception))
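The get_expansion_service helper referenced above is not shown. One plausible implementation (an assumption; the real test may return a host:port string instead, as Example #3 does) starts Beam's bundled Java expansion-service jar:

from apache_beam.transforms.external import BeamJarExpansionService

def get_expansion_service():
    # Gradle target of the Beam IO expansion-service shadow jar; Beam
    # locates (or downloads) the jar and starts it on a free port.
    return BeamJarExpansionService('sdks:java:io:expansion-service:shadowJar')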
Example #3
    def test_external_transforms(self):
        # TODO: Move the expansion service address into PipelineOptions.
        def get_expansion_service():
            return "localhost:" + str(self.expansion_port)

        with self.create_pipeline() as p:
            res = (p
                   | GenerateSequence(
                       start=1,
                       stop=10,
                       expansion_service=get_expansion_service()))

            assert_that(res, equal_to([i for i in range(1, 10)]))

        # We expect to fail here because we do not have a Kafka cluster handy.
        # Nevertheless, we check that the transform is expanded by the
        # ExpansionService and that the pipeline fails during execution.
        with self.assertRaises(Exception) as ctx:
            with self.create_pipeline() as p:
                # pylint: disable=expression-not-assigned
                (p
                 | ReadFromKafka(
                     consumer_config={
                         'bootstrap.servers': 'notvalid1:7777, notvalid2:3531'
                     },
                     topics=['topic1', 'topic2'],
                     key_deserializer='org.apache.kafka.common.serialization.'
                     'ByteArrayDeserializer',
                     value_deserializer='org.apache.kafka.common.'
                     'serialization.LongDeserializer',
                     expansion_service=get_expansion_service()))
        self.assertTrue(
            'No resolvable bootstrap urls given in bootstrap.servers'
            in str(ctx.exception),
            'Expected to fail due to invalid bootstrap.servers, but '
            'failed due to:\n%s' % str(ctx.exception))

        # We just test the expansion but do not execute.
        # pylint: disable=expression-not-assigned
        (self.create_pipeline()
         | Impulse()
         | Map(lambda input: (1, input))
         | WriteToKafka(
             producer_config={
                 'bootstrap.servers': 'localhost:9092, notvalid2:3531'
             },
             topic='topic1',
             key_serializer='org.apache.kafka.common.serialization.'
             'LongSerializer',
             value_serializer='org.apache.kafka.common.serialization.'
             'ByteArraySerializer',
             expansion_service=get_expansion_service()))
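For reference, the test snippets above rely on roughly the following imports (a sketch; the actual test module may organize them differently):

import apache_beam as beam
from apache_beam import Impulse, Map
from apache_beam.io.external.generate_sequence import GenerateSequence
from apache_beam.io.kafka import ReadFromKafka, WriteToKafka
from apache_beam.testing.util import assert_that, equal_to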
Example #4
def main(argv=None):
    options = PipelineOptions(argv)
    kafka_options = options.view_as(KafkaReadOptions)

    p = Pipeline(options=options)
    (p
     | ReadFromKafka(
         consumer_config={
             'bootstrap.servers': kafka_options.bootstrap_servers
         },
         topics=[kafka_options.topic])
     | Map(lambda x: logging.info('kafka element: %s', x)))

    p.run()
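KafkaReadOptions is not defined in this snippet. A minimal sketch of what such a PipelineOptions subclass could look like; the option names are assumptions inferred from the usage above:

from apache_beam.options.pipeline_options import PipelineOptions

class KafkaReadOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument(
            '--bootstrap_servers',
            default='localhost:9092',
            help='Kafka bootstrap servers in host:port form')
        parser.add_argument(
            '--topic', default='beam-input', help='Kafka topic to read from')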
Example #5
    def build_read_pipeline(self, pipeline, max_num_records=None):
        kafka_records = (pipeline
                         | 'ReadFromKafka' >> ReadFromKafka(
                             consumer_config={
                                 'bootstrap.servers': self.bootstrap_servers,
                                 'auto.offset.reset': 'earliest'
                             },
                             topics=[self.topic],
                             max_num_records=max_num_records,
                             expansion_service=self.expansion_service))

        if max_num_records:
            return kafka_records

        return (kafka_records
                | 'CalculateSum' >> beam.ParDo(CollectingFn())
                | 'SetSumCounter' >> beam.Map(self.sum_counter.inc))
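CollectingFn is defined elsewhere in the test. A minimal stand-in (an assumption, not the original, which may use Beam's state API) that decodes each Kafka (key, value) pair so the downstream counter can accumulate the values:

import apache_beam as beam

class CollectingFn(beam.DoFn):
    def process(self, element):
        # element is a (key, value) tuple of bytes read from Kafka.
        yield int(element[1].decode())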
Example #6
def run(bootstrap_servers, topic, pipeline_args):
    # bootstrap_servers = '123.45.67.89:9092'
    # topic = 'kafka_taxirides_realtime'
    # pipeline_args = ['--project', 'my-project',
    #                  '--runner', 'DataflowRunner',
    #                  '--temp_location', 'my-temp-location',
    #                  '--region', 'my-region',
    #                  '--num_workers', 'my-num-workers',
    #                  '--experiments', 'use_runner_v2']

    pipeline_options = PipelineOptions(pipeline_args,
                                       save_main_session=True,
                                       streaming=True)
    window_size = 15  # size of the Window in seconds.

    def log_ride(ride_bytes):
        # Converting bytes record from Kafka to a dictionary.
        import ast
        ride = ast.literal_eval(ride_bytes.decode("UTF-8"))
        logging.info(
            'Found ride at latitude %r and longitude %r with %r '
            'passengers', ride['latitude'], ride['longitude'],
            ride['passenger_count'])

    with beam.Pipeline(options=pipeline_options) as pipeline:
        _ = (
            pipeline
            | beam.io.ReadFromPubSub(
                topic='projects/pubsub-public-data/topics/taxirides-realtime'
            ).with_output_types(bytes)
            # Kafka write transforms expect KVs.
            | beam.Map(lambda x: (b'', x)).with_output_types(
                typing.Tuple[bytes, bytes])
            | beam.WindowInto(beam.window.FixedWindows(window_size))
            | WriteToKafka(
                producer_config={'bootstrap.servers': bootstrap_servers},
                topic=topic))

        _ = (pipeline
             | ReadFromKafka(
                 consumer_config={'bootstrap.servers': bootstrap_servers},
                 topics=[topic])
             | beam.FlatMap(lambda kv: log_ride(kv[1])))
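A usage sketch for the run function above; the values mirror the placeholder comments at the top of the function and are not meant to be real endpoints:

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run(
        bootstrap_servers='123.45.67.89:9092',
        topic='kafka_taxirides_realtime',
        pipeline_args=[
            '--project', 'my-project',
            '--runner', 'DataflowRunner',
            '--temp_location', 'my-temp-location',
            '--region', 'my-region',
            '--experiments', 'use_runner_v2',
        ])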
Example #7
def toPositions(x, stamp=beam.DoFn.TimestampParam):
    return (x[0].decode("utf-8"),
            tuple(map(float, x[1].decode("utf-8").split(" "))) +
            (stamp.micros / 1000., ))


if __name__ == "__main__":
    if len(sys.argv) < 4:
        usage()

    bootstrapServer, inputTopic, outputTopic = sys.argv[1:4]

    with beam.Pipeline(options=PipelineOptions(["--streaming"] +
                                               sys.argv[4:])) as p:
        (p | ReadFromKafka(
            consumer_config={'bootstrap.servers': bootstrapServer},
            topics=[inputTopic],
            timestamp_policy=ReadFromKafka.create_time_policy,
            expansion_service=get_expansion_service())
         | "ToPositions" >> beam.Map(toPositions)
         | "SportTracker" >> SportTrackerCalc()
         | "ToKv" >> beam.Map(toKv)
         | "StoreOutput" >>
         WriteToKafka(producer_config={'bootstrap.servers': bootstrapServer},
                      topic=outputTopic,
                      key_serializer=
                      "org.apache.kafka.common.serialization.StringSerializer",
                      value_serializer=
                      "org.apache.kafka.common.serialization.StringSerializer",
                      expansion_service=get_expansion_service()))
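The usage helper called above is not shown; a plausible stand-in (assumed, not the original) that prints the expected arguments and exits:

import sys

def usage():
    sys.stderr.write(
        'Usage: %s <bootstrapServer> <inputTopic> <outputTopic> '
        '[pipeline options...]\n' % sys.argv[0])
    sys.exit(1)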
Example #8
      | beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
          allowed_lateness=window.Duration.of(0))
      | "MaxLength" >> beam.combiners.Top.Of(1, key=len).without_defaults()
      | "Flatten" >> beam.FlatMap(lambda x: x))

def toKv(s: str) -> beam.typehints.KV[bytes, bytes]:
  return ("".encode("utf-8"), s.encode("utf-8"))

if __name__ == "__main__":
  if len(sys.argv) < 4:
    usage()

  bootstrapServer, inputTopic, outputTopic = sys.argv[1:4]

  with beam.Pipeline(
      options=PipelineOptions(["--streaming"] + sys.argv[4:])) as p:
    (p | ReadFromKafka(
        consumer_config={'bootstrap.servers': bootstrapServer},
        topics=[inputTopic],
        expansion_service=get_expansion_service())
     | "ToLines" >> beam.Map(
         lambda x: "%s %s" % (x[0].decode("utf-8"), x[1].decode("utf-8")))
     | "ComputeLongestWord" >> ComputeLongestWord()
     | beam.Map(toKv)
     | "StoreOutput" >> WriteToKafka(
         producer_config={'bootstrap.servers': bootstrapServer},
         topic=outputTopic,
         expansion_service=get_expansion_service()))
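ComputeLongestWord is only partially shown: the windowing / Top.Of fragment at the start of Example #8 looks like the tail of its expand method. A minimal sketch of the surrounding composite, with an assumed tokenization step:

import apache_beam as beam
from apache_beam.transforms import trigger, window

class ComputeLongestWord(beam.PTransform):
  def expand(self, lines):
    return (lines
            | "SplitWords" >> beam.FlatMap(lambda line: line.split())
            | beam.WindowInto(
                window.GlobalWindows(),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                allowed_lateness=window.Duration.of(0))
            | "MaxLength" >> beam.combiners.Top.Of(
                1, key=len).without_defaults()
            | "Flatten" >> beam.FlatMap(lambda x: x))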

Example #9
def run(bootstrap_servers, topic, with_metadata, bq_dataset, bq_table_name,
        project, pipeline_options):
    # bootstrap_servers = '123.45.67.89:9092'
    # topic = 'kafka_taxirides_realtime'
    # pipeline_args = ['--project', 'my-project',
    #                  '--runner', 'DataflowRunner',
    #                  '--temp_location', 'my-temp-location',
    #                  '--region', 'my-region',
    #                  '--num_workers', 'my-num-workers',
    #                  '--experiments', 'use_runner_v2']

    window_size = 15  # size of the Window in seconds.

    def log_ride(ride):
        if 'timestamp' in ride:
            logging.info(
                'Found ride at latitude %r and longitude %r with %r '
                'passengers at timestamp %r', ride['latitude'],
                ride['longitude'], ride['passenger_count'], ride['timestamp'])
        else:
            logging.info(
                'Found ride at latitude %r and longitude %r with %r '
                'passengers', ride['latitude'], ride['longitude'],
                ride['passenger_count'])

    def convert_kafka_record_to_dictionary(record):
        # the records have 'value' attribute when --with_metadata is given
        if hasattr(record, 'value'):
            ride_bytes = record.value
        elif isinstance(record, tuple):
            ride_bytes = record[1]
        else:
            raise RuntimeError('unknown record type: %s' % type(record))
        # Converting bytes record from Kafka to a dictionary.
        import ast
        ride = ast.literal_eval(ride_bytes.decode("UTF-8"))
        output = {
            key: ride[key]
            for key in ['latitude', 'longitude', 'passenger_count']
        }
        if hasattr(record, 'timestamp'):
            # timestamp is read from Kafka metadata
            output['timestamp'] = record.timestamp
        return output

    with beam.Pipeline(options=pipeline_options) as pipeline:
        _ = (
            pipeline
            | beam.io.ReadFromPubSub(
                topic='projects/pubsub-public-data/topics/taxirides-realtime'
            ).with_output_types(bytes)
            # Kafka write transforms expect KVs.
            | beam.Map(lambda x: (b'', x)).with_output_types(
                typing.Tuple[bytes, bytes])
            | beam.WindowInto(beam.window.FixedWindows(window_size))
            | WriteToKafka(
                producer_config={'bootstrap.servers': bootstrap_servers},
                topic=topic))

        ride_col = (
            pipeline
            | ReadFromKafka(
                consumer_config={'bootstrap.servers': bootstrap_servers},
                topics=[topic],
                with_metadata=with_metadata)
            | beam.Map(convert_kafka_record_to_dictionary))

        if bq_dataset:
            schema = 'latitude:STRING,longitude:STRING,passenger_count:INTEGER'
            if with_metadata:
                schema += ',timestamp:STRING'
            _ = (ride_col
                 | beam.io.WriteToBigQuery(bq_table_name, bq_dataset, project,
                                           schema))
        else:
            _ = ride_col | beam.FlatMap(lambda ride: log_ride(ride))
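A sketch of an argparse entry point that could wire up run; the flag names are assumptions mirroring the function parameters:

if __name__ == '__main__':
    import argparse

    from apache_beam.options.pipeline_options import (GoogleCloudOptions,
                                                      PipelineOptions)

    logging.getLogger().setLevel(logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('--bootstrap_servers', required=True)
    parser.add_argument('--topic', default='kafka_taxirides_realtime')
    parser.add_argument('--with_metadata', action='store_true')
    parser.add_argument('--bq_dataset', default='')
    parser.add_argument('--bq_table_name', default='ride_info')
    known_args, pipeline_args = parser.parse_known_args()
    pipeline_options = PipelineOptions(
        pipeline_args, save_main_session=True, streaming=True)
    project = pipeline_options.view_as(GoogleCloudOptions).project
    run(
        known_args.bootstrap_servers,
        known_args.topic,
        known_args.with_metadata,
        known_args.bq_dataset,
        known_args.bq_table_name,
        project,
        pipeline_options)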