Example #1
        def test_external_transforms(self):
            # TODO: Move the expansion service address into PipelineOptions.
            def get_expansion_service():
                return "localhost:" + str(self.expansion_port)

            with self.create_pipeline() as p:
                res = (p
                       | GenerateSequence(
                           start=1,
                           stop=10,
                           expansion_service=get_expansion_service()))

                assert_that(res, equal_to([i for i in range(1, 10)]))

            # We expect to fail here because we do not have a Kafka cluster handy.
            # Nevertheless, we check that the transform is expanded by the
            # ExpansionService and that the pipeline fails during execution.
            with self.assertRaises(Exception) as ctx:
                with self.create_pipeline() as p:
                    # pylint: disable=expression-not-assigned
                    (p
                     | ReadFromKafka(
                         consumer_config={
                             'bootstrap.servers': 'notvalid1:7777, notvalid2:3531'
                         },
                         topics=['topic1', 'topic2'],
                         key_deserializer='org.apache.kafka.common.'
                         'serialization.ByteArrayDeserializer',
                         value_deserializer='org.apache.kafka.common.'
                         'serialization.LongDeserializer',
                         expansion_service=get_expansion_service()))
            self.assertTrue(
                'No resolvable bootstrap urls given in bootstrap.servers'
                in str(ctx.exception),
                'Expected to fail due to invalid bootstrap.servers, but '
                'failed due to:\n%s' % str(ctx.exception))

            # We just test the expansion but do not execute.
            # pylint: disable=expression-not-assigned
            (self.create_pipeline()
             | Impulse()
             | Map(lambda input: (1, input))
             | WriteToKafka(producer_config={
                 'bootstrap.servers':
                 'localhost:9092, notvalid2:3531'
             },
                            topic='topic1',
                            key_serializer='org.apache.kafka.'
                            'common.serialization.'
                            'LongSerializer',
                            value_serializer='org.apache.kafka.'
                            'common.serialization.'
                            'ByteArraySerializer',
                            expansion_service=get_expansion_service()))
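
The snippets in these examples omit their imports. A minimal sketch of the imports they rely on (module paths may differ slightly between Beam releases):

import apache_beam as beam
from apache_beam import Impulse, Map
from apache_beam.io.external.generate_sequence import GenerateSequence
from apache_beam.io.kafka import ReadFromKafka, WriteToKafka
from apache_beam.testing.util import assert_that, equal_to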
Example #2
def build_read_pipeline(self, pipeline):
    _ = (pipeline
         | 'ReadFromKafka' >> ReadFromKafka(
             consumer_config={
                 'bootstrap.servers': self.bootstrap_servers,
                 'auto.offset.reset': 'earliest'
             },
             topics=[self.topic],
             expansion_service=self.expansion_service)
         | 'Windowing' >> beam.WindowInto(
             beam.window.FixedWindows(300),
             trigger=beam.transforms.trigger.AfterProcessingTime(60),
             accumulation_mode=beam.transforms.trigger.AccumulationMode.DISCARDING)
         | 'DecodingValue' >> beam.Map(lambda elem: int(elem[1].decode()))
         | 'CombineGlobally' >> beam.CombineGlobally(sum).without_defaults()
         | 'SetSumCounter' >> beam.Map(self.sum_counter.inc))
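
build_read_pipeline() assumes its class already holds the broker address, topic, expansion service address, and a metrics counter. A sketch of that setup; the class name and the concrete values are placeholders, not part of the original example:

from apache_beam.metrics import Metrics

class KafkaReadPerfTest:  # hypothetical container class
    def __init__(self):
        self.bootstrap_servers = 'localhost:9092'  # assumed broker address
        self.topic = 'beam-input'                  # assumed topic name
        self.expansion_service = 'localhost:8097'  # assumed expansion address
        # Counter incremented with each global sum emitted by the pipeline.
        self.sum_counter = Metrics.counter(self.__class__, 'sum')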
Example #3
        def test_external_transforms(self):
            options = self.create_options()
            options._all_options['parallelism'] = 1
            options._all_options['streaming'] = True

            expansion_address = "localhost:" + str(
                FlinkRunnerTest.expansion_port)

            with self.create_pipeline() as p:
                res = (p
                       | GenerateSequence(start=1,
                                          stop=10,
                                          expansion_service=expansion_address))

                assert_that(res, equal_to([i for i in range(1, 10)]))

            # We expect to fail here because we do not have a Kafka cluster handy.
            # Nevertheless, we check that the transform is expanded by the
            # ExpansionService and that the pipeline fails during execution.
            with self.assertRaises(Exception) as ctx:
                with self.create_pipeline() as p:
                    # pylint: disable=expression-not-assigned
                    (p
                     | ReadFromKafka(consumer_config={
                         'bootstrap.servers':
                         'notvalid1:7777, notvalid2:3531'
                     },
                                     topics=['topic1', 'topic2'],
                                     key_deserializer='org.apache.kafka.'
                                     'common.serialization.'
                                     'ByteArrayDeserializer',
                                     value_deserializer='org.apache.kafka.'
                                     'common.serialization.'
                                     'LongDeserializer',
                                     expansion_service=expansion_address))
            self.assertTrue(
                'No resolvable bootstrap urls given in bootstrap.servers'
                in str(ctx.exception),
                'Expected to fail due to invalid bootstrap.servers, but '
                'failed due to:\n%s' % str(ctx.exception))
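
This test writes to the private _all_options dict because it builds options programmatically; outside a test the same settings are usually passed as ordinary pipeline flags. A sketch (standard Flink/portable-runner flags, not taken from this test):

from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions(['--parallelism=1', '--streaming'])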
Example #4
def run():
    options = PipelineOptions([
        "--runner=PortableRunner", "--job_endpoint=localhost:8099",
        "--environment_type=LOOPBACK"
    ])
    # options = PipelineOptions([
    #     "--runner=FlinkRunner",
    #     "--flink_master=localhost:8081",
    # ])
    with beam.Pipeline(options=options) as p:
        (p | 'ReadFromKafka' >> ReadFromKafka(
            consumer_config={"bootstrap.servers": "localhost:9092"},
            topics=["beam-input"])
         | 'ExtractWords' >>
         beam.FlatMap(lambda kv: re.findall(r'[A-Za-z\']+', kv[1]))
         | 'Window' >> beam.WindowInto(
             window.GlobalWindows(),
             trigger=trigger.Repeatedly(trigger.AfterCount(1)),
             accumulation_mode=AccumulationMode.ACCUMULATING)
         | 'Count' >> beam.combiners.Count.PerElement()
         | 'Format' >> beam.Map(lambda word_count: '%s: %s' %
                                (word_count[0], word_count[1]))
         | 'Log' >> beam.ParDo(LoggingDoFn()))
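
LoggingDoFn is referenced here (and again in Example #6) but not defined in the snippet. A minimal sketch, assuming it simply logs each element and passes it through:

import logging

import apache_beam as beam

class LoggingDoFn(beam.DoFn):
    # Assumed behaviour: log the element, then re-emit it unchanged.
    def process(self, element):
        logging.info(element)
        yield element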
Example #5
def run_pipeline():
    options = PipelineOptions(
        #      runner = "DirectRunner",
        runner="PortableRunner",
        job_endpoint="localhost:8099",
        environment_type="LOOPBACK")
    #print(options)

    # options = PipelineOptions([
    #     "--runner=PortableRunner",
    #     "--job_endpoint=localhost:8099",
    #     "--environment_type=LOOPBACK"
    # ])

    # beam_options = PipelineOptions(
    #   beam_args,
    #   runner='DataflowRunner',
    #   project='my-project-id',
    #   job_name='unique-job-name',
    #   temp_location='gs://my-bucket/temp',
    #   region='us-central1')

    with beam.Pipeline(options=options) as p:
        (p
         #    | beam.Create(['alpha','beta', 'gamma'])
         | 'Read from Kafka' >> ReadFromKafka(
             consumer_config={
                 'bootstrap.servers': brokers,
                 'auto.offset.reset': 'latest',
                 'session.timeout.ms': '12000'
                 #                                ,'request.timeout.ms.config': 120000
             },
             topics=[kafka_topic])
         | 'Print' >> beam.Map(lambda x: print('*' * 100, '\n', x)))
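
run_pipeline() expects brokers and kafka_topic to be defined at module level. Placeholder values (assumptions, not from the original code) might look like:

# Assumed module-level configuration; adjust to your cluster.
brokers = 'localhost:9092'
kafka_topic = 'beam-input'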
Example #6
def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    (p | 'ReadFromKafka' >> ReadFromKafka(
        consumer_config={"bootstrap.servers": "localhost:9092"},
        topics=["beam-input"])
     | 'ExtractWords' >> beam.FlatMap(
         lambda kv: re.findall(r'[A-Za-z\']+', kv[1]))
     | 'Window' >> beam.WindowInto(
         window.GlobalWindows(),
         trigger=trigger.Repeatedly(trigger.AfterCount(1)),
         accumulation_mode=AccumulationMode.ACCUMULATING)
     | 'Count' >> beam.combiners.Count.PerElement()
     | 'Format' >> beam.Map(lambda word_count: '%s: %s' %
                            (word_count[0], word_count[1]))
     | 'Log' >> beam.ParDo(LoggingDoFn()))

    result = p.run()
    result.wait_until_finish()
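
ReadFromKafka is a cross-language transform, so this example needs a portable runner. A sketch of launching it with the same flags Example #4 uses (the job-server endpoint is an assumption about a locally running job server):

run([
    '--runner=PortableRunner',
    '--job_endpoint=localhost:8099',
    '--environment_type=LOOPBACK',
])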
Example #7
def main(pipeline_options, args):

    farm_kw = (
        ('graphs', 1),
        ('jobs', 2),
        ('tasks', 3),
        ('outputs', 4),
    )

    pipe = beam.Pipeline(options=pipeline_options)

    feed = (pipe
            | 'KafkaInflow' >> ReadFromKafka(
                consumer_config={
                    'bootstrap.servers': 'localhost:9092',
                },
                topics=[TOPIC],
                key_deserializer=
                'org.apache.kafka.common.serialization.ByteArrayDeserializer',
                value_deserializer=
                'org.apache.kafka.common.serialization.ByteArrayDeserializer',
                expansion_service='localhost:8097')
            | 'RawFeed' >> Log(color=('white', ['dark'])))

    # (
    #     feed
    #     | JobAggregateLevel.TASK >> JobOutput(JobAggregateLevel.TASK)
    #     | 'PerTask' >> Log(color=('yellow', ['bold']))
    # )
    #
    # (
    #     feed
    #     | JobAggregateLevel.JOB >> JobOutput(JobAggregateLevel.JOB)
    #     | 'PerJob' >> Log(color=('blue', ['bold']))
    # )
    #
    # (
    #     feed
    #     | JobAggregateLevel.GRAPH >> JobOutput(JobAggregateLevel.GRAPH)
    #     | 'PerGraph' >> Log(color=('green', ['bold']))
    # )

    result = pipe.run()  # type: PipelineResult
    time.sleep(10)
    while result.state != PipelineState.RUNNING:
        time.sleep(10)

    print()
    cprint('Starting streaming graph forever. Kill with ctrl+c',
           'red',
           attrs=['bold'])
    print()

    cprint('Generating farm jobs:', 'yellow')
    for k, v in farm_kw:
        print('  {}={}'.format(k, colored(repr(v), 'white', attrs=['bold'])))
    print()

    admin = kafka.admin.KafkaAdminClient(
        # bootstrap_servers=['localhost:9092'],
    )
    try:
        admin.create_topics([kafka.admin.NewTopic(TOPIC, 1, 1)])
    except kafka.errors.TopicAlreadyExistsError:
        pass

    # producer = kafka.KafkaProducer(
    #     # bootstrap_servers=['localhost:9092'],
    # )
    #
    # for i, payload in enumerate(rillbeam.data.farm.gen_farm_messages(**dict(farm_kw))):
    #     print payload
    #     producer.send('beam-kfarm', 'foo')

    try:
        result.wait_until_finish()
    except KeyboardInterrupt:
        print()
        cprint('Shutting down...', 'yellow')
        result.cancel()