Exemplo n.º 1
0
def main(argv=None):
    """Flatten two inputs, split them into two tagged outputs and print each.

    Args:
        argv: Optional command-line flags forwarded to ``PipelineOptions``.
    """
    pipeline_options = PipelineOptions(argv)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = True

    pipeline = Pipeline(options=pipeline_options)

    first = pipeline | 'Input1' >> beam.Create([1, 2, 3], reshuffle=False)
    second = pipeline | 'Input2' >> beam.Create([4, 5, 6], reshuffle=False)

    merged = (first, second) | 'Flatten' >> beam.Flatten()
    # OUTPUT_TAG_A is declared as the main output, OUTPUT_TAG_B as the
    # additional tagged output.
    output_a, output_b = merged | 'Split' >> beam.ParDo(
        MultiOutputDoFn()).with_outputs(
            MultiOutputDoFn.OUTPUT_TAG_B, main=MultiOutputDoFn.OUTPUT_TAG_A)

    # The identity Map steps exist only to pin the output type (and thus the
    # coder) for the Dataflow runner; without them a type inference error
    # (BEAM-4132) may show up.
    for suffix, branch in (('A', output_a), ('B', output_b)):
        typed = branch | 'Identity' + suffix >> beam.Map(
            lambda x: x).with_output_types(Tuple[str, int])
        _ = typed | 'Print' + suffix >> beam.ParDo(
            StatefulPrintDoFn('Print' + suffix))

    pipeline.run()
Exemplo n.º 2
0
def main(argv=None):
    """Log every element produced by ``GenerateSequence(0, stop=100)``.

    The sequence transform is given ``BEAM_IO_EXPANSION_SERVICE`` as its
    expansion service.

    Args:
        argv: Optional command-line flags forwarded to ``PipelineOptions``.
    """
    pipeline = Pipeline(options=PipelineOptions(argv))

    sequence = pipeline | GenerateSequence(
        0, stop=100, expansion_service=BEAM_IO_EXPANSION_SERVICE)
    _ = sequence | Map(lambda x: logging.info(x))

    pipeline.run()
Exemplo n.º 3
0
def main(argv=None):
  """Send five letters through the ``Print`` transform and run the pipeline.

  Args:
    argv: Optional command-line flags forwarded to ``PipelineOptions``.
  """
  pipeline_options = PipelineOptions(argv)
  pipeline = Pipeline(options=pipeline_options)

  letters = pipeline | Create(["a", "b", "c", "d", "e"], reshuffle=False)
  # The expansion service is derived from the same options object.
  _ = letters | Print("hello", expansion_service(pipeline_options))

  pipeline.run()
Exemplo n.º 4
0
    def handle_return(self, pipeline: beam.Pipeline) -> None:
        """Delegate to the parent handler, apply a ParDo, then run the pipeline.

        The stated goal (see the TODO below) is to append a
        beam.io.WriteToParquet at the end of a beam pipeline and therefore
        persist the results; that write step is not implemented yet.

        Args:
            pipeline: A beam.pipeline object.
        """
        # TODO [ENG-139]: Implement beam writing
        super().handle_return(pipeline)
        # NOTE(review): ParDo is constructed without any DoFn here. This looks
        # like a placeholder for the missing write step -- confirm that it does
        # not raise when this method is actually called.
        pipeline | beam.ParDo()
        pipeline.run()
Exemplo n.º 5
0
def main(argv=None):
    """Demonstrate chained list side inputs over a three-element collection.

    ``Output1`` pairs every element with the whole input as a list side
    input; ``Output2`` logs every element together with the list of those
    pairs.

    Args:
        argv: Optional command-line flags forwarded to ``PipelineOptions``.
    """
    pipeline = Pipeline(options=PipelineOptions(argv))

    numbers = pipeline | 'Input' >> beam.Create([1, 2, 3], reshuffle=False)

    pair_with_side = beam.Map(lambda x, side: (x, side), AsList(numbers))
    paired = numbers | 'Output1' >> pair_with_side

    log_with_side = beam.Map(
        lambda x, side: logging.info('x: %s, side: %s', x, side),
        AsList(paired))
    _ = numbers | 'Output2' >> log_with_side

    pipeline.run()
Exemplo n.º 6
0
def main(argv=None):
    """Read from Kafka and log each element.

    The bootstrap servers and the topic come from the ``KafkaReadOptions``
    view of the pipeline options.

    Args:
        argv: Optional command-line flags forwarded to ``PipelineOptions``.
    """
    pipeline_options = PipelineOptions(argv)
    kafka = pipeline_options.view_as(KafkaReadOptions)

    pipeline = Pipeline(options=pipeline_options)

    consumer_config = {'bootstrap.servers': kafka.bootstrap_servers}
    records = pipeline | ReadFromKafka(
        consumer_config=consumer_config, topics=[kafka.topic])
    _ = records | Map(lambda x: logging.info('kafka element: %s', x))

    pipeline.run()
Exemplo n.º 7
0
def main():
    """Translate ``EN_TEXTS`` with ``TranslateDoFn`` and print the results.

    The Google Cloud project is read from the pipeline options and handed
    to the DoFn together with the source and target language codes.
    """
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True
    gcp_project = pipeline_options.view_as(GoogleCloudOptions).project

    pipeline = Pipeline(options=pipeline_options)

    texts = pipeline | Create(EN_TEXTS)
    translated = texts | ParDo(
        TranslateDoFn(gcp_project, SOURCE_LANGUAGE_CODE, TARGET_LANGUAGE_CODE))
    _ = translated | Map(print_translation)

    pipeline.run()
Exemplo n.º 8
0
def main():
    """Feed many byte payloads under a single key into ``BigBagDoFn``.

    Each of the ``NUM_SHARDS`` seed elements is expanded into
    ``NUM_ELEMENTS_PER_SHARD`` values of ``bytes(ELEMENT_BYTES)``; every
    value is keyed with the empty string before reaching the DoFn.
    """
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    pipeline = Pipeline(options=pipeline_options)

    shards = pipeline | Create(list(range(NUM_SHARDS)))
    payloads = shards | FlatMap(
        lambda _: (bytes(ELEMENT_BYTES)
                   for _ in range(NUM_ELEMENTS_PER_SHARD)))
    keyed = payloads | WithKeys('')
    _ = keyed | ParDo(BigBagDoFn())

    pipeline.run()
Exemplo n.º 9
0
def pipeline_options_remote(argv):
    """Creating a Pipeline using a PipelineOptions object for remote execution.

    The ``[START ...]`` / ``[END ...]`` comment pairs delimit named snippet
    regions and are kept exactly as they are (presumably they are consumed
    by documentation tooling -- verify before reformatting this function).

    Args:
        argv: Command-line flags passed to ``PipelineOptions(flags=argv)``.
    """

    from apache_beam import Pipeline
    from apache_beam.options.pipeline_options import PipelineOptions

    # [START pipeline_options_create]
    options = PipelineOptions(flags=argv)

    # [END pipeline_options_create]

    # [START pipeline_options_define_custom]
    class MyOptions(PipelineOptions):
        # Custom options view that registers the --input and --output flags.
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input')
            parser.add_argument('--output')

    # [END pipeline_options_define_custom]

    from apache_beam.options.pipeline_options import GoogleCloudOptions
    from apache_beam.options.pipeline_options import StandardOptions

    # [START pipeline_options_dataflow_service]
    # Create and set your PipelineOptions.
    # Note: this rebinds `options`, replacing the object created in the
    # pipeline_options_create snippet above.
    options = PipelineOptions(flags=argv)

    # For Cloud execution, set the Cloud Platform project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'my-project-id'
    google_cloud_options.job_name = 'myjob'
    google_cloud_options.staging_location = 'gs://my-bucket/binaries'
    google_cloud_options.temp_location = 'gs://my-bucket/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)
    # [END pipeline_options_dataflow_service]

    # Read the custom --input/--output values through the MyOptions view.
    my_options = options.view_as(MyOptions)
    my_input = my_options.input
    my_output = my_options.output

    # Rebinds `p`: the Pipeline created in the snippet above is discarded and
    # the steps below are attached to a TestPipeline instead. `TestPipeline`
    # and `beam` are not imported inside this function, so they must be
    # available at module scope.
    p = TestPipeline()  # Use TestPipeline for testing.

    lines = p | beam.io.ReadFromText(my_input)
    lines | beam.io.WriteToText(my_output)

    p.run()
Exemplo n.º 10
0
def pipeline_options_remote(argv):
  """Creating a Pipeline using a PipelineOptions object for remote execution.

  The ``[START ...]`` / ``[END ...]`` comment pairs delimit named snippet
  regions and are kept exactly as they are (presumably they are consumed by
  documentation tooling -- verify before reformatting this function).

  Args:
    argv: Command-line flags passed to ``PipelineOptions(flags=argv)``.
  """

  from apache_beam import Pipeline
  from apache_beam.options.pipeline_options import PipelineOptions

  # [START pipeline_options_create]
  options = PipelineOptions(flags=argv)
  # [END pipeline_options_create]

  # [START pipeline_options_define_custom]
  class MyOptions(PipelineOptions):
    # Custom options view that registers the --input and --output flags.

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input')
      parser.add_argument('--output')
  # [END pipeline_options_define_custom]

  from apache_beam.options.pipeline_options import GoogleCloudOptions
  from apache_beam.options.pipeline_options import StandardOptions

  # [START pipeline_options_dataflow_service]
  # Create and set your PipelineOptions.
  # Note: this rebinds `options`, replacing the object created in the
  # pipeline_options_create snippet above.
  options = PipelineOptions(flags=argv)

  # For Cloud execution, set the Cloud Platform project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://my-bucket/binaries'
  google_cloud_options.temp_location = 'gs://my-bucket/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)
  # [END pipeline_options_dataflow_service]

  # Read the custom --input/--output values through the MyOptions view.
  my_options = options.view_as(MyOptions)
  my_input = my_options.input
  my_output = my_options.output

  # Rebinds `p`: the Pipeline created in the snippet above is discarded and
  # the steps below are attached to a TestPipeline instead. `TestPipeline`
  # and `beam` are not imported inside this function, so they must be
  # available at module scope.
  p = TestPipeline()  # Use TestPipeline for testing.

  lines = p | beam.io.ReadFromText(my_input)
  lines | beam.io.WriteToText(my_output)

  p.run()
Exemplo n.º 11
0
    def test_job_python_from_python_it(self):
        """Run a pipeline rebuilt from its proto with a 'simple' external step.

        A transform is registered under the URN ``'simple'``, applied through
        ``beam.ExternalTransform``, and the pipeline is then round-tripped
        through its runner-API proto before being executed.
        """
        @ptransform.PTransform.register_urn('simple', None)
        class SimpleTransform(ptransform.PTransform):
            """Wraps each element as ``Simple(<element>)``."""

            def expand(self, pcoll):
                wrap = beam.Map(lambda x: 'Simple(%s)' % x)
                return pcoll | wrap

            def to_runner_api_parameter(self, unused_context):
                return 'simple', None

            @staticmethod
            def from_runner_api_parameter(_0, _1, _2):
                return SimpleTransform()

        test_pipeline = TestPipeline(is_integration_test=True)

        letters = test_pipeline | beam.Create(['a', 'b'])
        wrapped = letters | beam.ExternalTransform(
            'simple', None, expansion_service.ExpansionServiceServicer())
        assert_that(wrapped, equal_to(['Simple(a)', 'Simple(b)']))

        # Serialize to the runner API proto and rebuild a pipeline from it,
        # reusing the original runner and options.
        proto_pipeline, _ = test_pipeline.to_runner_api(return_context=True)
        rebuilt = Pipeline.from_runner_api(
            proto_pipeline, test_pipeline.runner, test_pipeline._options)

        run_result = rebuilt.run()
        run_result.wait_until_finish()
def main():
    """Stream Pub/Sub messages into windowed entity scores in BigQuery.

    Messages from ``PUBSUB_TOPIC`` are put into 60-unit fixed windows,
    expanded by ``emit_values`` (with ``entity_map`` as an extra argument),
    combined per key by ``EntityScoreCombine``, stamped by
    ``AddWindowTimestampFn``, formatted and appended to the
    ``streaming_scores`` table.

    Notes carried over from the previous revision of this function:
      * the input used to be a BigQuerySource query selecting
        ``created_at, text`` from ``got_sentiment.got_tweets``;
      * ``with_attributes=False`` and ``timestamp_attribute='created_at'``
        were removed from the ``ReadFromPubSub`` call.
    """
    # `options` and `entity_map` are not defined in this function; they are
    # resolved from the enclosing (module) scope.
    with Pipeline(options=options) as pipeline:
        messages = pipeline | 'read_from_topic' >> ReadFromPubSub(
            topic=PUBSUB_TOPIC)
        windowed = messages | 'Window' >> WindowInto(window.FixedWindows(60))
        emitted = windowed | 'Emit_needed_values' >> FlatMap(
            emit_values, entity_map)
        combined = emitted | 'Combine' >> CombinePerKey(EntityScoreCombine())
        stamped = combined | 'Add Window Timestamp' >> beam.ParDo(
            AddWindowTimestampFn())
        rows = stamped | 'FormatForWrite' >> Map(format_for_write)

        bigquery_sink = WriteToBigQuery(
            'streaming_scores',
            dataset=BQ_DATASET,
            project=PROJECT_ID,
            create_disposition='CREATE_IF_NEEDED',
            write_disposition='WRITE_APPEND',
            batch_size=20)
        write_result = rows | 'Write' >> bigquery_sink
Exemplo n.º 13
0
def pipeline_options_local(argv):
  """Creating a Pipeline using a PipelineOptions object for local execution.

  The ``[START ...]`` / ``[END ...]`` comment pairs delimit named snippet
  regions and are kept exactly as they are (presumably they are consumed by
  documentation tooling -- verify before reformatting this function).

  Args:
    argv: Command-line flags passed to ``PipelineOptions(flags=argv)``.
  """

  from apache_beam import Pipeline
  from apache_beam.options.pipeline_options import PipelineOptions

  options = PipelineOptions(flags=argv)

  # [START pipeline_options_define_custom_with_help_and_default]
  class MyOptions(PipelineOptions):
    # Custom options view: --input and --output, each with help and a default.

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='Output for the pipeline',
                          default='gs://my-bucket/output')
  # [END pipeline_options_define_custom_with_help_and_default]

  my_options = options.view_as(MyOptions)

  # Captured from the argv-based options before `options` is rebound below.
  my_input = my_options.input
  my_output = my_options.output

  # [START pipeline_options_local]
  # Create and set your Pipeline Options.
  # Note: this PipelineOptions() is built without argv and replaces the
  # `options` object created at the top of the function.
  options = PipelineOptions()
  with Pipeline(options=options) as p:
    # [END pipeline_options_local]

    # The inner `with` rebinds `p`, so the read/write steps are attached to
    # the TestPipeline and no step is applied to the outer Pipeline here.
    # `TestPipeline` and `beam` are not imported inside this function, so
    # they must be available at module scope.
    with TestPipeline() as p:  # Use TestPipeline for testing.
      lines = p | beam.io.ReadFromText(my_input)
      lines | beam.io.WriteToText(my_output)
Exemplo n.º 14
0
    def test_pipeline_generation(self):
        """Check that the external transform is expanded only after a proto
        round trip.

        The original pipeline must show no sub-parts for the external step,
        while the pipeline rebuilt from the proto must expose the labelled
        inner ``Map``.
        """
        @ptransform.PTransform.register_urn('simple', None)
        class SimpleTransform(ptransform.PTransform):
            """Wraps each element as ``Simple(<element>)`` under 'TestLabel'."""

            def expand(self, pcoll):
                labelled_map = 'TestLabel' >> beam.Map(
                    lambda x: 'Simple(%s)' % x)
                return pcoll | labelled_map

            def to_runner_api_parameter(self, unused_context):
                return 'simple', None

            @staticmethod
            def from_runner_api_parameter(unused_parameter, unused_context):
                return SimpleTransform()

        original = beam.Pipeline()
        letters = original | beam.Create(['a', 'b'])
        wrapped = letters | beam.ExternalTransform(
            'simple', None, expansion_service.ExpansionServiceServicer())
        assert_that(wrapped, equal_to(['Simple(a)', 'Simple(b)']))

        proto, _ = original.to_runner_api(return_context=True)
        rebuilt = Pipeline.from_runner_api(
            proto, original.runner, original._options)

        # Before the round trip the external step has no sub-transforms.
        self.assertEqual([], original.transforms_stack[0].parts[1].parts)

        # After the round trip the external step has been expanded.
        expanded_parts = rebuilt.transforms_stack[0].parts[1].parts
        self.assertNotEqual([], expanded_parts)
        self.assertEqual(
            u'ExternalTransform(simple)/TestLabel',
            expanded_parts[0].full_label)
Exemplo n.º 15
0
def main():
    """Sum the integers 1..100 in a pipeline and log the total."""
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    pipeline = Pipeline(options=pipeline_options)

    first, last = 1, 100
    source_label = 'From {} to {}'.format(first, last)

    numbers = pipeline | source_label >> Create(list(range(first, last + 1)))
    summed = numbers | 'Sum' >> CombineGlobally(sum)
    _ = summed | 'Print' >> ParDo(
        lambda total: logging.info('Sum from 1 to 100 is %s', total))

    pipeline.run()
Exemplo n.º 16
0
def run(argv=None):
  """Run ``connect_and_query`` once, driven by a single ``None`` element.

  Args:
    argv: Optional command-line flags forwarded to ``PipelineOptions``.
  """
  pipeline_options = PipelineOptions(argv)
  pipeline_options.view_as(SetupOptions).save_main_session = True

  # Leaving the `with` block runs the pipeline.
  with Pipeline(options=pipeline_options) as pipeline:
    seed = pipeline | beam.Create([None])
    _ = seed | beam.ParDo(connect_and_query)
Exemplo n.º 17
0
def main():
    """Translate three English phrases to Japanese and log each pair.

    The Google Cloud project must be present in the pipeline options; the
    function asserts that it is not ``None`` before building the pipeline.
    """
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    gcp_project = pipeline_options.view_as(GoogleCloudOptions).project
    assert gcp_project is not None, '"project" is not specified.'

    source_language, target_language = 'en-US', 'ja'
    phrases = ['Hello', 'Thank you', 'Goodbye']

    pipeline = Pipeline(options=pipeline_options)

    inputs = pipeline | 'Texts' >> Create(phrases)
    translated = inputs | 'Translate' >> ParDo(
        Translate(gcp_project, source_language, target_language))
    _ = translated | 'Print' >> Map(
        lambda pair: logging.info('%s -> %s', pair[0], pair[1]))

    pipeline.run()
Exemplo n.º 18
0
def main():
  """Batch large elements under one key and log the size of each batch.

  Blocks until the pipeline run has finished.
  """
  pipeline_options = PipelineOptions()
  pipeline_options.view_as(SetupOptions).save_main_session = True

  batch_size = 1000000
  buffering_secs = 600

  pipeline = Pipeline(options=pipeline_options)

  seeds = pipeline | Create(range(100), reshuffle=True)
  # The original note describes each generated element as 128 KiB.
  large = seeds | ParDo(make_large_elements)
  keyed = large | WithKeys('')
  # Very large batch size combined with a 600-second buffering duration.
  batches = keyed | GroupIntoBatchesWithMultiBags(batch_size, buffering_secs)
  _ = batches | Map(
      lambda kv: logging.info('key: %s, value count: %s', kv[0], len(kv[1])))

  result = pipeline.run()
  result.wait_until_finish()
Exemplo n.º 19
0
def main():
  """Convert the integers 1..10 with ``ToXmlDoFn`` and log each result."""
  pipeline_options = PipelineOptions()
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline = Pipeline(options=pipeline_options)

  first, last = 1, 10
  source_label = 'From {} to {}'.format(first, last)

  numbers = pipeline | source_label >> Create(list(range(first, last + 1)))
  documents = numbers | 'ToXml' >> ParDo(ToXmlDoFn())
  # When a job ends too fast, worker VMs may be shut down before their local
  # log files reach Cloud Logging; the 30s sleep step works around that.
  delayed = documents | 'Sleep30s' >> ParDo(Sleep(30))
  _ = delayed | 'Print' >> ParDo(lambda xml: logging.info(xml))

  pipeline.run()
Exemplo n.º 20
0
def main(argv=None):
    """Read Pub/Sub messages (with attributes) and log each one.

    Args:
        argv: Optional command-line flags forwarded to ``PipelineOptions``.
    """
    pipeline_options = PipelineOptions(argv)
    pubsub_topic = pipeline_options.view_as(PubSubTopicOptions).topic

    pipeline = Pipeline(options=pipeline_options)

    # Per the original note, `ReadFromPubSub` here is the external transform
    # `apache_beam.io.external.gcp.pubsub.ReadFromPubSub`, not the native
    # `apache_beam.io.gcp.pubsub.ReadFromPubSub` used in most cases.
    #
    # Using BeamJarExpansionService(
    # 'sdks:java:io:google-cloud-platform:expansion-service:shadowJar') as the
    # expansion service fails because that beam jar has no DirectRunner
    # dependency; the workaround is the custom expansion service jar from this
    # project.
    messages = pipeline | ReadFromPubSub(
        topic=pubsub_topic,
        with_attributes=True,
        expansion_service=expansion_service(pipeline_options))
    _ = messages | Map(lambda message: logging.info("message: %s", message))

    pipeline.run()
Exemplo n.º 21
0
def main():
    """Generate integer ranges and write them to sharded text files.

    Each of ``inputs`` seed values ``n`` expands to the
    ``_ELEMENTS_PER_INPUT`` consecutive integers starting at
    ``n * _ELEMENTS_PER_INPUT``. When no output prefix is configured, the
    output goes to ``<temp_location>/output``.
    """
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    custom = pipeline_options.view_as(_Options)
    input_count = custom.inputs
    prefix = custom.output_prefix
    if not prefix:
        # Fall back to a folder under the job's temp location.
        temp_location = pipeline_options.view_as(
            GoogleCloudOptions).temp_location
        prefix = os.path.join(temp_location, 'output')
    shard_count = custom.shards

    pipeline = Pipeline(options=pipeline_options)

    def generate(n):
        first = n * _ELEMENTS_PER_INPUT
        yield from range(first, first + _ELEMENTS_PER_INPUT)

    seeds = pipeline | Create(range(input_count))
    numbers = seeds | ParDo(generate).with_output_types(int)
    _ = numbers | WriteToText(prefix, num_shards=shard_count)

    pipeline.run()
Exemplo n.º 22
0
def run(argv=None):
    """Parse and validate arguments, then launch the streaming pipeline.

    The pipeline reads bytes from a Pub/Sub topic, applies fixed windows of
    size 10 (offset 0), decodes each message with ``json.loads``, applies
    ``PreProcessing`` and ``MakeRemoteInferenceDoFn``, formats the result
    and appends it to a BigQuery table.

    Args:
        argv: Optional list of command-line arguments. Arguments that the
            local parser does not recognize are forwarded to
            ``PipelineOptions``.

    Raises:
        ValueError: If the topic is not of the form
            ``projects/<project>/topics/<topic>``, or if the table name does
            not consist of exactly two dot-separated parts.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_topic',
                        type=str,
                        help="Input Pub/Sub topic name.")
    # The example matches the validation below, which requires exactly one
    # '.' in the table name.
    parser.add_argument(
        'output_table',
        type=str,
        help="Output BigQuery table name. Example: project:dataset.table")
    parser.add_argument('--model_project',
                        type=str,
                        help="Google Project ID with model.")
    parser.add_argument('--model_name',
                        type=str,
                        help="Name of the Google AI Platform model name.")
    parser.add_argument('--model_region',
                        type=str,
                        help="AI Platform region name.")
    parser.add_argument('--model_version',
                        type=str,
                        help="AI Platform model version.")

    known_args, pipeline_args = parser.parse_known_args(argv)

    # Expected topic shape: projects/<project>/topics/<topic>
    topic_parts = known_args.input_topic.split('/')
    topic_is_valid = (len(topic_parts) == 4
                      and topic_parts[0] == 'projects'
                      and topic_parts[2] == 'topics')
    if not topic_is_valid:
        raise ValueError(
            "Topic name has inappropriate format, expected "
            "'projects/<project>/topics/<topic>', got {!r}.".format(
                known_args.input_topic))

    if len(known_args.output_table.split('.')) != 2:
        raise ValueError(
            "Table name has inappropriate format, expected "
            "'<dataset>.<table>' or '<project>:<dataset>.<table>', "
            "got {!r}.".format(known_args.output_table))

    inf_args = [
        known_args.model_project, known_args.model_name,
        known_args.model_region, known_args.model_version
    ]
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    p = Pipeline(options=options)
    _ = (p | 'read from pub/sub' >> ReadFromPubSub(
        known_args.input_topic).with_output_types(bytes)
         | 'windowing' >> WindowInto(window.FixedWindows(10, 0))
         | 'convert to dict' >> Map(json.loads)
         | 'pre processing' >> PreProcessing()
         | 'make inference' >> ParDo(MakeRemoteInferenceDoFn(*inf_args))
         | 'format message' >> Map(formatter)
         | 'write to BQ' >> WriteToBigQuery(
             table=known_args.output_table,
             schema=build_bq_schema(),
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=BigQueryDisposition.WRITE_APPEND))
    if os.environ.get('DEPLOY'):
        # When deploying, the process has to exit right after submitting the
        # job, so p.run() is used directly instead of `with Pipeline() as p`.
        p.run()
    else:
        p.run().wait_until_finish()
Exemplo n.º 23
0
def main():
    """Group large elements under one key with a repeated count-or-time
    trigger and log the size of every emitted group.

    Blocks until the pipeline run has finished.
    """
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    batch_size = 1000000
    buffering_secs = 600

    pipeline = Pipeline(options=pipeline_options)

    seeds = pipeline | Create(range(100), reshuffle=True)
    # The original note describes each generated element as 128 KiB.
    large = seeds | ParDo(make_large_elements)
    keyed = large | WithKeys('')

    # Fire repeatedly as soon as either the element count or the processing
    # time condition is met; fired panes are discarded.
    fire_condition = AfterAny(AfterCount(batch_size),
                              AfterProcessingTime(buffering_secs))
    windowed = keyed | WindowInto(
        GlobalWindows(),
        trigger=Repeatedly(fire_condition),
        accumulation_mode=AccumulationMode.DISCARDING)
    grouped = windowed | GroupByKey()
    _ = grouped | Map(
        lambda kv: logging.info('key: %s, value count: %s', kv[0],
                                len(kv[1])))

    result = pipeline.run()
    result.wait_until_finish()
Exemplo n.º 24
0
def pipeline_options_local(argv):
  """Creating a Pipeline using a PipelineOptions object for local execution.

  The ``[START ...]`` / ``[END ...]`` comment pairs delimit named snippet
  regions and are kept exactly as they are (presumably they are consumed by
  documentation tooling -- verify before reformatting this function).

  Args:
    argv: Command-line flags passed to ``PipelineOptions(flags=argv)``.
  """

  from apache_beam import Pipeline
  from apache_beam.options.pipeline_options import PipelineOptions

  options = PipelineOptions(flags=argv)

  # [START pipeline_options_define_custom_with_help_and_default]
  class MyOptions(PipelineOptions):
    # Custom options view: --input and --output, each with help and a default.

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='Output for the pipeline',
                          default='gs://my-bucket/output')
  # [END pipeline_options_define_custom_with_help_and_default]

  my_options = options.view_as(MyOptions)

  # Captured from the argv-based options before `options` is rebound below.
  my_input = my_options.input
  my_output = my_options.output

  # [START pipeline_options_local]
  # Create and set your Pipeline Options.
  # Note: this PipelineOptions() is built without argv and replaces the
  # `options` object created at the top of the function.
  options = PipelineOptions()
  p = Pipeline(options=options)
  # [END pipeline_options_local]

  # Rebinds `p`: the Pipeline created in the snippet above is discarded and
  # the steps below are attached to a TestPipeline instead. `TestPipeline`
  # and `beam` are not imported inside this function, so they must be
  # available at module scope.
  p = TestPipeline()  # Use TestPipeline for testing.
  lines = p | beam.io.ReadFromText(my_input)
  lines | beam.io.WriteToText(my_output)
  p.run()
Exemplo n.º 25
0
def run(input_topic, num_shards, window_size):
    """Stream Pub/Sub messages into fixed windows and write them to GCS.

    Args:
        input_topic: Pub/Sub topic passed to ``io.ReadFromPubSub``.
        num_shards: Shard count handed to ``GroupMessagesByFixedWindows``.
        window_size: Window size handed to ``GroupMessagesByFixedWindows``.
    """
    # `save_main_session=True` lets DoFns access globally imported modules.
    # `pipeline_args` is not a parameter; it is resolved from module scope.
    base_options = PipelineOptions(
        pipeline_args, streaming=True, save_main_session=True)
    custom_options = base_options.view_as(CustomPipelineOptions)

    with Pipeline(options=custom_options) as pipeline:
        # `timestamp_attribute` is left unspecified in `ReadFromPubSub`, so
        # Beam binds the publish time returned by the Pub/Sub server for each
        # message to the element's timestamp parameter, accessible via
        # `DoFn.TimestampParam`.
        # https://beam.apache.org/releases/pydoc/current/apache_beam.io.gcp.pubsub.html#apache_beam.io.gcp.pubsub.ReadFromPubSub
        messages = pipeline | "Read from Pub/Sub" >> io.ReadFromPubSub(
            topic=input_topic)
        grouped = messages | "Window into" >> GroupMessagesByFixedWindows(
            window_size, num_shards)
        _ = grouped | "Write to GCS" >> ParDo(
            WriteToGCS(custom_options.output_path))
Exemplo n.º 26
0
    def test_pipeline_generation(self):
        """Check that the 'simple' external transform is expanded only in the
        pipeline rebuilt from the runner-API proto."""
        original = beam.Pipeline()
        letters = original | beam.Create(['a', 'b'])
        wrapped = letters | beam.ExternalTransform(
            'simple', None, expansion_service.ExpansionServiceServicer())
        assert_that(wrapped, equal_to(['Simple(a)', 'Simple(b)']))

        proto, _ = original.to_runner_api(return_context=True)
        rebuilt = Pipeline.from_runner_api(
            proto, original.runner, original._options)

        # Before the round trip the external step has no sub-transforms.
        self.assertEqual([], original.transforms_stack[0].parts[1].parts)

        # After the round trip the external step has been expanded.
        expanded_parts = rebuilt.transforms_stack[0].parts[1].parts
        self.assertNotEqual([], expanded_parts)
        self.assertEqual(
            u'ExternalTransform(simple)/TestLabel',
            expanded_parts[0].full_label)
Exemplo n.º 27
0
def tensorize_sql_fields(pipeline: Pipeline, output_path: str,
                         sql_dataset: str, tensor_type: str):
    """Query BigQuery for one tensor type and write per-sample tensors.

    The rows returned by the type-specific query are keyed by their
    ``sample_id`` column, grouped per key and handed to
    ``write_tensor_from_sql`` together with ``output_path`` and
    ``tensor_type``. The function blocks until the pipeline has finished.

    Args:
        pipeline: Beam pipeline the steps are attached to; it is run here.
        output_path: Destination passed through to ``write_tensor_from_sql``.
        sql_dataset: Dataset name passed to the query builder.
        tensor_type: One of ``'categorical'``, ``'continuous'``, ``'icd'``,
            ``'disease'``, ``'phecode_disease'`` or ``'death'``.

    Raises:
        ValueError: If ``tensor_type`` is not one of the supported values.
    """
    if tensor_type == 'categorical':
        query = _get_categorical_query(sql_dataset)
    elif tensor_type == 'continuous':
        query = _get_continuous_query(sql_dataset)
    elif tensor_type == 'icd':
        query = _get_icd_query(sql_dataset)
    elif tensor_type == 'disease':
        query = _get_disease_query(sql_dataset)
    elif tensor_type == 'phecode_disease':
        query = _get_phecode_query(sql_dataset)
    elif tensor_type == 'death':
        query = _get_death_and_censor_query(sql_dataset)
    else:
        # Build a single message string: passing the value as a second
        # positional argument would make the error render as a tuple.
        raise ValueError(
            "Can tensorize only categorical, continuous, icd, disease, "
            "phecode_disease or death fields, got {!r}".format(tensor_type))

    bigquery_source = beam.io.BigQuerySource(query=query,
                                             use_standard_sql=True)
    # Query table in BQ
    _ = (
        pipeline
        | 'QueryTables' >> beam.io.Read(bigquery_source)

        # Each row is a dictionary where the keys are the BigQuery columns
        | 'CreateKey' >> beam.Map(lambda row: (row['sample_id'], row))

        # Group by key
        | 'GroupByKey' >> beam.GroupByKey()

        # Format into hd5 files and upload to GCS
        | 'CreateHd5sAndUploadToGCS' >> beam.Map(write_tensor_from_sql,
                                                 output_path, tensor_type))

    result = pipeline.run()
    result.wait_until_finish()
Exemplo n.º 28
0
    def test_pipeline_generation(self):
        """Check that the xlang 'prefix' external transform is expanded only
        in the pipeline rebuilt from the runner-API proto."""
        original = beam.Pipeline()
        letters = original | beam.Create(['a', 'b'])
        _ = letters | beam.ExternalTransform(
            'beam:transforms:xlang:test:prefix',
            ImplicitSchemaPayloadBuilder({'data': u'0'}),
            expansion_service.ExpansionServiceServicer())

        proto, _ = original.to_runner_api(return_context=True)
        rebuilt = Pipeline.from_runner_api(
            proto, original.runner, original._options)

        # Before the round trip the external step has no sub-transforms.
        self.assertEqual([], original.transforms_stack[0].parts[1].parts)

        # After the round trip the external step has been expanded.
        expanded_parts = rebuilt.transforms_stack[0].parts[1].parts
        self.assertNotEqual([], expanded_parts)
        self.assertEqual(
            u'ExternalTransform(beam:transforms:xlang:test:prefix)/TestLabel',
            expanded_parts[0].full_label)
Exemplo n.º 29
0
    def test_metrics(self):
      """Run a simple DoFn that increments a counter, and verify that its
      expected value is written to a temporary file by the FileReporter.

      Exactly one line of the metrics file must mention the counter, and that
      line must contain ``'<counter_name>: <n>'``.
      """

      counter_name = 'elem_counter'

      class DoFn(beam.DoFn):
        def __init__(self):
          self.counter = Metrics.counter(self.__class__, counter_name)
          logging.info('counter: %s' % self.counter.metric_name)

        def process(self, v):
          self.counter.inc()

      options = self.create_options()
      # Test only supports parallelism of 1
      options._all_options['parallelism'] = 1
      n = 100
      with Pipeline(self.get_runner(), options) as p:
        # pylint: disable=expression-not-assigned
        (p
         | beam.Create(list(range(n)))
         | beam.ParDo(DoFn()))

      with open(self.test_metrics_path, 'r') as f:
        lines = [line for line in f if counter_name in line]

      self.assertEqual(
          len(lines), 1,
          msg='Expected 1 line matching "{}":\n{}'.format(
              counter_name, '\n'.join(lines))
      )
      line = lines[0]
      # Check the counter value itself. The previous assertTrue() received a
      # non-empty formatted string and therefore could never fail.
      self.assertIn(
          '{}: {}'.format(counter_name, n), line,
          msg='Failed to find expected counter {} in line {}'.format(
              counter_name, line)
      )
Exemplo n.º 30
0
  def test_pipeline_generation(self):
    """Check that the 'simple' external transform is expanded only in the
    pipeline rebuilt from the runner-API proto."""
    original = beam.Pipeline()
    letters = original | beam.Create(['a', 'b'])
    wrapped = letters | beam.ExternalTransform(
        'simple', None, expansion_service.ExpansionServiceServicer())
    assert_that(wrapped, equal_to(['Simple(a)', 'Simple(b)']))

    proto, _ = original.to_runner_api(return_context=True)
    rebuilt = Pipeline.from_runner_api(
        proto, original.runner, original._options)

    # Before the round trip the external step has no sub-transforms.
    self.assertEqual([], original.transforms_stack[0].parts[1].parts)

    # After the round trip the external step has been expanded.
    expanded_parts = rebuilt.transforms_stack[0].parts[1].parts
    self.assertNotEqual([], expanded_parts)
    self.assertEqual(
        u'ExternalTransform(simple)/TestLabel',
        expanded_parts[0].full_label)
Exemplo n.º 31
0
        def test_metrics(self):
            """Check the element counter and state-cache metrics of a
            stateful DoFn.

            The DoFn reads and appends to a bag state for every element and
            increments a counter. Once the pipeline has finished, the file at
            ``self.test_metrics_path`` is scanned and every expected metric
            substring must have been found in it.
            """
            counter_name = 'elem_counter'
            state_spec = userstate.BagStateSpec('state', VarIntCoder())

            class DoFn(beam.DoFn):
                def __init__(self):
                    self.counter = Metrics.counter(
                        self.__class__, counter_name)
                    _LOGGER.info('counter: %s' % self.counter.metric_name)

                def process(self, kv, state=beam.DoFn.StateParam(state_spec)):
                    # Reading the state forces it to be materialized.
                    list(state.read())
                    state.add(1)
                    self.counter.inc()

            options = self.create_options()
            option_overrides = (
                # The test only supports a parallelism of 1.
                ('parallelism', 1),
                # Small bundles, so that several bundles exercise the cache.
                ('max_bundle_size', 10),
                ('max_bundle_time_millis', 95130590130),
            )
            for option_name, option_value in option_overrides:
                options._all_options[option_name] = option_value

            debug_experiments = options.view_as(DebugOptions).experiments or []
            debug_experiments.append('state_cache_size=123')
            options.view_as(DebugOptions).experiments = debug_experiments

            with Pipeline(self.get_runner(), options) as p:
                # pylint: disable=expression-not-assigned
                (p
                 | "create" >> beam.Create(list(range(0, 110)))
                 | "mapper" >> beam.Map(lambda x: (x % 10, 'val'))
                 | "stateful" >> beam.ParDo(DoFn()))

            # `streaming` is not defined in this method; it is resolved from
            # an enclosing scope.
            if streaming:
                cache_metrics = [
                    # Gauges reported for the last finished bundle.
                    'stateful.beam.metric:statecache:capacity: 123',
                    # Off by 10: the first bundle sees every key once, and
                    # caching only starts after that first bundle because the
                    # cache token is lazily initialized by the Runner's
                    # StateRequestHandlers.
                    'stateful.beam.metric:statecache:size: 10',
                    'stateful.beam.metric:statecache:get: 10',
                    'stateful.beam.metric:statecache:miss: 0',
                    'stateful.beam.metric:statecache:hit: 10',
                    'stateful.beam.metric:statecache:put: 0',
                    'stateful.beam.metric:statecache:extend: 10',
                    'stateful.beam.metric:statecache:evict: 0',
                    # Counters. The get/hit totals are off by 10 for the same
                    # reason: the cache is only initialized after the first
                    # bundle (lazy cache token, see above).
                    'stateful.beam.metric:statecache:get_total: 100',
                    'stateful.beam.metric:statecache:miss_total: 10',
                    'stateful.beam.metric:statecache:hit_total: 90',
                    'stateful.beam.metric:statecache:put_total: 10',
                    'stateful.beam.metric:statecache:extend_total: 100',
                    'stateful.beam.metric:statecache:evict_total: 0',
                ]
            else:
                # Batch uses a different processing model: all values of a
                # key are processed at once.
                cache_metrics = [
                    # Gauges
                    'stateful).beam.metric:statecache:capacity: 123',
                    # The cache token is not yet set for the first key; it is
                    # lazily initialized after the first access in
                    # StateRequestHandlers.
                    'stateful).beam.metric:statecache:size: 9',
                    # 11 because each key has 110 / 10 elements.
                    'stateful).beam.metric:statecache:get: 11',
                    'stateful).beam.metric:statecache:miss: 1',
                    'stateful).beam.metric:statecache:hit: 10',
                    # State is flushed back once per key.
                    'stateful).beam.metric:statecache:put: 1',
                    'stateful).beam.metric:statecache:extend: 1',
                    'stateful).beam.metric:statecache:evict: 0',
                    # Counters
                    'stateful).beam.metric:statecache:get_total: 99',
                    'stateful).beam.metric:statecache:miss_total: 9',
                    'stateful).beam.metric:statecache:hit_total: 90',
                    'stateful).beam.metric:statecache:put_total: 9',
                    'stateful).beam.metric:statecache:extend_total: 9',
                    'stateful).beam.metric:statecache:evict_total: 0',
                ]
            lines_expected = {'counter: 110'}
            lines_expected.update(cache_metrics)

            # Collect every expected metric string that occurs in some line
            # of the metrics file.
            lines_actual = set()
            with open(self.test_metrics_path, 'r') as metrics_file:
                for line in metrics_file:
                    lines_actual.update(
                        metric for metric in lines_expected if metric in line)
            self.assertSetEqual(lines_actual, lines_expected)
Exemplo n.º 32
0
 def _create_pipeline(self, options):
     """Build a ``Pipeline`` configured with the given options.

     Args:
         options: Options object passed as ``Pipeline(options=...)``.

     Returns:
         The newly constructed ``Pipeline``.
     """
     pipeline = Pipeline(options=options)
     return pipeline
Exemplo n.º 33
0
def run(argv=None):
    """Run ``MysqlDoFn`` once, triggered by a single ``Impulse``.

    Args:
        argv: Optional command-line flags forwarded to ``PipelineOptions``.
    """
    pipeline_options = PipelineOptions(argv)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Leaving the `with` block runs the pipeline.
    with Pipeline(options=pipeline_options) as pipeline:
        trigger = pipeline | beam.Impulse()
        _ = trigger | beam.ParDo(MysqlDoFn())