Пример #1
0
def pipeline_options_remote(argv):
  """Creating a Pipeline using a PipelineOptions object for remote execution.

  This function is a documentation snippet. The `[START ...]` / `[END ...]`
  comment pairs delimit named regions (presumably extracted verbatim into the
  page linked below -- confirm before editing inside them). Because each
  region is self-contained, `options` and `p` are assigned more than once.

  After the regions, the runner is switched to 'DirectPipelineRunner' and a
  pipeline that reads text from the `--input` location and writes it to the
  `--output` location is built and run.

  URL: https://cloud.google.com/dataflow/pipelines/specifying-exec-params

  Args:
    argv: Command-line flags handed to `PipelineOptions(flags=...)`. The
      custom `--input` and `--output` values are read back through the
      `MyOptions` view of those options.
  """

  from google.cloud.dataflow import Pipeline
  from google.cloud.dataflow.utils.options import PipelineOptions

  # [START pipeline_options_create]
  options = PipelineOptions(flags=argv)
  # [END pipeline_options_create]

  # [START pipeline_options_define_custom]
  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input')
      parser.add_argument('--output')
  # [END pipeline_options_define_custom]

  from google.cloud.dataflow.utils.options import GoogleCloudOptions
  from google.cloud.dataflow.utils.options import StandardOptions

  # [START pipeline_options_dataflow_service]
  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=argv)

  # For Cloud execution, set the Cloud Platform project, job_name,
  # staging location, temp_location and specify DataflowPipelineRunner or
  # BlockingDataflowPipelineRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://my-bucket/binaries'
  google_cloud_options.temp_location = 'gs://my-bucket/temp'
  options.view_as(StandardOptions).runner = 'DataflowPipelineRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)
  # [END pipeline_options_dataflow_service]

  # Read the custom --input / --output values through the MyOptions view of
  # the options built from argv above.
  my_options = options.view_as(MyOptions)
  my_input = my_options.input
  my_output = my_options.output

  # Overriding the runner for tests.
  # Rebinding `p` here discards the pipeline created inside the
  # pipeline_options_dataflow_service region; only this one is run.
  options.view_as(StandardOptions).runner = 'DirectPipelineRunner'
  p = Pipeline(options=options)

  # NOTE(review): `df` is not bound inside this function; presumably it is a
  # module-level alias for google.cloud.dataflow -- confirm.
  lines = p | df.io.Read('ReadFromText', df.io.TextFileSource(my_input))
  lines | df.io.Write('WriteToText', df.io.TextFileSink(my_output))

  p.run()
Пример #2
0
def pipeline_options_local(argv):
  """Creating a Pipeline using a PipelineOptions object for local execution.

  This function is a documentation snippet. The `[START ...]` / `[END ...]`
  comment pairs delimit named regions (presumably extracted verbatim into the
  page linked below -- confirm before editing inside them).

  The custom `--input` / `--output` values are read from options built from
  `argv`. The pipeline itself is created from a fresh `PipelineOptions()`
  that receives no flags and has no runner set on it in this function.

  URL: https://cloud.google.com/dataflow/pipelines/specifying-exec-params

  Args:
    argv: Command-line flags handed to `PipelineOptions(flags=...)`. When
      `--input` or `--output` is absent, the defaults declared in `MyOptions`
      ('gs://my-bucket/input' and 'gs://my-bucket/output') are the declared
      fallbacks.
  """

  from google.cloud.dataflow import Pipeline
  from google.cloud.dataflow.utils.options import PipelineOptions

  options = PipelineOptions(flags=argv)

  # [START pipeline_options_define_custom_with_help_and_default]
  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the dataflow pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='Output for the dataflow pipeline',
                          default='gs://my-bucket/output')
  # [END pipeline_options_define_custom_with_help_and_default]

  # Take the MyOptions view now: `options` is rebound to a flag-less
  # PipelineOptions() in the pipeline_options_local region below.
  my_options = options.view_as(MyOptions)

  my_input = my_options.input
  my_output = my_options.output

  # [START pipeline_options_local]
  # Create and set your Pipeline Options.
  options = PipelineOptions()
  p = Pipeline(options=options)
  # [END pipeline_options_local]

  # NOTE(review): `df` is not bound inside this function; presumably it is a
  # module-level alias for google.cloud.dataflow -- confirm.
  lines = p | df.io.Read('ReadFromText', df.io.TextFileSource(my_input))
  lines | df.io.Write('WriteToText', df.io.TextFileSink(my_output))
  p.run()