    def test_create_test_pipeline_options(self):
        test_pipeline = TestPipeline(argv=self.TEST_CASE['options'])
        test_options = PipelineOptions(test_pipeline.get_full_options_as_args())
        self.assertDictContainsSubset(self.TEST_CASE['expected_dict'],
                                      test_options.get_all_options())

    def test_validate_dataflow_job_file(self):
        runner = MockRunners.OtherRunner()
        options = PipelineOptions(['--dataflow_job_file', 'abc'])
        validator = PipelineOptionsValidator(options, runner)
        errors = validator.validate()
        self.assertFalse(errors)
Example No. 3
def _parse_args():
    """Parse the command line arguments."""
    parser = argparse.ArgumentParser()
    # TODO(user): consider using an "action=append"-style argparse flag.
    parser.add_argument(
        "--input_file_patterns",
        dest="input_file_patterns",
        required=True,
        help=(
            "The input data files or file patterns for batch prediction. Use "
            "%s to separate multiple files/patterns" %
            batch_prediction_pipeline.FILE_LIST_SEPARATOR))

    parser.add_argument("--input_file_format",
                        dest="input_file_format",
                        default="text",
                        choices=FILE_FORMAT_SUPPORTED,
                        help=("The input file format for batch prediction. "
                              "Supported format: %s" % FILE_FORMAT_SUPPORTED))

    parser.add_argument("--output_location",
                        dest="output_location",
                        required=True,
                        help="Output path to save the prediction results.")

    parser.add_argument(
        "--model_dir",
        dest="model_dir",
        required=True,
        help=("The path to the model where the tensorflow meta graph "
              "proto and checkpoint files are saved. Normally, it is "
              "the exported directory by session_bundle library."))

    parser.add_argument(
        "--batch_size",
        dest="batch_size",
        type=int,
        default=64,
        help=(
            "Number of records in one batch in the input data. All items in "
            "the same batch are fed into the TF session together, so only "
            "one Session.Run() is invoked per batch. If the batch_size "
            "has been embedded in the graph, the flag must match that value. "
            "If the first dim of the input tensors is None, any batch size "
            "can be used, so any int value may be given here. If no batch "
            "size is specified in the graph, the flag must be set to 1; "
            "otherwise, the program will report a shape mismatch error."))

    parser.add_argument(
        "--user_project_id",
        dest="user_project_id",
        help="User's project id. It can be a different project from the one "
        "run the Dataflow job. The logs are sent to this project.")

    parser.add_argument(
        "--user_job_id",
        dest="user_job_id",
        help=(
            "User's CloudML job id. It is not the job id of the Dataflow job. "
            "The logs are sent to the user's project, with the job id as a "
            "label."))

    known_args, pipeline_args = parser.parse_known_args(sys.argv[1:])
    pipeline_options = PipelineOptions(flags=pipeline_args)

    return known_args, pipeline_options
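
A minimal sketch (not part of the original module) of how _parse_args splits flags between argparse and PipelineOptions; the paths and runner name below are hypothetical placeholders:

import sys

# Hypothetical command line; flags argparse does not recognize (e.g. --runner)
# are handed through to PipelineOptions by parse_known_args.
sys.argv = [
    'batch_prediction_main.py',
    '--input_file_patterns', 'gs://my-bucket/input/part-*',
    '--output_location', 'gs://my-bucket/output/',
    '--model_dir', 'gs://my-bucket/model/',
    '--batch_size', '32',
    '--runner', 'DirectRunner',
]
known_args, pipeline_options = _parse_args()
print(known_args.batch_size)                          # 32 (converted by type=int)
print(pipeline_options.get_all_options()['runner'])   # 'DirectRunner'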
Example No. 4
    def __init__(self, runner=None, options=None, argv=None):
        """Initialize a pipeline object.

    Args:
      runner: An object of type 'PipelineRunner' that will be used to execute
        the pipeline. For registered runners, the runner name can be specified,
        otherwise a runner object must be supplied.
      options: A configured 'PipelineOptions' object containing arguments
        that should be used for running the Dataflow job.
      argv: a list of arguments (such as sys.argv) to be used for building a
        'PipelineOptions' object. This will only be used if argument 'options'
        is None.

    Raises:
      ValueError: if either the runner or options argument is not of the
      expected type.
    """

        if options is not None:
            if isinstance(options, PipelineOptions):
                self.options = options
            else:
                raise ValueError(
                    'Parameter options, if specified, must be of type PipelineOptions. '
                    'Received : %r' % options)
        elif argv is not None:
            if isinstance(argv, list):
                self.options = PipelineOptions(argv)
            else:
                raise ValueError(
                    'Parameter argv, if specified, must be a list. Received : %r'
                    % argv)
        else:
            self.options = PipelineOptions([])

        if runner is None:
            runner = self.options.view_as(StandardOptions).runner
            if runner is None:
                runner = StandardOptions.DEFAULT_RUNNER
                logging.info(
                    ('Missing pipeline option (runner). Executing pipeline '
                     'using the default runner: %s.'), runner)

        if isinstance(runner, str):
            runner = create_runner(runner)
        elif not isinstance(runner, PipelineRunner):
            raise TypeError('Runner must be a PipelineRunner object or the '
                            'name of a registered runner.')

        # Validate pipeline options
        errors = PipelineOptionsValidator(self.options, runner).validate()
        if errors:
            raise ValueError('Pipeline has validation errors: \n' +
                             '\n'.join(errors))

        # Default runner to be used.
        self.runner = runner
        # Stack of transforms generated by nested apply() calls. The stack will
        # contain a root node as an enclosing (parent) node for top transforms.
        self.transforms_stack = [AppliedPTransform(None, None, '', None)]
        # Set of transform labels (full labels) applied to the pipeline.
        # If a transform is applied and the full label is already in the set
        # then the transform will have to be cloned with a new label.
        self.applied_labels = set()
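
A short usage sketch, not from the original source, of the three construction paths the docstring above describes; it assumes 'DirectRunner' is a registered runner name:

# 1) An explicit PipelineOptions object plus a registered runner name:
p1 = Pipeline('DirectRunner', options=PipelineOptions([]))

# 2) A raw argv list; __init__ wraps it in a PipelineOptions itself and reads
#    the runner from --runner:
p2 = Pipeline(argv=['--runner', 'DirectRunner'])

# 3) Neither options nor argv: empty options are created and the runner falls
#    back to StandardOptions.DEFAULT_RUNNER (logged at INFO level).
p3 = Pipeline()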
Example No. 5
    def test_dataflow_job_file(self):
        options = PipelineOptions(['--dataflow_job_file', 'abc'])
        self.assertEqual(options.get_all_options()['dataflow_job_file'], 'abc')

        options = PipelineOptions(flags=[''])
        self.assertEqual(options.get_all_options()['dataflow_job_file'], None)
Example No. 6
    def test_template_location(self):
        options = PipelineOptions(['--template_location', 'abc'])
        self.assertEqual(options.get_all_options()['template_location'], 'abc')

        options = PipelineOptions(flags=[''])
        self.assertEqual(options.get_all_options()['template_location'], None)
Example No. 7
    def test_display_data(self):
        for case in PipelineOptionsTest.TEST_CASES:
            options = PipelineOptions(flags=case['flags'])
            dd = DisplayData.create_from(options)
            hc.assert_that(dd.items,
                           hc.contains_inanyorder(*case['display_data']))
Example No. 8
    def test_remote_runner_display_data(self):
        remote_runner = DataflowRunner()
        p = Pipeline(remote_runner,
                     options=PipelineOptions(self.default_properties))

        # TODO: Should not subclass ParDo. Switch to PTransform as soon as
        # composite transforms support display data.
        class SpecialParDo(beam.ParDo):
            def __init__(self, fn, now):
                super(SpecialParDo, self).__init__(fn)
                self.fn = fn
                self.now = now

            def display_data(self):
                return {
                    'asubcomponent': self.fn,
                    'a_class': SpecialParDo,
                    'a_time': self.now
                }

        class SpecialDoFn(beam.NewDoFn):
            def display_data(self):
                return {'dofn_value': 42}

            def process(self):
                pass

        now = datetime.now()
        # pylint: disable=expression-not-assigned
        (p | ptransform.Create([1, 2, 3, 4, 5])
         | 'Do' >> SpecialParDo(SpecialDoFn(), now))

        remote_runner.job = apiclient.Job(p.options)
        super(DataflowRunner, remote_runner).run(p)
        job_dict = json.loads(str(remote_runner.job))
        steps = [
            step for step in job_dict['steps']
            if len(step['properties'].get('display_data', [])) > 0
        ]
        step = steps[0]
        disp_data = step['properties']['display_data']
        disp_data = sorted(disp_data, key=lambda x: x['namespace'] + x['key'])
        nspace = SpecialParDo.__module__ + '.'
        expected_data = [{
            'type': 'TIMESTAMP',
            'namespace': nspace + 'SpecialParDo',
            'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
            'key': 'a_time'
        }, {
            'type': 'STRING',
            'namespace': nspace + 'SpecialParDo',
            'value': nspace + 'SpecialParDo',
            'key': 'a_class',
            'shortValue': 'SpecialParDo'
        }, {
            'type': 'INTEGER',
            'namespace': nspace + 'SpecialDoFn',
            'value': 42,
            'key': 'dofn_value'
        }]
        expected_data = sorted(expected_data,
                               key=lambda x: x['namespace'] + x['key'])
        self.assertEqual(len(disp_data), 3)
        self.assertEqual(disp_data, expected_data)
Example No. 9
def model_custom_sink(simplekv, KVs, final_table_name_no_ptransform,
                      final_table_name_with_ptransform):
    """Demonstrates creating a new custom sink and using it in a pipeline.

  Defines a new sink ``SimpleKVSink`` that demonstrates writing to a simple
  key-value based storage system that has the following API.

    simplekv.connect(url) -
        connects to the storage system and returns an access token which can
        be used to perform further operations.
    simplekv.open_table(access_token, table_name) -
        creates a table named 'table_name'. Returns a table object.
    simplekv.write_to_table(access_token, table, key, value) -
        writes a key-value pair to the given table.
    simplekv.rename_table(access_token, old_name, new_name) -
        renames the table named 'old_name' to 'new_name'.

  Uses the new sink in an example pipeline.

  Additionally demonstrates how a sink should be implemented using a
  ``PTransform``. This is the recommended way to develop sinks that are to be
  distributed to a large number of end users.

  This method runs two pipelines.
  (1) A pipeline that uses ``SimpleKVSink`` directly with the ``beam.io.Write``
      transform.
  (2) A pipeline that uses a custom ``PTransform`` that wraps
      ``SimpleKVSink``.

  Args:
    simplekv: an object that mocks the key-value storage.
    KVs: the set of key-value pairs to be written in the example pipeline.
    final_table_name_no_ptransform: the prefix of final set of tables to be
                                    created by the example pipeline that uses
                                    ``SimpleKVSink`` directly.
    final_table_name_with_ptransform: the prefix of final set of tables to be
                                      created by the example pipeline that uses
                                      a ``PTransform`` that wraps
                                      ``SimpleKVSink``.
  """

    import apache_beam as beam
    from apache_beam.io import iobase
    from apache_beam.transforms.core import PTransform
    from apache_beam.utils.pipeline_options import PipelineOptions

    # Defining the new sink.
    # [START model_custom_sink_new_sink]
    class SimpleKVSink(iobase.Sink):
        def __init__(self, url, final_table_name):
            self._url = url
            self._final_table_name = final_table_name

        def initialize_write(self):
            access_token = simplekv.connect(self._url)
            return access_token

        def open_writer(self, access_token, uid):
            table_name = 'table' + uid
            return SimpleKVWriter(access_token, table_name)

        def finalize_write(self, access_token, table_names):
            for i, table_name in enumerate(table_names):
                simplekv.rename_table(access_token, table_name,
                                      self._final_table_name + str(i))

    # [END model_custom_sink_new_sink]

    # Defining a writer for the new sink.
    # [START model_custom_sink_new_writer]
    class SimpleKVWriter(iobase.Writer):
        def __init__(self, access_token, table_name):
            self._access_token = access_token
            self._table_name = table_name
            self._table = simplekv.open_table(access_token, table_name)

        def write(self, record):
            key, value = record

            simplekv.write_to_table(self._access_token, self._table, key,
                                    value)

        def close(self):
            return self._table_name

    # [END model_custom_sink_new_writer]

    final_table_name = final_table_name_no_ptransform

    # Using the new sink in an example pipeline.
    # [START model_custom_sink_use_new_sink]
    p = beam.Pipeline(options=PipelineOptions())
    kvs = p | beam.core.Create('CreateKVs', KVs)

    kvs | 'WriteToSimpleKV' >> beam.io.Write(
        SimpleKVSink('http://url_to_simple_kv/', final_table_name))
    # [END model_custom_sink_use_new_sink]

    p.run().wait_until_finish()

    # We recommend that users start Sink class names with an underscore to
    # discourage using the Sink class directly when a PTransform for the sink
    # is available. We simulate that here by simply extending the previous
    # Sink class.
    class _SimpleKVSink(SimpleKVSink):
        pass

    # [START model_custom_sink_new_ptransform]
    class WriteToKVSink(PTransform):
        def __init__(self, label, url, final_table_name, **kwargs):
            super(WriteToKVSink, self).__init__(label, **kwargs)
            self._url = url
            self._final_table_name = final_table_name

        def expand(self, pcoll):
            return pcoll | iobase.Write(
                _SimpleKVSink(self._url, self._final_table_name))

    # [END model_custom_sink_new_ptransform]

    final_table_name = final_table_name_with_ptransform

    # [START model_custom_sink_use_ptransform]
    p = beam.Pipeline(options=PipelineOptions())
    kvs = p | 'CreateKVs' >> beam.core.Create(KVs)
    kvs | WriteToKVSink('WriteToSimpleKV', 'http://url_to_simple_kv/',
                        final_table_name)
    # [END model_custom_sink_use_ptransform]

    p.run().wait_until_finish()
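
The simplekv argument is only described by its interface in the docstring above. A minimal in-memory stand-in is sketched below; it is purely hypothetical (the original tests supply their own mock) and only suitable for in-process runners, since a distributed runner may pickle and ship the sink to workers:

class FakeSimpleKV(object):
    """Hypothetical in-memory stand-in for the key-value store interface."""

    def __init__(self):
        # Maps access token -> {table_name: {key: value}}.
        self._stores = {}

    def connect(self, url):
        token = 'token-for-' + url
        self._stores.setdefault(token, {})
        return token

    def open_table(self, access_token, table_name):
        # Creates the table if needed and returns it as a plain dict.
        return self._stores[access_token].setdefault(table_name, {})

    def write_to_table(self, access_token, table, key, value):
        table[key] = value

    def rename_table(self, access_token, old_name, new_name):
        tables = self._stores[access_token]
        tables[new_name] = tables.pop(old_name)


# Hypothetical invocation of the function above:
# model_custom_sink(FakeSimpleKV(), [('k1', 'v1'), ('k2', 'v2')],
#                   'final_table_no_pt', 'final_table_with_pt')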
Example No. 10
def model_custom_source(count):
    """Demonstrates creating a new custom source and using it in a pipeline.

  Defines a new source ``CountingSource`` that produces integers starting from 0
  up to a given size.

  Uses the new source in an example pipeline.

  Additionally demonstrates how a source should be implemented using a
  ``PTransform``. This is the recommended way to develop sources that are to be
  distributed to a large number of end users.

  This method runs two pipelines.
  (1) A pipeline that uses ``CountingSource`` directly with the ``beam.io.Read``
      transform.
  (2) A pipeline that uses a custom ``PTransform`` that wraps
      ``CountingSource``.

  Args:
    count: the size of the counting source to be used in the pipeline
           demonstrated in this method.
  """

    import apache_beam as beam
    from apache_beam.io import iobase
    from apache_beam.io.range_trackers import OffsetRangeTracker
    from apache_beam.transforms.core import PTransform
    from apache_beam.utils.pipeline_options import PipelineOptions

    # Defining a new source.
    # [START model_custom_source_new_source]
    class CountingSource(iobase.BoundedSource):
        def __init__(self, count):
            self._count = count

        def estimate_size(self):
            return self._count

        def get_range_tracker(self, start_position, stop_position):
            if start_position is None:
                start_position = 0
            if stop_position is None:
                stop_position = self._count

            return OffsetRangeTracker(start_position, stop_position)

        def read(self, range_tracker):
            for i in range(self._count):
                if not range_tracker.try_claim(i):
                    return
                yield i

        def split(self,
                  desired_bundle_size,
                  start_position=None,
                  stop_position=None):
            if start_position is None:
                start_position = 0
            if stop_position is None:
                stop_position = self._count

            bundle_start = start_position
            while bundle_start < self._count:
                bundle_stop = min(self._count,
                                  bundle_start + desired_bundle_size)
                yield iobase.SourceBundle(weight=(bundle_stop - bundle_start),
                                          source=self,
                                          start_position=bundle_start,
                                          stop_position=bundle_stop)
                bundle_start = bundle_stop

    # [END model_custom_source_new_source]

    # Using the source in an example pipeline.
    # [START model_custom_source_use_new_source]
    p = beam.Pipeline(options=PipelineOptions())
    numbers = p | 'ProduceNumbers' >> beam.io.Read(CountingSource(count))
    # [END model_custom_source_use_new_source]

    lines = numbers | beam.core.Map(lambda number: 'line %d' % number)
    beam.assert_that(
        lines,
        beam.equal_to(['line ' + str(number) for number in range(0, count)]))

    p.run().wait_until_finish()

    # We recommend that users start Source class names with an underscore to
    # discourage using the Source class directly when a PTransform for the
    # source is available. We simulate that here by simply extending the
    # previous Source class.
    class _CountingSource(CountingSource):
        pass

    # [START model_custom_source_new_ptransform]
    class ReadFromCountingSource(PTransform):
        def __init__(self, count, **kwargs):
            super(ReadFromCountingSource, self).__init__(**kwargs)
            self._count = count

        def expand(self, pcoll):
            return pcoll | iobase.Read(_CountingSource(self._count))

    # [END model_custom_source_new_ptransform]

    # [START model_custom_source_use_ptransform]
    p = beam.Pipeline(options=PipelineOptions())
    numbers = p | 'ProduceNumbers' >> ReadFromCountingSource(count)
    # [END model_custom_source_use_ptransform]

    lines = numbers | beam.core.Map(lambda number: 'line %d' % number)
    beam.assert_that(
        lines,
        beam.equal_to(['line ' + str(number) for number in range(0, count)]))

    p.run().wait_until_finish()
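
A brief sketch, not part of the original example, of how a source like CountingSource behaves outside a pipeline; it assumes a copy of the class above is available at module scope (in the original it is defined locally inside model_custom_source):

# Read directly through a range tracker covering the full range [0, 5):
source = CountingSource(5)
tracker = source.get_range_tracker(None, None)   # OffsetRangeTracker(0, 5)
print(list(source.read(tracker)))                # [0, 1, 2, 3, 4]

# split() caps each bundle at desired_bundle_size records, so with count=5 and
# a desired bundle size of 2 it yields bundles covering [0, 2), [2, 4), [4, 5).
bundles = list(source.split(desired_bundle_size=2))
print([(b.start_position, b.stop_position) for b in bundles])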
Example No. 11
    def test_no_staging_location(self):
        with self.assertRaises(RuntimeError) as cm:
            dependency.stage_job_resources(PipelineOptions())
        self.assertEqual('The --staging_location option must be specified.',
                         cm.exception.message)