Example #1
 def test_serialization(self):
     context = pipeline_context.PipelineContext()
     float_coder_ref = context.coders.get_id(coders.FloatCoder())
     bytes_coder_ref = context.coders.get_id(coders.BytesCoder())
     proto = context.to_runner_api()
     context2 = pipeline_context.PipelineContext.from_runner_api(proto)
     self.assertEqual(coders.FloatCoder(),
                      context2.coders.get_by_id(float_coder_ref))
     self.assertEqual(coders.BytesCoder(),
                      context2.coders.get_by_id(bytes_coder_ref))
Example #2
  def test_common_id_assignment(self):
    context = pipeline_context.PipelineContext()
    float_coder_ref = context.coders.get_id(coders.FloatCoder())
    bytes_coder_ref = context.coders.get_id(coders.BytesCoder())
    context2 = pipeline_context.PipelineContext(
        component_id_map=context.component_id_map)

    bytes_coder_ref2 = context2.coders.get_id(coders.BytesCoder())
    float_coder_ref2 = context2.coders.get_id(coders.FloatCoder())

    self.assertEqual(bytes_coder_ref, bytes_coder_ref2)
    self.assertEqual(float_coder_ref, float_coder_ref2)
Example #3
    def test_str_utf8_coder(self):
        real_coder = coders.registry.get_coder(str)
        expected_coder = coders.BytesCoder()
        self.assertEqual(real_coder.encode('abc'),
                         expected_coder.encode('abc'))
        self.assertEqual('abc', real_coder.decode(real_coder.encode('abc')))

        real_coder = coders.registry.get_coder(bytes)
        expected_coder = coders.BytesCoder()
        self.assertEqual(real_coder.encode(b'abc'),
                         expected_coder.encode(b'abc'))
        self.assertEqual(b'abc', real_coder.decode(real_coder.encode(b'abc')))
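As a quick reference, here is a minimal sketch (assuming a Python 3 Beam installation) of the round trip the bytes branch above exercises; BytesCoder passes bytes values through essentially unchanged:

    from apache_beam import coders

    coder = coders.BytesCoder()
    encoded = coder.encode(b'abc')   # raw bytes in, raw bytes out
    assert coder.decode(encoded) == b'abc'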
Example #4
 def __init__(self, topic=None, subscription=None, id_label=None):
     # We use this coder explicitly so that PubsubIO stays portable across
     # language implementations.
     self.coder = coders.BytesCoder()
     self.topic = topic
     self.subscription = subscription
     self.id_label = id_label
Example #5
    def __init__(self, topic):
        # We use this coder explicitly so that PubsubIO stays portable across
        # language implementations.
        self.coder = coders.BytesCoder()
        self.full_topic = topic

        self.project, self.topic_name = parse_topic(topic)
Example #6
    def __init__(self,
                 coder=coders.BytesCoder(),
                 compression_type=CompressionTypes.AUTO,
                 with_filename=False):
        """Initialize the ``ReadAllFromTFRecord`` transform.

    Args:
      coder: Coder used to decode each record.
      compression_type: Used to handle compressed input files. Default value
          is CompressionTypes.AUTO, in which case the file_path's extension will
          be used to detect the compression.
      with_filename: If True, returns a Key Value with the key being the file
        name and the value being the actual data. If False, it only returns
        the data.
    """
        super(ReadAllFromTFRecord, self).__init__()
        source_from_file = partial(_create_tfrecordio_source,
                                   compression_type=compression_type,
                                   coder=coder)
        # Desired and min bundle sizes do not matter since TFRecord files are
        # unsplittable.
        self._read_all_files = ReadAllFiles(splittable=False,
                                            compression_type=compression_type,
                                            desired_bundle_size=0,
                                            min_bundle_size=0,
                                            source_from_file=source_from_file,
                                            with_filename=with_filename)
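A hedged usage sketch of the transform above: the file patterns arrive as an upstream PCollection, so ReadAllFromTFRecord takes no pattern argument itself (the glob below is purely illustrative):

    import apache_beam as beam
    from apache_beam import coders
    from apache_beam.io.filesystem import CompressionTypes
    from apache_beam.io.tfrecordio import ReadAllFromTFRecord

    with beam.Pipeline() as p:
        records = (p
                   | beam.Create(['/tmp/data/*.tfrecord'])  # hypothetical glob
                   | ReadAllFromTFRecord(coder=coders.BytesCoder(),
                                         compression_type=CompressionTypes.AUTO))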
Example #7
def get_coder_from_spec(coder_spec):
  """Return a coder instance from a coder spec.

  Args:
    coder_spec: A dict where the value of the '@type' key is a pickled instance
      of a Coder instance.

  Returns:
    A coder instance (has encode/decode methods).
  """
  assert coder_spec is not None

  # Ignore the wrappers in these encodings.
  ignored_wrappers = (
      'com.google.cloud.dataflow.sdk.util.TimerOrElement$TimerOrElementCoder')
  if coder_spec['@type'] in ignored_wrappers:
    assert len(coder_spec['component_encodings']) == 1
    coder_spec = coder_spec['component_encodings'][0]
    return get_coder_from_spec(coder_spec)

  # Handle a few well known types of coders.
  if coder_spec['@type'] == 'kind:pair':
    assert len(coder_spec['component_encodings']) == 2
    component_coders = [
        get_coder_from_spec(c) for c in coder_spec['component_encodings']]
    return coders.TupleCoder(component_coders)
  elif coder_spec['@type'] == 'kind:stream':
    assert len(coder_spec['component_encodings']) == 1
    return coders.IterableCoder(
        get_coder_from_spec(coder_spec['component_encodings'][0]))
  elif coder_spec['@type'] == 'kind:windowed_value':
    assert len(coder_spec['component_encodings']) == 2
    value_coder, window_coder = [
        get_coder_from_spec(c) for c in coder_spec['component_encodings']]
    return coders.coders.WindowedValueCoder(
        value_coder, window_coder=window_coder)
  elif coder_spec['@type'] == 'kind:interval_window':
    assert ('component_encodings' not in coder_spec
            or not coder_spec['component_encodings'])
    return coders.coders.IntervalWindowCoder()
  elif coder_spec['@type'] == 'kind:global_window':
    assert ('component_encodings' not in coder_spec
            or not coder_spec['component_encodings'])
    return coders.coders.GlobalWindowCoder()
  elif coder_spec['@type'] == 'kind:varint':
    assert ('component_encodings' not in coder_spec
            or len(coder_spec['component_encodings']) == 0)
    return coders.coders.VarIntCoder()
  elif coder_spec['@type'] == 'kind:length_prefix':
    assert len(coder_spec['component_encodings']) == 1
    return coders.coders.LengthPrefixCoder(
        get_coder_from_spec(coder_spec['component_encodings'][0]))
  elif coder_spec['@type'] == 'kind:bytes':
    assert ('component_encodings' not in coder_spec
            or len(coder_spec['component_encodings']) == 0)
    return coders.BytesCoder()

  # We pass coders in the form "<coder_name>$<pickled_data>" to make the job
  # description JSON more readable.
  return coders.coders.deserialize_coder(coder_spec['@type'])
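A small illustrative call against the function above; the spec dicts mirror the 'kind:' branches it handles:

    bytes_spec = {'@type': 'kind:bytes'}
    pair_spec = {'@type': 'kind:pair',
                 'component_encodings': [bytes_spec, bytes_spec]}

    # Expected to yield a TupleCoder wrapping two BytesCoder components.
    kv_coder = get_coder_from_spec(pair_spec)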
Example #8
  def __init__(self,
               file_path_prefix,
               coder=coders.BytesCoder(),
               file_name_suffix='',
               num_shards=0,
               shard_name_template=None,
               compression_type=CompressionTypes.AUTO):
    """Initialize WriteToTFRecord transform.

    Args:
      file_path_prefix: The file path to write to. The files written will begin
        with this prefix, followed by a shard identifier (see num_shards), and
        end in a common extension, if given by file_name_suffix.
      coder: Coder used to encode each record.
      file_name_suffix: Suffix for the files written.
      num_shards: The number of files (shards) used for output. If not set, the
        default value will be used.
      shard_name_template: A template string containing placeholders for
        the shard number and shard count. When constructing a filename for a
        particular shard number, the upper-case letters 'S' and 'N' are
        replaced with the 0-padded shard number and shard count respectively.
        This argument can be '' in which case it behaves as if num_shards was
        set to 1 and only one file will be generated. The default pattern used
        is '-SSSSS-of-NNNNN' if None is passed as the shard_name_template.
      compression_type: Used to handle compressed output files. Typical value
          is CompressionTypes.AUTO, in which case the file_path's extension will
          be used to detect the compression.

    Returns:
      A WriteToTFRecord transform object.
    """
    super(WriteToTFRecord, self).__init__()
    self._sink = _TFRecordSink(file_path_prefix, coder, file_name_suffix,
                               num_shards, shard_name_template,
                               compression_type)
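A minimal, illustrative write pipeline built on the constructor above (the output prefix and record values are placeholders):

    import apache_beam as beam
    from apache_beam import coders
    from apache_beam.io.tfrecordio import WriteToTFRecord

    with beam.Pipeline() as p:
        _ = (p
             | beam.Create([b'foo', b'bar'])
             | WriteToTFRecord('/tmp/output/records',  # hypothetical prefix
                               coder=coders.BytesCoder(),
                               file_name_suffix='.tfrecord'))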
Example #9
 def test_deduplication_by_proto(self):
   context = pipeline_context.PipelineContext()
   bytes_coder_proto = coders.BytesCoder().to_runner_api(None)
   bytes_coder_ref = context.coders.get_by_proto(bytes_coder_proto)
   bytes_coder_ref2 = context.coders.get_by_proto(
       bytes_coder_proto, deduplicate=True)
   self.assertEqual(bytes_coder_ref, bytes_coder_ref2)
Example #10
    def __init__(self,
                 topic=None,
                 subscription=None,
                 id_label=None,
                 with_attributes=False,
                 timestamp_attribute=None):
        # We use this coder explicitly so that PubsubIO stays portable across
        # language implementations.
        self.coder = coders.BytesCoder()
        self.full_topic = topic
        self.full_subscription = subscription
        self.topic_name = None
        self.subscription_name = None
        self.id_label = id_label
        self.with_attributes = with_attributes
        self.timestamp_attribute = timestamp_attribute

        # Perform some validation on the topic and subscription.
        if not (topic or subscription):
            raise ValueError(
                'Either a topic or subscription must be provided.')
        if topic and subscription:
            raise ValueError(
                'Only one of topic or subscription should be provided.')

        if topic:
            self.project, self.topic_name = parse_topic(topic)
        if subscription:
            self.project, self.subscription_name = parse_subscription(
                subscription)
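The validation above expects fully qualified Cloud Pub/Sub resource paths; a hedged sketch of the accepted shapes (project and resource names are placeholders):

    # Pass exactly one of these to the constructor above.
    topic = 'projects/my-project/topics/my-topic'
    subscription = 'projects/my-project/subscriptions/my-subscription'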
Example #11
    def __init__(self,
                 file_pattern,
                 coder=coders.BytesCoder(),
                 compression_type=CompressionTypes.AUTO,
                 validate=True,
                 **kwargs):
        """Initialize a ReadFromTFRecord transform.

    Args:
      file_pattern: A file glob pattern to read TFRecords from.
      coder: Coder used to decode each record.
      compression_type: Used to handle compressed input files. Default value
          is CompressionTypes.AUTO, in which case the file_path's extension will
          be used to detect the compression.
      validate: Boolean flag to verify that the files exist during the pipeline
          creation time.
      **kwargs: optional args dictionary. These are passed through to parent
        constructor.

    Returns:
      A ReadFromTFRecord transform object.
    """
        super(ReadFromTFRecord, self).__init__(**kwargs)
        self._source = _TFRecordSource(file_pattern, coder, compression_type,
                                       validate)
Example #12
    def __init__(
            self,
            topic=None,  # type: Optional[str]
            subscription=None,  # type: Optional[str]
            id_label=None,  # type: Optional[str]
            with_attributes=False,  # type: bool
            timestamp_attribute=None  # type: Optional[str]
    ):
        self.coder = coders.BytesCoder()
        self.full_topic = topic
        self.full_subscription = subscription
        self.topic_name = None
        self.subscription_name = None
        self.id_label = id_label
        self.with_attributes = with_attributes
        self.timestamp_attribute = timestamp_attribute

        # Perform some validation on the topic and subscription.
        if not (topic or subscription):
            raise ValueError(
                'Either a topic or subscription must be provided.')
        if topic and subscription:
            raise ValueError(
                'Only one of topic or subscription should be provided.')

        if topic:
            self.project, self.topic_name = parse_topic(topic)
        if subscription:
            self.project, self.subscription_name = parse_subscription(
                subscription)
Example #13
 def __init__(self, components, known_runner_urns, use_state_iterables=False):
   self.components = components
   self.known_runner_urns = known_runner_urns
   self.use_state_iterables = use_state_iterables
   self.safe_coders = {}
   self.bytes_coder_id = self.add_or_get_coder_id(
       coders.BytesCoder().to_runner_api(None), 'bytes_coder')
Example #14
    def __init__(self,
                 coder=coders.BytesCoder(),
                 compression_type=CompressionTypes.AUTO,
                 **kwargs):
        """Initialize the ``ReadAllFromTFRecord`` transform.

    Args:
      coder: Coder used to decode each record.
      compression_type: Used to handle compressed input files. Default value
          is CompressionTypes.AUTO, in which case the file_path's extension will
          be used to detect the compression.
      **kwargs: optional args dictionary. These are passed through to parent
        constructor.
    """
        super(ReadAllFromTFRecord, self).__init__(**kwargs)
        source_from_file = partial(_create_tfrecordio_source,
                                   compression_type=compression_type,
                                   coder=coder)
        # Desired and min bundle sizes do not matter since TFRecord files are
        # unsplittable.
        self._read_all_files = ReadAllFiles(splittable=False,
                                            compression_type=compression_type,
                                            desired_bundle_size=0,
                                            min_bundle_size=0,
                                            source_from_file=source_from_file)
Example #15
  def __init__(self, topic, id_label, with_attributes, timestamp_attribute):
    self.coder = coders.BytesCoder()
    self.full_topic = topic
    self.id_label = id_label
    self.with_attributes = with_attributes
    self.timestamp_attribute = timestamp_attribute

    self.project, self.topic_name = parse_topic(topic)
Example #16
def run(argv=None):
    """
    Main entry point, define and run the pipeline
    """
    parser = argparse.ArgumentParser(
        description='Run Apache Beam to process the logs')
    parser.add_argument('--input', dest='input', help='Input file to process')
    parser.add_argument('--output',
                        dest='output',
                        help='Output file to write results to')
    parser.add_argument(
        '--input_subscription',
        dest='input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
    parser.add_argument(
        '--output_table',
        dest='output_table',
        help=('BigQuery Table to write results to, with the form '
              '<PROJECT>:<DATASET>.<TABLE>'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True

    print('pipeline options:', pipeline_options)

    # Specification for table in BigQuery
    table_spec = known_args.output_table
    table_schema = ('host:STRING, utc_timestamp:TIMESTAMP, action:STRING, '
                    'uri:STRING, protocol:STRING, status:STRING, size:INTEGER')

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        if known_args.input_subscription:
            lines = (p
                     | ReadFromPubSub(
                         subscription=known_args.input_subscription
                     ).with_output_types(bytes))
        else:
            lines = (p
                     | ReadFromText(known_args.input,
                                    coder=coders.BytesCoder()))

        output = (lines | 'parse_filter' >> beam.ParDo(ParseAndFilterDoFn()))
        # | 'parse' >> (beam.Map(parse_one_record)))

        # output | WriteToText(known_args.output)
        output | WriteToBigQuery(
            table_spec,
            schema=table_schema,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
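A hedged sketch of how the entry point above might be invoked; the flag values are placeholders, and any runner options would travel through pipeline_args:

    if __name__ == '__main__':
        # Streaming path: read from Pub/Sub and append to BigQuery.
        run([
            '--input_subscription=projects/my-project/subscriptions/my-subscription',
            '--output_table=my-project:my_dataset.my_table',
        ])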
Example #17
 def test_process_single(self):
     path = os.path.join(self._new_tempdir(), 'result')
     self._write_file(path, FOO_RECORD_BASE64)
     with TestPipeline() as p:
         result = (p
                   | beam.Read(
                       _TFRecordSource(
                           path,
                           coder=coders.BytesCoder(),
                           compression_type=fileio.CompressionTypes.AUTO)))
         beam.assert_that(result, beam.equal_to(['foo']))
Example #18
 def test_process_auto(self):
     path = os.path.join(self._new_tempdir(), 'result.gz')
     self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
     with beam.Pipeline(DirectRunner()) as p:
         result = (p
                   | beam.Read(
                       _TFRecordSource(
                           path,
                           coder=coders.BytesCoder(),
                           compression_type=fileio.CompressionTypes.AUTO)))
         beam.assert_that(result, beam.equal_to(['foo', 'bar']))
Example #19
 def test_process_glob(self):
   with TempDir() as temp_dir:
     self._write_glob(temp_dir, 'result')
     glob = temp_dir.get_path() + os.path.sep + '*result'
     with TestPipeline() as p:
       result = (p
                 | Create([glob])
                 | ReadAllFromTFRecord(
                     coder=coders.BytesCoder(),
                     compression_type=CompressionTypes.AUTO))
       assert_that(result, equal_to([b'foo', b'bar'] * 3))
Example #20
 def test_process_auto(self):
   with TempDir() as temp_dir:
     path = temp_dir.create_temp_file('result.gz')
     _write_file_gzip(path, FOO_BAR_RECORD_BASE64)
     with TestPipeline() as p:
       result = (p
                 | Create([path])
                 | ReadAllFromTFRecord(
                     coder=coders.BytesCoder(),
                     compression_type=CompressionTypes.AUTO))
       assert_that(result, equal_to([b'foo', b'bar']))
Example #21
    def __init__(
        self,
        topic: str,
        id_label: Optional[str],
        timestamp_attribute: Optional[str],
    ):
        self.coder = coders.BytesCoder()
        self.full_topic = topic
        self.id_label = id_label
        self.timestamp_attribute = timestamp_attribute

        self.project, self.topic_name = parse_topic(topic)
Example #22
 def test_process_gzip(self):
   with TempDir() as temp_dir:
     path = temp_dir.create_temp_file('result')
     _write_file_gzip(path, FOO_BAR_RECORD_BASE64)
     with TestPipeline() as p:
       result = (p
                 | ReadFromTFRecord(
                     path,
                     coder=coders.BytesCoder(),
                     compression_type=CompressionTypes.GZIP,
                     validate=True))
       assert_that(result, equal_to([b'foo', b'bar']))
Example #23
 def __init__(self,
              coder=coders.BytesCoder(),
              file_name_suffix='',
              num_shards=0,
              shard_name_template=None,
              compression_type=CompressionTypes.AUTO,
              **kwargs):
   super(WriteToFile, self).__init__(**kwargs)
   self._sink = _WriteFullFileSink(
     'dummy', coder, file_name_suffix,
     num_shards, shard_name_template,
     compression_type)
Example #24
 def test_process_gzip(self):
     path = os.path.join(self._new_tempdir(), 'result')
     self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
     with TestPipeline() as p:
         result = (p
                   | beam.io.Read(
                       _TFRecordSource(
                           path,
                           coder=coders.BytesCoder(),
                           compression_type=CompressionTypes.GZIP,
                           validate=True)))
         beam.assert_that(result, beam.equal_to(['foo', 'bar']))
Example #25
 def __init__(self, iterable_side_input):
   # pylint: disable=protected-access
   side_input_data = iterable_side_input._side_input_data()
   assert side_input_data.access_pattern == common_urns.ITERABLE_SIDE_INPUT
   iterable_view_fn = side_input_data.view_fn
   self._data = beam.pvalue.SideInputData(
       self.DATAFLOW_MULTIMAP_URN,
       side_input_data.window_mapping_fn,
       lambda multimap: iterable_view_fn(multimap['']),
       coders.WindowedValueCoder(
           coders.TupleCoder((coders.BytesCoder(),
                              side_input_data.coder.wrapped_value_coder)),
           side_input_data.coder.window_coder))
Example #26
    def test_write_record_multiple(self):
        path = os.path.join(self._new_tempdir(), 'result')
        record = binascii.a2b_base64(FOO_BAR_RECORD_BASE64)
        sink = _TFRecordSink(path,
                             coder=coders.BytesCoder(),
                             file_name_suffix='',
                             num_shards=0,
                             shard_name_template=None,
                             compression_type=CompressionTypes.UNCOMPRESSED)
        self._write_lines(sink, path, ['foo', 'bar'])

        with open(path, 'r') as f:
            self.assertEqual(f.read(), record)
Example #27
    def __init__(
            self,
            topic,  # type: str
            id_label,  # type: Optional[str]
            with_attributes,  # type: bool
            timestamp_attribute  # type: Optional[str]
    ):
        self.coder = coders.BytesCoder()
        self.full_topic = topic
        self.id_label = id_label
        self.with_attributes = with_attributes
        self.timestamp_attribute = timestamp_attribute

        self.project, self.topic_name = parse_topic(topic)
Example #28
  def test_process_multiple_globs(self):
    with TempDir() as temp_dir:
      globs = []
      for i in range(3):
        suffix = 'result' + str(i)
        self._write_glob(temp_dir, suffix)
        globs.append(temp_dir.get_path() + os.path.sep + '*' + suffix)

      with TestPipeline() as p:
        result = (p
                  | Create(globs)
                  | ReadAllFromTFRecord(
                      coder=coders.BytesCoder(),
                      compression_type=CompressionTypes.AUTO))
        assert_that(result, equal_to([b'foo', b'bar'] * 9))
Example #29
  def test_write_record_single(self):
    with TempDir() as temp_dir:
      path = temp_dir.create_temp_file('result')
      record = binascii.a2b_base64(FOO_RECORD_BASE64)
      sink = _TFRecordSink(
          path,
          coder=coders.BytesCoder(),
          file_name_suffix='',
          num_shards=0,
          shard_name_template=None,
          compression_type=CompressionTypes.UNCOMPRESSED)
      self._write_lines(sink, path, [b'foo'])

      with open(path, 'rb') as f:
        self.assertEqual(f.read(), record)
Example #30
 def __init__(
   self,
   file_pattern,
   coder=coders.BytesCoder(),
   compression_type=CompressionTypes.AUTO,
   validate=True,
   output_filename=True,
   output_content=True,
   **kwargs):
   super(_ReadFullFile, self).__init__(**kwargs)
   self._source = _ReadFullFileSource(
     file_pattern, coder, compression_type,
     validate,
     output_filename=output_filename,
     output_content=output_content)