def test_serialization(self):
  context = pipeline_context.PipelineContext()
  float_coder_ref = context.coders.get_id(coders.FloatCoder())
  bytes_coder_ref = context.coders.get_id(coders.BytesCoder())
  proto = context.to_runner_api()
  context2 = pipeline_context.PipelineContext.from_runner_api(proto)
  self.assertEqual(
      coders.FloatCoder(), context2.coders.get_by_id(float_coder_ref))
  self.assertEqual(
      coders.BytesCoder(), context2.coders.get_by_id(bytes_coder_ref))
def test_common_id_assignment(self):
  context = pipeline_context.PipelineContext()
  float_coder_ref = context.coders.get_id(coders.FloatCoder())
  bytes_coder_ref = context.coders.get_id(coders.BytesCoder())
  context2 = pipeline_context.PipelineContext(
      component_id_map=context.component_id_map)
  bytes_coder_ref2 = context2.coders.get_id(coders.BytesCoder())
  float_coder_ref2 = context2.coders.get_id(coders.FloatCoder())
  self.assertEqual(bytes_coder_ref, bytes_coder_ref2)
  self.assertEqual(float_coder_ref, float_coder_ref2)
def test_str_utf8_coder(self):
  real_coder = coders.registry.get_coder(str)
  expected_coder = coders.BytesCoder()
  self.assertEqual(real_coder.encode('abc'), expected_coder.encode('abc'))
  self.assertEqual('abc', real_coder.decode(real_coder.encode('abc')))

  real_coder = coders.registry.get_coder(bytes)
  expected_coder = coders.BytesCoder()
  self.assertEqual(real_coder.encode('abc'), expected_coder.encode('abc'))
  self.assertEqual('abc', real_coder.decode(real_coder.encode('abc')))
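# Hedged sketch, not from the original test: the assertions above rely on
# Python 2 treating str and bytes interchangeably. On Python 3, Beam's
# registry assigns separate default coders to str and bytes, so the text and
# binary round trips are checked independently. Assumes bytes still resolves
# to BytesCoder; the coder chosen for str is left unspecified here.
from apache_beam import coders

str_coder = coders.registry.get_coder(str)
assert 'abc' == str_coder.decode(str_coder.encode('abc'))

bytes_coder = coders.registry.get_coder(bytes)
assert b'abc' == bytes_coder.decode(bytes_coder.encode(b'abc'))
assert coders.BytesCoder().encode(b'abc') == bytes_coder.encode(b'abc')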
def __init__(self, topic=None, subscription=None, id_label=None):
  # We are using this coder explicitly for portability reasons of PubsubIO
  # across implementations in languages.
  self.coder = coders.BytesCoder()
  self.topic = topic
  self.subscription = subscription
  self.id_label = id_label
def __init__(self, topic):
  # We are using this coder explicitly for portability reasons of PubsubIO
  # across implementations in languages.
  self.coder = coders.BytesCoder()
  self.full_topic = topic
  self.project, self.topic_name = parse_topic(topic)
def __init__(self,
             coder=coders.BytesCoder(),
             compression_type=CompressionTypes.AUTO,
             with_filename=False):
  """Initialize the ``ReadAllFromTFRecord`` transform.

  Args:
    coder: Coder used to decode each record.
    compression_type: Used to handle compressed input files. Default value
      is CompressionTypes.AUTO, in which case the file_path's extension will
      be used to detect the compression.
    with_filename: If True, returns a Key Value with the key being the file
      name and the value being the actual data. If False, it only returns
      the data.
  """
  super(ReadAllFromTFRecord, self).__init__()
  source_from_file = partial(
      _create_tfrecordio_source,
      compression_type=compression_type,
      coder=coder)
  # Desired and min bundle sizes do not matter since TFRecord files are
  # unsplittable.
  self._read_all_files = ReadAllFiles(
      splittable=False,
      compression_type=compression_type,
      desired_bundle_size=0,
      min_bundle_size=0,
      source_from_file=source_from_file,
      with_filename=with_filename)
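# Hedged usage sketch, not from the original source: reading a PCollection of
# TFRecord file patterns with ReadAllFromTFRecord. The '/tmp/records*' glob
# and the pipeline wiring are illustrative assumptions.
import apache_beam as beam
from apache_beam.io.tfrecordio import ReadAllFromTFRecord

with beam.Pipeline() as p:
  keyed_records = (
      p
      | beam.Create(['/tmp/records*'])
      | ReadAllFromTFRecord(with_filename=True))
  # With with_filename=True each element is a (file name, record bytes) pair;
  # the default BytesCoder leaves the record payload untouched.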
def get_coder_from_spec(coder_spec):
  """Return a coder instance from a coder spec.

  Args:
    coder_spec: A dict where the value of the '@type' key is a pickled
      instance of a Coder instance.

  Returns:
    A coder instance (has encode/decode methods).
  """
  assert coder_spec is not None

  # Ignore the wrappers in these encodings.
  ignored_wrappers = (
      'com.google.cloud.dataflow.sdk.util.TimerOrElement$TimerOrElementCoder')
  if coder_spec['@type'] in ignored_wrappers:
    assert len(coder_spec['component_encodings']) == 1
    coder_spec = coder_spec['component_encodings'][0]
    return get_coder_from_spec(coder_spec)

  # Handle a few well known types of coders.
  if coder_spec['@type'] == 'kind:pair':
    assert len(coder_spec['component_encodings']) == 2
    component_coders = [
        get_coder_from_spec(c) for c in coder_spec['component_encodings']]
    return coders.TupleCoder(component_coders)
  elif coder_spec['@type'] == 'kind:stream':
    assert len(coder_spec['component_encodings']) == 1
    return coders.IterableCoder(
        get_coder_from_spec(coder_spec['component_encodings'][0]))
  elif coder_spec['@type'] == 'kind:windowed_value':
    assert len(coder_spec['component_encodings']) == 2
    value_coder, window_coder = [
        get_coder_from_spec(c) for c in coder_spec['component_encodings']]
    return coders.coders.WindowedValueCoder(
        value_coder, window_coder=window_coder)
  elif coder_spec['@type'] == 'kind:interval_window':
    assert ('component_encodings' not in coder_spec
            or not coder_spec['component_encodings'])
    return coders.coders.IntervalWindowCoder()
  elif coder_spec['@type'] == 'kind:global_window':
    assert ('component_encodings' not in coder_spec
            or not coder_spec['component_encodings'])
    return coders.coders.GlobalWindowCoder()
  elif coder_spec['@type'] == 'kind:varint':
    assert ('component_encodings' not in coder_spec
            or len(coder_spec['component_encodings']) == 0)
    return coders.coders.VarIntCoder()
  elif coder_spec['@type'] == 'kind:length_prefix':
    assert len(coder_spec['component_encodings']) == 1
    return coders.coders.LengthPrefixCoder(
        get_coder_from_spec(coder_spec['component_encodings'][0]))
  elif coder_spec['@type'] == 'kind:bytes':
    assert ('component_encodings' not in coder_spec
            or len(coder_spec['component_encodings']) == 0)
    return coders.BytesCoder()

  # We pass coders in the form "<coder_name>$<pickled_data>" to make the job
  # description JSON more readable.
  return coders.coders.deserialize_coder(coder_spec['@type'])
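# Hedged usage sketch, not from the original source: decoding a nested coder
# spec with get_coder_from_spec as defined above. The spec dict below is an
# illustrative assumption that follows the 'kind:*' shapes handled there.
pair_spec = {
    '@type': 'kind:pair',
    'component_encodings': [
        {'@type': 'kind:bytes'},
        {'@type': 'kind:varint'},
    ],
}
coder = get_coder_from_spec(pair_spec)
# Expected to be a TupleCoder wrapping a BytesCoder and a VarIntCoder, so it
# can round-trip (bytes, int) key-value pairs.
encoded = coder.encode((b'key', 42))
assert coder.decode(encoded) == (b'key', 42)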
def __init__(self,
             file_path_prefix,
             coder=coders.BytesCoder(),
             file_name_suffix='',
             num_shards=0,
             shard_name_template=None,
             compression_type=CompressionTypes.AUTO):
  """Initialize WriteToTFRecord transform.

  Args:
    file_path_prefix: The file path to write to. The files written will
      begin with this prefix, followed by a shard identifier (see
      num_shards), and end in a common extension, if given by
      file_name_suffix.
    coder: Coder used to encode each record.
    file_name_suffix: Suffix for the files written.
    num_shards: The number of files (shards) used for output. If not set,
      the default value will be used.
    shard_name_template: A template string containing placeholders for the
      shard number and shard count. When constructing a filename for a
      particular shard number, the upper-case letters 'S' and 'N' are
      replaced with the 0-padded shard number and shard count respectively.
      This argument can be '' in which case it behaves as if num_shards was
      set to 1 and only one file will be generated. The default pattern used
      is '-SSSSS-of-NNNNN' if None is passed as the shard_name_template.
    compression_type: Used to handle compressed output files. Typical value
      is CompressionTypes.AUTO, in which case the file_path's extension will
      be used to detect the compression.

  Returns:
    A WriteToTFRecord transform object.
  """
  super(WriteToTFRecord, self).__init__()
  self._sink = _TFRecordSink(
      file_path_prefix,
      coder,
      file_name_suffix,
      num_shards,
      shard_name_template,
      compression_type)
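# Hedged usage sketch, not from the original source: writing raw byte records
# to sharded TFRecord files with WriteToTFRecord. The '/tmp/out/records'
# prefix and the example payloads are illustrative assumptions.
import apache_beam as beam
from apache_beam.io.tfrecordio import WriteToTFRecord

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([b'foo', b'bar'])
      | WriteToTFRecord('/tmp/out/records', file_name_suffix='.tfrecord'))
  # With the default BytesCoder each element is written verbatim as one
  # TFRecord; output shards look like records-00000-of-00001.tfrecord.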
def test_deduplication_by_proto(self):
  context = pipeline_context.PipelineContext()
  bytes_coder_proto = coders.BytesCoder().to_runner_api(None)
  bytes_coder_ref = context.coders.get_by_proto(bytes_coder_proto)
  bytes_coder_ref2 = context.coders.get_by_proto(
      bytes_coder_proto, deduplicate=True)
  self.assertEqual(bytes_coder_ref, bytes_coder_ref2)
def __init__(self, topic=None, subscription=None, id_label=None,
             with_attributes=False, timestamp_attribute=None):
  # We are using this coder explicitly for portability reasons of PubsubIO
  # across implementations in languages.
  self.coder = coders.BytesCoder()
  self.full_topic = topic
  self.full_subscription = subscription
  self.topic_name = None
  self.subscription_name = None
  self.id_label = id_label
  self.with_attributes = with_attributes
  self.timestamp_attribute = timestamp_attribute

  # Perform some validation on the topic and subscription.
  if not (topic or subscription):
    raise ValueError('Either a topic or subscription must be provided.')
  if topic and subscription:
    raise ValueError('Only one of topic or subscription should be provided.')

  if topic:
    self.project, self.topic_name = parse_topic(topic)
  if subscription:
    self.project, self.subscription_name = parse_subscription(subscription)
def __init__(self,
             file_pattern,
             coder=coders.BytesCoder(),
             compression_type=CompressionTypes.AUTO,
             validate=True,
             **kwargs):
  """Initialize a ReadFromTFRecord transform.

  Args:
    file_pattern: A file glob pattern to read TFRecords from.
    coder: Coder used to decode each record.
    compression_type: Used to handle compressed input files. Default value
      is CompressionTypes.AUTO, in which case the file_path's extension will
      be used to detect the compression.
    validate: Boolean flag to verify that the files exist during the
      pipeline creation time.
    **kwargs: optional args dictionary. These are passed through to parent
      constructor.

  Returns:
    A ReadFromTFRecord transform object.
  """
  super(ReadFromTFRecord, self).__init__(**kwargs)
  self._source = _TFRecordSource(
      file_pattern, coder, compression_type, validate)
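# Hedged usage sketch, not from the original source: reading back TFRecord
# shards (such as those written in the earlier sketch) and counting records.
# The '/tmp/out/records*' glob is an illustrative assumption and must match
# existing files because validate defaults to True.
import apache_beam as beam
from apache_beam.io.tfrecordio import ReadFromTFRecord

with beam.Pipeline() as p:
  count = (
      p
      | ReadFromTFRecord('/tmp/out/records*')
      | beam.combiners.Count.Globally())
  # Elements arrive as raw bytes because the default coder is BytesCoder.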
def __init__(
    self,
    topic=None,  # type: Optional[str]
    subscription=None,  # type: Optional[str]
    id_label=None,  # type: Optional[str]
    with_attributes=False,  # type: bool
    timestamp_attribute=None  # type: Optional[str]
):
  self.coder = coders.BytesCoder()
  self.full_topic = topic
  self.full_subscription = subscription
  self.topic_name = None
  self.subscription_name = None
  self.id_label = id_label
  self.with_attributes = with_attributes
  self.timestamp_attribute = timestamp_attribute

  # Perform some validation on the topic and subscription.
  if not (topic or subscription):
    raise ValueError('Either a topic or subscription must be provided.')
  if topic and subscription:
    raise ValueError('Only one of topic or subscription should be provided.')

  if topic:
    self.project, self.topic_name = parse_topic(topic)
  if subscription:
    self.project, self.subscription_name = parse_subscription(subscription)
def __init__(self, components, known_runner_urns, use_state_iterables=False):
  self.components = components
  self.known_runner_urns = known_runner_urns
  self.use_state_iterables = use_state_iterables
  self.safe_coders = {}
  self.bytes_coder_id = self.add_or_get_coder_id(
      coders.BytesCoder().to_runner_api(None), 'bytes_coder')
def __init__(self,
             coder=coders.BytesCoder(),
             compression_type=CompressionTypes.AUTO,
             **kwargs):
  """Initialize the ``ReadAllFromTFRecord`` transform.

  Args:
    coder: Coder used to decode each record.
    compression_type: Used to handle compressed input files. Default value
      is CompressionTypes.AUTO, in which case the file_path's extension will
      be used to detect the compression.
    **kwargs: optional args dictionary. These are passed through to parent
      constructor.
  """
  super(ReadAllFromTFRecord, self).__init__(**kwargs)
  source_from_file = partial(
      _create_tfrecordio_source,
      compression_type=compression_type,
      coder=coder)
  # Desired and min bundle sizes do not matter since TFRecord files are
  # unsplittable.
  self._read_all_files = ReadAllFiles(
      splittable=False,
      compression_type=compression_type,
      desired_bundle_size=0,
      min_bundle_size=0,
      source_from_file=source_from_file)
def __init__(self, topic, id_label, with_attributes, timestamp_attribute):
  self.coder = coders.BytesCoder()
  self.full_topic = topic
  self.id_label = id_label
  self.with_attributes = with_attributes
  self.timestamp_attribute = timestamp_attribute
  self.project, self.topic_name = parse_topic(topic)
def run(argv=None):
  """Main entry point; define and run the pipeline."""
  parser = argparse.ArgumentParser(
      description='Run Apache Beam to process the logs')
  parser.add_argument('--input', dest='input', help='Input file to process')
  parser.add_argument('--output', dest='output',
                      help='Output file to write results to')
  parser.add_argument(
      '--input_subscription', dest='input_subscription',
      help=('Input PubSub subscription of the form '
            '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".'))
  parser.add_argument(
      '--output_table', dest='output_table',
      help=('BigQuery table to write results to, of the form '
            '<PROJECT>:<DATASET>.<TABLE>'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(StandardOptions).streaming = True
  print('pipeline options:', pipeline_options)

  # Specification for the destination table in BigQuery.
  table_spec = known_args.output_table
  table_schema = (
      'host:STRING, utc_timestamp:TIMESTAMP, action:STRING, uri:STRING, '
      'protocol:STRING, status:STRING, size:INTEGER')

  with beam.Pipeline(options=pipeline_options) as p:
    # Read the text file[pattern] or PubSub subscription into a PCollection.
    if known_args.input_subscription:
      lines = (p
               | ReadFromPubSub(subscription=known_args.input_subscription)
               .with_output_types(bytes))
    else:
      lines = (p | ReadFromText(known_args.input, coder=coders.BytesCoder()))

    output = (lines | 'parse_filter' >> beam.ParDo(ParseAndFilterDoFn()))
    #           | 'parse' >> (beam.Map(parse_one_record)))
    # output | WriteToText(known_args.output)

    output | WriteToBigQuery(
        table_spec,
        schema=table_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
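# Hedged sketch, hypothetical and not from the original source: one way the
# ParseAndFilterDoFn referenced above could turn a raw access-log line into a
# row dict matching the BigQuery schema. The regex, field names, and error
# handling are illustrative assumptions.
import re

import apache_beam as beam

class ParseAndFilterDoFn(beam.DoFn):
  """Parses a Common Log Format line into a row dict; drops unparsable lines."""

  LOG_PATTERN = re.compile(
      r'(?P<host>\S+) \S+ \S+ \[(?P<utc_timestamp>[^\]]+)\] '
      r'"(?P<action>\S+) (?P<uri>\S+) (?P<protocol>\S+)" '
      r'(?P<status>\d{3}) (?P<size>\d+|-)')

  def process(self, element):
    # Elements arrive as bytes (BytesCoder / PubSub payloads), so decode first.
    line = element.decode('utf-8', errors='replace')
    match = self.LOG_PATTERN.match(line)
    if not match:
      return  # Filter out lines that do not match the expected format.
    row = match.groupdict()
    row['size'] = 0 if row['size'] == '-' else int(row['size'])
    # Note: utc_timestamp may need conversion to a BigQuery-compatible
    # TIMESTAMP representation before writing.
    yield row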
def test_process_single(self):
  path = os.path.join(self._new_tempdir(), 'result')
  self._write_file(path, FOO_RECORD_BASE64)
  with TestPipeline() as p:
    result = (p
              | beam.Read(
                  _TFRecordSource(
                      path,
                      coder=coders.BytesCoder(),
                      compression_type=fileio.CompressionTypes.AUTO)))
    beam.assert_that(result, beam.equal_to(['foo']))
def test_process_auto(self):
  path = os.path.join(self._new_tempdir(), 'result.gz')
  self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
  with beam.Pipeline(DirectRunner()) as p:
    result = (p
              | beam.Read(
                  _TFRecordSource(
                      path,
                      coder=coders.BytesCoder(),
                      compression_type=fileio.CompressionTypes.AUTO)))
    beam.assert_that(result, beam.equal_to(['foo', 'bar']))
def test_process_glob(self):
  with TempDir() as temp_dir:
    self._write_glob(temp_dir, 'result')
    glob = temp_dir.get_path() + os.path.sep + '*result'
    with TestPipeline() as p:
      result = (p
                | Create([glob])
                | ReadAllFromTFRecord(
                    coder=coders.BytesCoder(),
                    compression_type=CompressionTypes.AUTO))
      assert_that(result, equal_to([b'foo', b'bar'] * 3))
def test_process_auto(self):
  with TempDir() as temp_dir:
    path = temp_dir.create_temp_file('result.gz')
    _write_file_gzip(path, FOO_BAR_RECORD_BASE64)
    with TestPipeline() as p:
      result = (p
                | Create([path])
                | ReadAllFromTFRecord(
                    coder=coders.BytesCoder(),
                    compression_type=CompressionTypes.AUTO))
      assert_that(result, equal_to([b'foo', b'bar']))
def __init__(
    self,
    topic: str,
    id_label: Optional[str],
    timestamp_attribute: Optional[str],
):
  self.coder = coders.BytesCoder()
  self.full_topic = topic
  self.id_label = id_label
  self.timestamp_attribute = timestamp_attribute
  self.project, self.topic_name = parse_topic(topic)
def test_process_gzip(self):
  with TempDir() as temp_dir:
    path = temp_dir.create_temp_file('result')
    _write_file_gzip(path, FOO_BAR_RECORD_BASE64)
    with TestPipeline() as p:
      result = (p
                | ReadFromTFRecord(
                    path,
                    coder=coders.BytesCoder(),
                    compression_type=CompressionTypes.GZIP,
                    validate=True))
      assert_that(result, equal_to([b'foo', b'bar']))
def __init__(self,
             coder=coders.BytesCoder(),
             file_name_suffix='',
             num_shards=0,
             shard_name_template=None,
             compression_type=CompressionTypes.AUTO,
             **kwargs):
  super(WriteToFile, self).__init__(**kwargs)
  self._sink = _WriteFullFileSink(
      'dummy', coder, file_name_suffix, num_shards, shard_name_template,
      compression_type)
def test_process_gzip(self):
  path = os.path.join(self._new_tempdir(), 'result')
  self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
  with TestPipeline() as p:
    result = (p
              | beam.io.Read(
                  _TFRecordSource(
                      path,
                      coder=coders.BytesCoder(),
                      compression_type=CompressionTypes.GZIP,
                      validate=True)))
    beam.assert_that(result, beam.equal_to(['foo', 'bar']))
def __init__(self, iterable_side_input):
  # pylint: disable=protected-access
  side_input_data = iterable_side_input._side_input_data()
  assert side_input_data.access_pattern == common_urns.ITERABLE_SIDE_INPUT
  iterable_view_fn = side_input_data.view_fn
  self._data = beam.pvalue.SideInputData(
      self.DATAFLOW_MULTIMAP_URN,
      side_input_data.window_mapping_fn,
      lambda multimap: iterable_view_fn(multimap['']),
      coders.WindowedValueCoder(
          coders.TupleCoder(
              (coders.BytesCoder(),
               side_input_data.coder.wrapped_value_coder)),
          side_input_data.coder.window_coder))
def test_write_record_multiple(self):
  path = os.path.join(self._new_tempdir(), 'result')
  record = binascii.a2b_base64(FOO_BAR_RECORD_BASE64)
  sink = _TFRecordSink(
      path,
      coder=coders.BytesCoder(),
      file_name_suffix='',
      num_shards=0,
      shard_name_template=None,
      compression_type=CompressionTypes.UNCOMPRESSED)
  self._write_lines(sink, path, ['foo', 'bar'])

  with open(path, 'r') as f:
    self.assertEqual(f.read(), record)
def __init__(
    self,
    topic,  # type: str
    id_label,  # type: Optional[str]
    with_attributes,  # type: bool
    timestamp_attribute  # type: Optional[str]
):
  self.coder = coders.BytesCoder()
  self.full_topic = topic
  self.id_label = id_label
  self.with_attributes = with_attributes
  self.timestamp_attribute = timestamp_attribute
  self.project, self.topic_name = parse_topic(topic)
def test_process_multiple_globs(self):
  with TempDir() as temp_dir:
    globs = []
    for i in range(3):
      suffix = 'result' + str(i)
      self._write_glob(temp_dir, suffix)
      globs.append(temp_dir.get_path() + os.path.sep + '*' + suffix)

    with TestPipeline() as p:
      result = (p
                | Create(globs)
                | ReadAllFromTFRecord(
                    coder=coders.BytesCoder(),
                    compression_type=CompressionTypes.AUTO))
      assert_that(result, equal_to([b'foo', b'bar'] * 9))
def test_write_record_single(self):
  with TempDir() as temp_dir:
    path = temp_dir.create_temp_file('result')
    record = binascii.a2b_base64(FOO_RECORD_BASE64)
    sink = _TFRecordSink(
        path,
        coder=coders.BytesCoder(),
        file_name_suffix='',
        num_shards=0,
        shard_name_template=None,
        compression_type=CompressionTypes.UNCOMPRESSED)
    self._write_lines(sink, path, [b'foo'])

    with open(path, 'rb') as f:
      self.assertEqual(f.read(), record)
def __init__(
    self,
    file_pattern,
    coder=coders.BytesCoder(),
    compression_type=CompressionTypes.AUTO,
    validate=True,
    output_filename=True,
    output_content=True,
    **kwargs):
  super(_ReadFullFile, self).__init__(**kwargs)
  self._source = _ReadFullFileSource(
      file_pattern,
      coder,
      compression_type,
      validate,
      output_filename=output_filename,
      output_content=output_content)