def _get_coder(self, typehint, window_coder):
    """Look up the registered coder for a typehint.

    Args:
      typehint: Type hint object passed to ``coders.registry.get_coder``.
      window_coder: Coder for the windows. When truthy, the element coder is
        wrapped in a ``coders.WindowedValueCoder`` together with a
        ``coders.TimestampCoder``; when falsy, no wrapping takes place.

    Returns:
      The coder registered for ``typehint``, wrapped in a
      ``WindowedValueCoder`` if a window coder was supplied.
    """
    element_coder = coders.registry.get_coder(typehint)
    if not window_coder:
        return element_coder
    return coders.WindowedValueCoder(
        element_coder, coders.TimestampCoder(), window_coder)
def get_coder_from_spec(coder_spec, kv_pair=False):
    """Build a coder instance from a coder spec.

    Args:
      coder_spec: A dict whose '@type' value is a serialized (pickled) Coder
        instance. Nested specs, when present, are listed under the
        'component_encodings' key.
      kv_pair: True if a 2-tuple of coders (key and value) must be returned.

    Returns:
      A coder instance (has encode/decode methods), or, when kv_pair is True,
      a 2-tuple of (key coder, value coder). The tuple form is meant for
      shuffle sources and sinks, which can take a 2-tuple of coders as
      parameter.

    Raises:
      ValueError: if a KV coder is requested but the coder spec is not that
        of a KV coder.
    """
    # Encodings that merely wrap a single component; they are skipped.
    wrapper_types = (
        'kind:stream',
        'com.google.cloud.dataflow.sdk.util.TimerOrElement$TimerOrElementCoder'
    )

    # Peel off wrapper encodings until an actual coder spec is reached.
    while True:
        assert coder_spec is not None
        if coder_spec['@type'] not in wrapper_types:
            break
        wrapped_specs = coder_spec['component_encodings']
        assert len(wrapped_specs) == 1
        coder_spec = wrapped_specs[0]

    # We pass coders in the form "<coder_name>$<pickled_data>" to make the job
    # description JSON more readable.
    coder = coders.deserialize_coder(coder_spec['@type'])

    # If this is a coder with components potentially modified by the service,
    # rebuild it from the components found in the spec.
    #
    # TODO(ccy): This is necessary since the service may move around the
    # wrapped types of WindowedValueCoders and TupleCoders. We should refactor
    # coder serialization so these special cases are not necessary.
    if isinstance(coder, coders.WindowedValueCoder):
        rebuilt = [
            get_coder_from_spec(sub_spec)
            for sub_spec in coder_spec['component_encodings']
        ]
        # Exactly three components are expected: value, timestamp, window.
        element_coder, ts_coder, win_coder = rebuilt
        coder = coders.WindowedValueCoder(element_coder, ts_coder, win_coder)
    elif isinstance(coder, coders.TupleCoder):
        rebuilt = [
            get_coder_from_spec(sub_spec)
            for sub_spec in coder_spec['component_encodings']
        ]
        coder = coders.TupleCoder(rebuilt)

    if not kv_pair:
        return coder
    if not coder.is_kv_coder():
        raise ValueError('Coder is not a KV coder: %s.' % coder)
    return coder.key_coder(), coder.value_coder()
def run_Create(self, transform_node):
    """Add a create-PCollection step for the given transform node.

    The transform's values are pickled, base64-encoded and attached to the
    new step as its ELEMENT property. The step's encoding and its single
    output description are then filled in.

    Args:
      transform_node: The node being translated. Its ``transform.value`` is
        iterated to obtain the elements, and its ``full_label`` names the
        step and its output.
    """
    transform = transform_node.transform
    step = self._add_step(
        TransformNames.CREATE_PCOLLECTION,
        transform_node.full_label,
        transform_node)

    # TODO(silviuc): Eventually use a coder based on typecoders.
    element_coder = coders.PickleCoder()

    # Note that we base64-encode values here so that the service will accept
    # the values.
    encoded_elements = []
    for value in transform.value:
        encoded_elements.append(
            base64.b64encode(element_coder.encode(value)))
    step.add_property(PropertyNames.ELEMENT, encoded_elements)

    # The service expects a WindowedValueCoder here, so we wrap the actual
    # encoding in a WindowedValueCoder.
    windowed_coder = coders.WindowedValueCoder(element_coder)
    step.encoding = self._get_cloud_encoding(windowed_coder)

    output_user_name = '%s.%s' % (
        transform_node.full_label, PropertyNames.OUT)
    output_info = {
        PropertyNames.USER_NAME: output_user_name,
        PropertyNames.ENCODING: step.encoding,
        PropertyNames.OUTPUT_NAME: PropertyNames.OUT,
    }
    step.add_property(PropertyNames.OUTPUT_INFO, [output_info])
}, 'encoding': { 'component_encodings': [{ '@type': 'notused' }, { '@type': 'notused' }], '@type': coders.serialize_coder(coders.PickleCoder()) } }], '@type': 'ConcatSource' } CODER = coders.PickleCoder() WINDOWED_CODER = coders.WindowedValueCoder(CODER) CODER_SPEC = CODER.as_cloud_object() WINDOWED_CODER_SPEC = WINDOWED_CODER.as_cloud_object() def add_source_codec_spec(target): target.source.codec = dataflow.Source.CodecValue() for k, v in CODER_SPEC.iteritems(): target.source.codec.additionalProperties.append( dataflow.Source.CodecValue.AdditionalProperty( key=k, value=to_json_value(v))) def add_source_windowed_codec_spec(target): target.source.codec = dataflow.Source.CodecValue()