def expand(self, pcoll):
  return (
      pcoll
      | WindowInto(window.GlobalWindows())
      | "ToVoidKey" >> Map(lambda v: (None, v))
      | "Group" >> GroupByKey()
      | "UnKey" >> Map(lambda kv: kv[1])
      | "Match" >> Map(matcher))
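
# A minimal usage sketch, assuming the expand above is the composite behind a
# matcher-style assertion like Beam's assert_that: every element is funneled
# under a single None key so one GroupByKey hands the whole PCollection to
# `matcher` as one list. (Names below come from apache_beam.testing.util.)
import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to

with beam.Pipeline() as p:
  pcoll = p | beam.Create([1, 2, 3])
  # equal_to builds the matcher callable; element order is irrelevant since
  # the matcher sees the fully grouped contents.
  assert_that(pcoll, equal_to([3, 2, 1]))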
def expand(self, pcoll):
  class ReifyTimestamps(DoFn):
    def process(self, element, timestamp=DoFn.TimestampParam):
      yield element[0], TimestampedValue(element[1], timestamp)

  class RestoreTimestamps(DoFn):
    def process(self, element, window=DoFn.WindowParam):
      # Pass the current window since _IdentityWindowFn wouldn't know how
      # to generate it.
      yield windowed_value.WindowedValue(
          (element[0], element[1].value), element[1].timestamp, [window])

  windowing_saved = pcoll.windowing
  # The linter is confused.
  # pylint: disable=abstract-class-instantiated
  result = (
      pcoll
      | ParDo(ReifyTimestamps())
      | 'IdentityWindow' >> WindowInto(
          _IdentityWindowFn(windowing_saved.windowfn.get_window_coder()),
          trigger=AfterCount(1),
          accumulation_mode=AccumulationMode.DISCARDING,
          timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST,
      )
      | GroupByKey()
      | 'ExpandIterable' >> FlatMap(
          lambda e: [(e[0], value) for value in e[1]])
      | ParDo(RestoreTimestamps()))
  result._windowing = windowing_saved
  return result
def expand(self, pcolls):
  # Check input PCollections for PCollection-ness, and that they all belong
  # to the same pipeline.
  for pcoll in pcolls.values():
    self._check_pcollection(pcoll)
    if self.pipeline:
      assert pcoll.pipeline == self.pipeline

  tags = list(pcolls.keys())

  def add_tag(tag):
    return lambda k, v: (k, (tag, v))

  def collect_values(key, tagged_values):
    grouped_values = {tag: [] for tag in tags}
    for tag, value in tagged_values:
      grouped_values[tag].append(value)
    return key, grouped_values

  return ([
      pcoll
      | 'Tag[%s]' % tag >> MapTuple(add_tag(tag))
      for (tag, pcoll) in pcolls.items()
  ]
          | Flatten(pipeline=self.pipeline)
          | GroupByKey()
          | MapTuple(collect_values))
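
# Hedged sketch of what the dict-based expand above produces, assuming it is
# the body of a CoGroupByKey-style composite: each keyed input is tagged,
# flattened, grouped once, and the values are re-bucketed into a dict keyed
# by the original tags.
import apache_beam as beam

with beam.Pipeline() as p:
  emails = p | 'CreateEmails' >> beam.Create([('amy', 'amy@example.com')])
  phones = p | 'CreatePhones' >> beam.Create([('amy', '555-1234')])
  # Each output element looks like:
  #   ('amy', {'emails': ['amy@example.com'], 'phones': ['555-1234']})
  grouped = {'emails': emails, 'phones': phones} | beam.CoGroupByKey()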
def expand(self, pcoll):
  windowing_saved = pcoll.windowing
  if windowing_saved.is_default():
    # In this (common) case we can use a trivial trigger driver
    # and avoid the (expensive) window param.
    globally_windowed = window.GlobalWindows.windowed_value(None)
    MIN_TIMESTAMP = window.MIN_TIMESTAMP

    def reify_timestamps(element, timestamp=DoFn.TimestampParam):
      key, value = element
      if timestamp == MIN_TIMESTAMP:
        timestamp = None
      return key, (value, timestamp)

    def restore_timestamps(element):
      key, values = element
      return [
          globally_windowed.with_value((key, value)) if timestamp is None else
          window.GlobalWindows.windowed_value((key, value), timestamp)
          for (value, timestamp) in values
      ]
  else:
    # typing: All conditional function variants must have identical signatures
    def reify_timestamps(  # type: ignore[misc]
        element, timestamp=DoFn.TimestampParam, window=DoFn.WindowParam):
      key, value = element
      # Transport the window as part of the value and restore it later.
      return key, windowed_value.WindowedValue(value, timestamp, [window])

    def restore_timestamps(element):
      key, windowed_values = element
      return [wv.with_value((key, wv.value)) for wv in windowed_values]

  ungrouped = pcoll | Map(reify_timestamps).with_output_types(Any)

  # TODO(BEAM-8104) Using global window as one of the standard window.
  # This is to mitigate the Dataflow Java Runner Harness limitation to
  # accept only standard coders.
  ungrouped._windowing = Windowing(
      window.GlobalWindows(),
      triggerfn=Always(),
      accumulation_mode=AccumulationMode.DISCARDING,
      timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
  result = (
      ungrouped
      | GroupByKey()
      | FlatMap(restore_timestamps).with_output_types(Any))
  result._windowing = windowing_saved
  return result
def expand(self, pcoll):
  windowing_saved = pcoll.windowing
  if windowing_saved.is_default():
    # In this (common) case we can use a trivial trigger driver
    # and avoid the (expensive) window param.
    globally_windowed = window.GlobalWindows.windowed_value(None)
    window_fn = window.GlobalWindows()
    MIN_TIMESTAMP = window.MIN_TIMESTAMP

    def reify_timestamps(element, timestamp=DoFn.TimestampParam):
      key, value = element
      if timestamp == MIN_TIMESTAMP:
        timestamp = None
      return key, (value, timestamp)

    def restore_timestamps(element):
      key, values = element
      return [
          globally_windowed.with_value((key, value)) if timestamp is None else
          window.GlobalWindows.windowed_value((key, value), timestamp)
          for (value, timestamp) in values
      ]
  else:
    # The linter is confused.
    # hash(1) is used to force "runtime" selection of _IdentityWindowFn
    # pylint: disable=abstract-class-instantiated
    cls = hash(1) and _IdentityWindowFn
    window_fn = cls(windowing_saved.windowfn.get_window_coder())

    def reify_timestamps(element, timestamp=DoFn.TimestampParam):
      key, value = element
      return key, TimestampedValue(value, timestamp)

    def restore_timestamps(element, window=DoFn.WindowParam):
      # Pass the current window since _IdentityWindowFn wouldn't know how
      # to generate it.
      key, values = element
      return [
          windowed_value.WindowedValue(
              (key, value.value), value.timestamp, [window])
          for value in values
      ]

  ungrouped = pcoll | Map(reify_timestamps)
  ungrouped._windowing = Windowing(
      window_fn,
      triggerfn=AfterCount(1),
      accumulation_mode=AccumulationMode.DISCARDING,
      timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
  result = (ungrouped | GroupByKey() | FlatMap(restore_timestamps))
  result._windowing = windowing_saved
  return result
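
# Usage sketch: the two expand variants above match the shape of Beam's
# Reshuffle, which redistributes elements while preserving timestamps and
# restoring the original windowing. A typical reason to apply it is to break
# fusion after a high fan-out step:
import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(['a.txt', 'b.txt'])
      | 'FanOut' >> beam.FlatMap(
          lambda name: ((name, i) for i in range(1000)))
      | beam.Reshuffle()  # rebalance before a subsequent expensive step
      | 'Expensive' >> beam.MapTuple(lambda name, i: (name, i * i)))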
def expand(self, pvalue):
  keyed_pc = (pvalue | 'AssignKey' >> Map(lambda x: (uuid.uuid4(), x)))
  if keyed_pc.windowing.windowfn.is_merging():
    raise ValueError(
        'Transform ReadAllFiles cannot be used in the presence '
        'of merging windows')
  if not isinstance(keyed_pc.windowing.triggerfn, DefaultTrigger):
    raise ValueError(
        'Transform ReadAllFiles cannot be used in the presence '
        'of non-trivial triggers')

  return (keyed_pc
          | 'GroupByKey' >> GroupByKey()
          # Using FlatMap below due to the possibility of key collisions.
          | 'DropKey' >> FlatMap(lambda kv: kv[1]))
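
# Why FlatMap for 'DropKey' (a standalone sketch of the pattern above):
# GroupByKey emits (key, iterable) pairs, and on the off chance two uuid4
# keys collide the iterable holds several values; FlatMap re-emits each one,
# where Map would emit the whole iterable as a single element.
import uuid
import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(['f1', 'f2'])
      | 'AssignKey' >> beam.Map(lambda x: (uuid.uuid4(), x))
      | 'GroupByKey' >> beam.GroupByKey()
      | 'DropKey' >> beam.FlatMap(lambda kv: kv[1]))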
def expand(self, pcolls): """Performs CoGroupByKey on argument pcolls; see class docstring.""" # For associating values in K-V pairs with the PCollections they came from. def _pair_tag_with_value(key_value, tag): (key, value) = key_value return (key, (tag, value)) # Creates the key, value pairs for the output PCollection. Values are either # lists or dicts (per the class docstring), initialized by the result of # result_ctor(result_ctor_arg). def _merge_tagged_vals_under_key(key_grouped, result_ctor, result_ctor_arg): (key, grouped) = key_grouped result_value = result_ctor(result_ctor_arg) for tag, value in grouped: result_value[tag].append(value) return (key, result_value) try: # If pcolls is a dict, we turn it into (tag, pcoll) pairs for use in the # general-purpose code below. The result value constructor creates dicts # whose keys are the tags. result_ctor_arg = list(pcolls) result_ctor = lambda tags: dict((tag, []) for tag in tags) pcolls = pcolls.items() except AttributeError: # Otherwise, pcolls is a list/tuple, so we turn it into (index, pcoll) # pairs. The result value constructor makes tuples with len(pcolls) slots. pcolls = list(enumerate(pcolls)) result_ctor_arg = len(pcolls) result_ctor = lambda size: tuple([] for _ in range(size)) # Check input PCollections for PCollection-ness, and that they all belong # to the same pipeline. for _, pcoll in pcolls: self._check_pcollection(pcoll) if self.pipeline: assert pcoll.pipeline == self.pipeline return ([ pcoll | 'pair_with_%s' % tag >> Map(_pair_tag_with_value, tag) for tag, pcoll in pcolls ] | Flatten(pipeline=self.pipeline) | GroupByKey() | Map(_merge_tagged_vals_under_key, result_ctor, result_ctor_arg))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  self._log_startup(input_dict, output_dict, exec_properties)
  absl.logging.info('Hello Component - Executor - Do Start')

  assert len(input_dict['input_data']) == 1
  for artifact in input_dict['input_data']:
    input_dir = artifact.uri
    output_dir = artifact_utils.get_single_uri(output_dict['output_data'])

    input_uri = io_utils.all_files_pattern(input_dir)
    output_uri = os.path.join(output_dir, 'result.csv')
    with self._make_beam_pipeline() as p:
      intrim = (
          p
          | 'ReadData' >> beam.io.ReadFromTFRecord(
              file_pattern=input_uri,
              coder=beam.coders.ProtoCoder(prediction_log_pb2.PredictionLog))
          | 'Process' >> beam.Map(process_item)
          # Same dummy key for every element so a single GroupByKey gathers
          # the whole PCollection for the in-memory sort below.
          | 'SameKey' >> beam.Map(lambda it: (0, it))
          | 'SameWindow' >> beam.WindowInto(beam.window.GlobalWindows())
          | 'GroupAll' >> GroupByKey()
          | 'RemoveDummyKey' >> beam.Map(lambda item: item[1])
          | 'SortAll' >> beam.Map(sort_data)
          | 'InMemorySink' >> beam.Map(
              lambda item: write_data(item, output_uri)))
      # Alternative file-based sink:
      # intrim | 'Sink' >> beam.io.WriteToText(
      #     file_path_prefix=output_uri,
      #     file_name_suffix='.csv',
      #     num_shards=1,
      #     # CompressionTypes.UNCOMPRESSED,
      #     header='ID_code,target')
  absl.logging.info('Hello Component - Executor - Do End')
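
# A self-contained sketch of the "gather and sort" pattern used above
# (process_item, sort_data and write_data are project-specific helpers not
# shown here): keying everything to one dummy key and grouping in the global
# window funnels the whole PCollection through a single worker, so the sort
# only works when the data fits in memory.
import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([3, 1, 2])
      | 'SameKey' >> beam.Map(lambda x: (0, x))
      | 'SameWindow' >> beam.WindowInto(beam.window.GlobalWindows())
      | 'GroupAll' >> beam.GroupByKey()
      | 'SortAll' >> beam.Map(lambda kv: sorted(kv[1]))
      | beam.Map(print))  # prints [1, 2, 3]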
    result_ctor_arg = len(pcolls)
    result_ctor = lambda size: tuple([] for _ in range(size))

  # Check input PCollections for PCollection-ness, and that they all belong
  # to the same pipeline.
  for _, pcoll in pcolls:
    self._check_pcollection(pcoll)
    if self.pipeline:
      assert pcoll.pipeline == self.pipeline

  return ([
      pcoll
      | 'pair_with_%s' % tag >> Map(_pair_tag_with_value, tag)
      for tag, pcoll in pcolls
  ]
          | Flatten(pipeline=self.pipeline)
          | GroupByKey()
          | Map(_merge_tagged_vals_under_key, result_ctor, result_ctor_arg))


def Keys(label='Keys'):  # pylint: disable=invalid-name
  """Produces a PCollection of first elements of 2-tuples in a PCollection."""
  return label >> Map(lambda kv: kv[0])


def Values(label='Values'):  # pylint: disable=invalid-name
  """Produces a PCollection of second elements of 2-tuples in a PCollection."""
  return label >> Map(lambda kv: kv[1])


def KvSwap(label='KvSwap'):  # pylint: disable=invalid-name