Example #1
 def expand(self, pcoll):
     return (pcoll
             | WindowInto(window.GlobalWindows())
             | "ToVoidKey" >> Map(lambda v: (None, v))
             | "Group" >> GroupByKey()
             | "UnKey" >> Map(lambda (k, v): v)
             | "Match" >> Map(matcher))
Example #2
    def expand(self, pcoll):
        class ReifyTimestamps(DoFn):
            def process(self, element, timestamp=DoFn.TimestampParam):
                yield element[0], TimestampedValue(element[1], timestamp)

        class RestoreTimestamps(DoFn):
            def process(self, element, window=DoFn.WindowParam):
                # Pass the current window since _IdentityWindowFn wouldn't know how
                # to generate it.
                yield windowed_value.WindowedValue(
                    (element[0], element[1].value), element[1].timestamp,
                    [window])

        windowing_saved = pcoll.windowing
        # The linter is confused.
        # pylint: disable=abstract-class-instantiated
        result = (
            pcoll
            | ParDo(ReifyTimestamps())
            | 'IdentityWindow' >> WindowInto(
                _IdentityWindowFn(windowing_saved.windowfn.get_window_coder()),
                trigger=AfterCount(1),
                accumulation_mode=AccumulationMode.DISCARDING,
                timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST,
            )
            | GroupByKey()
            | 'ExpandIterable' >> FlatMap(lambda e: [(e[0], value)
                                                     for value in e[1]])
            | ParDo(RestoreTimestamps()))
        result._windowing = windowing_saved
        return result
Example #3
    def expand(self, pcolls):
        # Check input PCollections for PCollection-ness, and that they all belong
        # to the same pipeline.
        for pcoll in pcolls.values():
            self._check_pcollection(pcoll)
            if self.pipeline:
                assert pcoll.pipeline == self.pipeline

        tags = list(pcolls.keys())

        def add_tag(tag):
            return lambda k, v: (k, (tag, v))

        def collect_values(key, tagged_values):
            grouped_values = {tag: [] for tag in tags}
            for tag, value in tagged_values:
                grouped_values[tag].append(value)
            return key, grouped_values

        return ([
            pcoll
            | 'Tag[%s]' % tag >> MapTuple(add_tag(tag))
            for (tag, pcoll) in pcolls.items()
        ]
                | Flatten(pipeline=self.pipeline)
                | GroupByKey()
                | MapTuple(collect_values))
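A short usage sketch, assuming the transform above behaves like Beam's built-in CoGroupByKey applied to a dict of keyed PCollections: each input is tagged, flattened, grouped by key, and the values are collected per tag. The data below is illustrative only.

import apache_beam as beam

with beam.Pipeline() as p:
    emails = p | 'CreateEmails' >> beam.Create([('amy', 'amy@example.com')])
    phones = p | 'CreatePhones' >> beam.Create([('amy', '555-1234')])
    _ = ({'emails': emails, 'phones': phones}
         | beam.CoGroupByKey()
         | beam.Map(print))
    # Prints: ('amy', {'emails': ['amy@example.com'], 'phones': ['555-1234']})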
Example #4
    def expand(self, pcoll):
        windowing_saved = pcoll.windowing
        if windowing_saved.is_default():
            # In this (common) case we can use a trivial trigger driver
            # and avoid the (expensive) window param.
            globally_windowed = window.GlobalWindows.windowed_value(None)
            MIN_TIMESTAMP = window.MIN_TIMESTAMP

            def reify_timestamps(element, timestamp=DoFn.TimestampParam):
                key, value = element
                if timestamp == MIN_TIMESTAMP:
                    timestamp = None
                return key, (value, timestamp)

            def restore_timestamps(element):
                key, values = element
                return [
                    globally_windowed.with_value((key, value)) if
                    timestamp is None else window.GlobalWindows.windowed_value(
                        (key, value), timestamp)
                    for (value, timestamp) in values
                ]
        else:

            # typing: All conditional function variants must have identical signatures
            def reify_timestamps(  # type: ignore[misc]
                    element,
                    timestamp=DoFn.TimestampParam,
                    window=DoFn.WindowParam):
                key, value = element
                # Transport the window as part of the value and restore it later.
                return key, windowed_value.WindowedValue(
                    value, timestamp, [window])

            def restore_timestamps(element):
                key, windowed_values = element
                return [
                    wv.with_value((key, wv.value)) for wv in windowed_values
                ]

        ungrouped = pcoll | Map(reify_timestamps).with_output_types(Any)

        # TODO(BEAM-8104) Using global window as one of the standard window.
        # This is to mitigate the Dataflow Java Runner Harness limitation to
        # accept only standard coders.
        ungrouped._windowing = Windowing(
            window.GlobalWindows(),
            triggerfn=Always(),
            accumulation_mode=AccumulationMode.DISCARDING,
            timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
        result = (ungrouped
                  | GroupByKey()
                  | FlatMap(restore_timestamps).with_output_types(Any))
        result._windowing = windowing_saved
        return result
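A short usage sketch, assuming the expand() above is the per-key reshuffle that backs Beam's Reshuffle transform: applying it redistributes elements across workers while preserving their original timestamps and windowing.

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create(range(10))
         | beam.Reshuffle()  # checkpoint: breaks fusion and rebalances work
         | beam.Map(print))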
Example #5
  def expand(self, pcoll):
    windowing_saved = pcoll.windowing
    if windowing_saved.is_default():
      # In this (common) case we can use a trivial trigger driver
      # and avoid the (expensive) window param.
      globally_windowed = window.GlobalWindows.windowed_value(None)
      window_fn = window.GlobalWindows()
      MIN_TIMESTAMP = window.MIN_TIMESTAMP

      def reify_timestamps(element, timestamp=DoFn.TimestampParam):
        key, value = element
        if timestamp == MIN_TIMESTAMP:
          timestamp = None
        return key, (value, timestamp)

      def restore_timestamps(element):
        key, values = element
        return [
            globally_windowed.with_value((key, value))
            if timestamp is None
            else window.GlobalWindows.windowed_value((key, value), timestamp)
            for (value, timestamp) in values]

    else:
      # The linter is confused.
      # hash(1) is used to force "runtime" selection of _IdentityWindowFn
      # pylint: disable=abstract-class-instantiated
      cls = hash(1) and _IdentityWindowFn
      window_fn = cls(
          windowing_saved.windowfn.get_window_coder())

      def reify_timestamps(element, timestamp=DoFn.TimestampParam):
        key, value = element
        return key, TimestampedValue(value, timestamp)

      def restore_timestamps(element, window=DoFn.WindowParam):
        # Pass the current window since _IdentityWindowFn wouldn't know how
        # to generate it.
        key, values = element
        return [
            windowed_value.WindowedValue(
                (key, value.value), value.timestamp, [window])
            for value in values]

    ungrouped = pcoll | Map(reify_timestamps)
    ungrouped._windowing = Windowing(
        window_fn,
        triggerfn=AfterCount(1),
        accumulation_mode=AccumulationMode.DISCARDING,
        timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
    result = (ungrouped
              | GroupByKey()
              | FlatMap(restore_timestamps))
    result._windowing = windowing_saved
    return result
Example #6
    def expand(self, pvalue):
        keyed_pc = (pvalue | 'AssignKey' >> Map(lambda x: (uuid.uuid4(), x)))
        if keyed_pc.windowing.windowfn.is_merging():
            raise ValueError(
                'Transform ReadAllFiles cannot be used in the presence '
                'of merging windows')
        if not isinstance(keyed_pc.windowing.triggerfn, DefaultTrigger):
            raise ValueError(
                'Transform ReadAllFiles cannot be used in the presence '
                'of non-trivial triggers')

        return (keyed_pc | 'GroupByKey' >> GroupByKey()
                # Using FlatMap below due to the possibility of key collisions.
                | 'DropKey' >> FlatMap(lambda kv: kv[1]))
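A short usage sketch, assuming the expand() above comes from the ReadAllFiles machinery that backs transforms such as ReadAllFromText: a PCollection of file patterns is expanded into a PCollection of the records read from the matched files. The bucket path is a hypothetical placeholder.

import apache_beam as beam
from apache_beam.io import ReadAllFromText

with beam.Pipeline() as p:
    _ = (p
         | beam.Create(['gs://my-bucket/logs/*.txt'])  # hypothetical file pattern
         | ReadAllFromText()
         | beam.Map(print))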
Example #7
    def expand(self, pcolls):
        """Performs CoGroupByKey on argument pcolls; see class docstring."""

        # For associating values in K-V pairs with the PCollections they came from.
        def _pair_tag_with_value(key_value, tag):
            (key, value) = key_value
            return (key, (tag, value))

        # Creates the key, value pairs for the output PCollection. Values are either
        # lists or dicts (per the class docstring), initialized by the result of
        # result_ctor(result_ctor_arg).
        def _merge_tagged_vals_under_key(key_grouped, result_ctor,
                                         result_ctor_arg):
            (key, grouped) = key_grouped
            result_value = result_ctor(result_ctor_arg)
            for tag, value in grouped:
                result_value[tag].append(value)
            return (key, result_value)

        try:
            # If pcolls is a dict, we turn it into (tag, pcoll) pairs for use in the
            # general-purpose code below. The result value constructor creates dicts
            # whose keys are the tags.
            result_ctor_arg = list(pcolls)
            result_ctor = lambda tags: dict((tag, []) for tag in tags)
            pcolls = pcolls.items()
        except AttributeError:
            # Otherwise, pcolls is a list/tuple, so we turn it into (index, pcoll)
            # pairs. The result value constructor makes tuples with len(pcolls) slots.
            pcolls = list(enumerate(pcolls))
            result_ctor_arg = len(pcolls)
            result_ctor = lambda size: tuple([] for _ in range(size))

        # Check input PCollections for PCollection-ness, and that they all belong
        # to the same pipeline.
        for _, pcoll in pcolls:
            self._check_pcollection(pcoll)
            if self.pipeline:
                assert pcoll.pipeline == self.pipeline

        return ([
            pcoll | 'pair_with_%s' % tag >> Map(_pair_tag_with_value, tag)
            for tag, pcoll in pcolls
        ]
                | Flatten(pipeline=self.pipeline)
                | GroupByKey()
                | Map(_merge_tagged_vals_under_key, result_ctor,
                      result_ctor_arg))
Example #8
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        self._log_startup(input_dict, output_dict, exec_properties)

        absl.logging.info('Hello Component - Executor - Do Start')

        assert (len(input_dict['input_data']) == 1)
        for artifact in input_dict['input_data']:
            input_dir = artifact.uri
            output_dir = artifact_utils.get_single_uri(
                output_dict['output_data'])

            input_uri = io_utils.all_files_pattern(input_dir)
            output_uri = os.path.join(output_dir, 'result.csv')

            with self._make_beam_pipeline() as p:
                intrim = p | 'ReadData' >> beam.io.ReadFromTFRecord(
                    file_pattern=input_uri,
                    coder=beam.coders.ProtoCoder(
                        prediction_log_pb2.PredictionLog))
                intrim = intrim | 'Process' >> beam.Map(process_item)
                intrim = intrim | 'SameKey' >> beam.Map(lambda it: (0, it))
                intrim = intrim | 'SameWindow' >> beam.WindowInto(
                    beam.window.GlobalWindows())
                intrim = intrim | 'GroupAll' >> GroupByKey()
                intrim = intrim | 'RemoveDummyKey' >> beam.Map(
                    lambda item: item[1])
                intrim = intrim | 'SortAll' >> beam.Map(sort_data)
                intrim = intrim | 'InMemorySink' >> beam.Map(
                    lambda item: write_data(item, output_uri))

            # intrim | 'Sink' >> beam.io.WriteToText(file_path_prefix=output_uri,
            #                                          file_name_suffix='.csv',
            #                                          num_shards=1,
            #                                          # CompressionTypes.UNCOMPRESSED,
            #                                          header='ID_code,target')

        absl.logging.info('Hello Component - Executor - Do End')
Example #9
            result_ctor_arg = len(pcolls)
            result_ctor = lambda size: tuple([] for _ in range(size))

        # Check input PCollections for PCollection-ness, and that they all belong
        # to the same pipeline.
        for _, pcoll in pcolls:
            self._check_pcollection(pcoll)
            if self.pipeline:
                assert pcoll.pipeline == self.pipeline

        return ([
            pcoll | 'pair_with_%s' % tag >> Map(_pair_tag_with_value, tag)
            for tag, pcoll in pcolls
        ]
                | Flatten(pipeline=self.pipeline)
                | GroupByKey()
                | Map(_merge_tagged_vals_under_key, result_ctor,
                      result_ctor_arg))


def Keys(label='Keys'):  # pylint: disable=invalid-name
    """Produces a PCollection of first elements of 2-tuples in a PCollection."""
    return label >> Map(lambda kv: kv[0])


def Values(label='Values'):  # pylint: disable=invalid-name
    """Produces a PCollection of second elements of 2-tuples in a PCollection."""
    return label >> Map(lambda kv: kv[1])


def KvSwap(label='KvSwap'):  # pylint: disable=invalid-name