示例#1
0
class AfterCount(TriggerFn):
  """Fire when there are at least count elements in this window pane.

  AfterCount is experimental. No backwards compatibility guarantees.

  The number of elements seen so far is accumulated in state under
  ``COUNT_TAG`` and compared against ``self.count`` in ``should_fire``.
  """

  # State tag under which the running element count is accumulated.
  COUNT_TAG = _CombiningValueStateTag('count', combiners.CountCombineFn())

  def __init__(self, count):
    """Create the trigger.

    Args:
      count: number of elements required before the trigger fires. Must be
        an integral number greater than or equal to 1.

    Raises:
      ValueError: if count is not an integral number or is less than 1.
    """
    if not isinstance(count, numbers.Integral) or count < 1:
      # Format with %r rather than %d: %d raises TypeError for non-numeric
      # values (masking the intended ValueError) and truncates floats, which
      # would hide the offending input. The 1-tuple keeps tuple arguments
      # from being unpacked by the % operator.
      raise ValueError("count (%r) must be a positive integer." % (count, ))
    self.count = count

  def __repr__(self):
    return 'AfterCount(%s)' % self.count

  def __eq__(self, other):
    # Equal only to triggers of exactly the same type with the same count.
    return type(self) == type(other) and self.count == other.count

  def __hash__(self):
    # Kept consistent with __eq__: equal triggers have equal counts.
    return hash(self.count)

  def on_element(self, element, window, context):
    """Record one more element in the count state."""
    context.add_state(self.COUNT_TAG, 1)

  def on_merge(self, to_be_merged, merge_result, context):
    """Nothing to do on window merge."""
    # states automatically merged
    pass

  def should_fire(self, time_domain, watermark, window, context):
    """Return True once the accumulated count has reached self.count."""
    return context.get_state(self.COUNT_TAG) >= self.count

  def on_fire(self, watermark, window, context):
    """Always return True."""
    return True

  def reset(self, window, context):
    """Clear the accumulated count."""
    context.clear_state(self.COUNT_TAG)

  @staticmethod
  def from_runner_api(proto, unused_context):
    """Build an AfterCount from the element_count field of a Trigger proto."""
    return AfterCount(proto.element_count.element_count)

  def to_runner_api(self, unused_context):
    """Return a Trigger proto whose element_count carries self.count."""
    return beam_runner_api_pb2.Trigger(
        element_count=beam_runner_api_pb2.Trigger.ElementCount(
            element_count=self.count))

  def has_ontime_pane(self):
    """Always return False."""
    return False
示例#2
0
class AfterCount(TriggerFn):
    """Fire when there are at least count elements in this window pane.

    The number of elements seen so far is accumulated in state under
    ``COUNT_TAG`` and compared against ``self.count`` in ``should_fire``.
    """

    # State tag under which the running element count is accumulated.
    COUNT_TAG = _CombiningValueStateTag('count', combiners.CountCombineFn())

    def __init__(self, count):
        """Create the trigger.

        Args:
            count: number of elements required before the trigger fires.
        """
        self.count = count

    def __repr__(self):
        return 'AfterCount(%s)' % self.count

    def __eq__(self, other):
        # Equal only to triggers of exactly the same type with the same count.
        return type(self) == type(other) and self.count == other.count

    def __hash__(self):
        # A class that defines __eq__ without __hash__ is unhashable in
        # Python 3; hash on the same field __eq__ compares so instances can be
        # used in sets and as dict keys.
        return hash(self.count)

    def on_element(self, element, window, context):
        """Record one more element in the count state."""
        context.add_state(self.COUNT_TAG, 1)

    def on_merge(self, to_be_merged, merge_result, context):
        """Nothing to do on window merge."""
        # states automatically merged
        pass

    def should_fire(self, watermark, window, context):
        """Return True once the accumulated count has reached self.count."""
        return context.get_state(self.COUNT_TAG) >= self.count

    def on_fire(self, watermark, window, context):
        """Always return True."""
        return True

    def reset(self, window, context):
        """Clear the accumulated count."""
        context.clear_state(self.COUNT_TAG)

    @staticmethod
    def from_runner_api(proto, unused_context):
        """Build an AfterCount from the element_count field of a Trigger proto."""
        return AfterCount(proto.element_count.element_count)

    def to_runner_api(self, unused_context):
        """Return a Trigger proto whose element_count carries self.count."""
        return beam_runner_api_pb2.Trigger(
            element_count=beam_runner_api_pb2.Trigger.ElementCount(
                element_count=self.count))
示例#3
0
class GeneralTriggerDriver(TriggerDriver):
  """Breaks a series of bundle and timer firings into window (pane)s.

  Suitable for all variants of Windowing.

  Per-window state used by this driver:
    ELEMENTS: values buffered for the window; read when a pane is emitted.
    TOMBSTONE: incremented when a pane is emitted with finished=True. Windows
      whose tombstone is truthy are skipped by later elements and timers.
    WATERMARK_HOLD (set per instance in __init__): output times recorded for
      buffered elements; its value is used as the timestamp of emitted panes.
  """
  ELEMENTS = ListStateTag('elements')
  TOMBSTONE = CombiningValueStateTag('tombstone', combiners.CountCombineFn())

  def __init__(self, windowing):
    """Captures the parts of `windowing` that the driver needs.

    Args:
      windowing: object exposing windowfn, output_time_fn, triggerfn and
        accumulation_mode attributes.
    """
    self.window_fn = windowing.windowfn
    self.output_time_fn_impl = OutputTimeFn.get_impl(windowing.output_time_fn,
                                                     self.window_fn)
    # The hold tag depends on the output time fn, so it is built per instance
    # rather than as a class constant (hence the constant-style name).
    # pylint: disable=invalid-name
    self.WATERMARK_HOLD = WatermarkHoldStateTag('watermark',
                                                self.output_time_fn_impl)
    # pylint: enable=invalid-name
    self.trigger_fn = windowing.triggerfn
    self.accumulation_mode = windowing.accumulation_mode
    # Hard-coded: the merging code path is always taken.
    # NOTE(review): presumably this could be derived from the window fn --
    # confirm before changing.
    self.is_merging = True

  def process_elements(self, state, windowed_values, output_watermark):
    """Buffers new elements and yields a pane for each window that fires.

    This is a generator; nothing happens until it is iterated.

    Args:
      state: per-window state store; wrapped in MergeableStateAdapter when
        self.is_merging is set.
      windowed_values: iterable of objects with `value`, `timestamp` and
        `windows` attributes.
      output_watermark: element output times earlier than this value do not
        contribute to the window's watermark hold.

    Yields:
      The WindowedValue built by _output for every window whose trigger
      reports should_fire after its new elements were added.
    """
    if self.is_merging:
      state = MergeableStateAdapter(state)

    # Group the incoming (value, timestamp) pairs by each window they are in.
    windows_to_elements = collections.defaultdict(list)
    for wv in windowed_values:
      for window in wv.windows:
        windows_to_elements[window].append((wv.value, wv.timestamp))

    # First handle merging.
    if self.is_merging:
      old_windows = set(state.known_windows())
      all_windows = old_windows.union(windows_to_elements.keys())

      # Merging is only attempted when this bundle introduced a window that
      # was not already known to the state.
      if all_windows != old_windows:
        # Maps each window that was merged away to the window it merged into.
        merged_away = {}

        class TriggerMergeContext(WindowFn.MergeContext):

          def merge(_, to_be_merged, merge_result):  # pylint: disable=no-self-argument
            for window in to_be_merged:
              if window != merge_result:
                merged_away[window] = merge_result
            state.merge(to_be_merged, merge_result)
            # using the outer self argument.
            self.trigger_fn.on_merge(
                to_be_merged, merge_result, state.at(merge_result))

        self.window_fn.merge(TriggerMergeContext(all_windows))

        # Re-bucket the new elements under their final windows. A merge
        # result may itself have been merged away later, so follow the chain.
        merged_windows_to_elements = collections.defaultdict(list)
        for window, values in windows_to_elements.items():
          while window in merged_away:
            window = merged_away[window]
          merged_windows_to_elements[window].extend(values)
        windows_to_elements = merged_windows_to_elements

        # Drop the watermark holds of every window that was merged away.
        for window in merged_away:
          state.clear_state(window, self.WATERMARK_HOLD)

    # Next handle element adding.
    for window, elements in windows_to_elements.items():
      # Windows that already emitted a finished pane take no more elements.
      if state.get_state(window, self.TOMBSTONE):
        continue
      # Add watermark hold.
      # TODO(ccy): Add late data and garbage-collection hold support.
      # Only output times at or after output_watermark are merged into the
      # hold; output_time is compared against None below to detect the case
      # where there is nothing to hold.
      output_time = self.output_time_fn_impl.merge(
          window,
          (element_output_time for element_output_time in
           (self.output_time_fn_impl.assign_output_time(window, timestamp)
            for unused_value, timestamp in elements)
           if element_output_time >= output_watermark))
      if output_time is not None:
        state.add_state(window, self.WATERMARK_HOLD, output_time)

      # Buffer each value and let the trigger observe it.
      context = state.at(window)
      for value, unused_timestamp in elements:
        state.add_state(window, self.ELEMENTS, value)
        self.trigger_fn.on_element(value, window, context)

      # Maybe fire this window.
      # Element-driven firings always pass MIN_TIMESTAMP as the watermark.
      watermark = MIN_TIMESTAMP
      if self.trigger_fn.should_fire(watermark, window, context):
        finished = self.trigger_fn.on_fire(watermark, window, context)
        yield self._output(window, finished, state)

  def process_timer(self, window_id, unused_name, time_domain, timestamp,
                    state):
    """Handles a timer firing, yielding a pane if the trigger fires.

    This is a generator, so the exception below is raised when the result is
    iterated, not when process_timer is called.

    Args:
      window_id: identifier resolved to a window through state.get_window.
      unused_name: ignored.
      time_domain: only TimeDomain.WATERMARK is accepted.
      timestamp: passed to the trigger as the watermark.
      state: per-window state store; wrapped in MergeableStateAdapter when
        self.is_merging is set.

    Yields:
      The WindowedValue built by _output, if the trigger fires.

    Raises:
      Exception: if time_domain is not TimeDomain.WATERMARK.
    """
    if self.is_merging:
      state = MergeableStateAdapter(state)
    window = state.get_window(window_id)
    # Timers for windows that already emitted a finished pane are ignored.
    if state.get_state(window, self.TOMBSTONE):
      return
    if time_domain == TimeDomain.WATERMARK:
      # When merging, only windows still known to the state are considered.
      if not self.is_merging or window in state.known_windows():
        context = state.at(window)
        if self.trigger_fn.should_fire(timestamp, window, context):
          finished = self.trigger_fn.on_fire(timestamp, window, context)
          yield self._output(window, finished, state)
    else:
      raise Exception('Unexpected time domain: %s' % time_domain)

  def _output(self, window, finished, state):
    """Output window and clean up if appropriate.

    Args:
      window: the window whose pane is being emitted.
      finished: value returned by the trigger's on_fire; when truthy the
        window is tombstoned.
      state: per-window state store.

    Returns:
      WindowedValue holding the buffered values, stamped with the watermark
      hold (or window.end when no hold is set), in `window`.
    """

    # Read the buffered values before any of the clearing below.
    values = state.get_state(window, self.ELEMENTS)
    if finished:
      # TODO(robertwb): allowed lateness
      state.clear_state(window, self.ELEMENTS)
      state.add_state(window, self.TOMBSTONE, 1)
    elif self.accumulation_mode == AccumulationMode.DISCARDING:
      # In discarding mode the buffer is emptied after each pane; in any
      # other mode the values stay buffered for the next pane.
      state.clear_state(window, self.ELEMENTS)

    timestamp = state.get_state(window, self.WATERMARK_HOLD)
    if timestamp is None:
      # If no watermark hold was set, output at end of window.
      timestamp = window.end
    else:
      # The hold has been consumed as this pane's timestamp.
      state.clear_state(window, self.WATERMARK_HOLD)

    return WindowedValue(values, timestamp, (window,))
示例#4
0
class GeneralTriggerDriver(TriggerDriver):
  """Breaks a series of bundle and timer firings into window (pane)s.

  Suitable for all variants of Windowing.

  Per-window state used by this driver:
    ELEMENTS: values buffered for the window; read when a pane is emitted.
    TOMBSTONE: incremented when a pane is emitted with finished=True. Windows
      whose tombstone is truthy are skipped by later elements and timers.
    INDEX: incremented once per emitted pane; its value before the increment
      is the pane's index.
    NONSPECULATIVE_INDEX: incremented for panes emitted once the output
      watermark is past window.max_timestamp() (see _output).
    WATERMARK_HOLD (set per instance in __init__): output times recorded for
      buffered elements; its value is used as the timestamp of emitted panes.
  """
  ELEMENTS = _ListStateTag('elements')
  TOMBSTONE = _CombiningValueStateTag('tombstone', combiners.CountCombineFn())
  INDEX = _CombiningValueStateTag('index', combiners.CountCombineFn())
  NONSPECULATIVE_INDEX = _CombiningValueStateTag(
      'nonspeculative_index', combiners.CountCombineFn())

  def __init__(self, windowing, clock):
    """Captures the parts of `windowing` that the driver needs.

    Args:
      windowing: object exposing allowed_lateness, windowfn,
        timestamp_combiner, triggerfn and accumulation_mode attributes.
      clock: passed to state.at() whenever a trigger context is created.
    """
    self.clock = clock
    self.allowed_lateness = windowing.allowed_lateness
    self.window_fn = windowing.windowfn
    self.timestamp_combiner_impl = TimestampCombiner.get_impl(
        windowing.timestamp_combiner, self.window_fn)
    # The hold tag depends on the timestamp combiner, so it is built per
    # instance rather than as a class constant (hence the constant-style name).
    # pylint: disable=invalid-name
    self.WATERMARK_HOLD = _WatermarkHoldStateTag(
        'watermark', self.timestamp_combiner_impl)
    # pylint: enable=invalid-name
    self.trigger_fn = windowing.triggerfn
    self.accumulation_mode = windowing.accumulation_mode
    # Hard-coded: the merging code path is always taken.
    # NOTE(review): presumably this could be derived from the window fn --
    # confirm before changing.
    self.is_merging = True

  def process_elements(
      self,
      state,
      windowed_values,
      output_watermark,
      input_watermark=MIN_TIMESTAMP):
    """Buffers new elements and yields a pane for each window that fires.

    This is a generator; nothing happens until it is iterated.

    Args:
      state: per-window state store; wrapped in MergeableStateAdapter when
        self.is_merging is set.
      windowed_values: iterable of objects with `value`, `timestamp` and
        `windows` attributes.
      output_watermark: element output times earlier than this value do not
        contribute to the window's watermark hold; also forwarded to _output.
      input_watermark: used to drop expired windows, passed to the trigger's
        should_fire/on_fire, and forwarded to _output.

    Yields:
      The WindowedValue built by _output for every window whose trigger
      reports should_fire after its new elements were added.
    """
    if self.is_merging:
      state = MergeableStateAdapter(state)

    # Group the incoming (value, timestamp) pairs by each window they are in.
    windows_to_elements = collections.defaultdict(list)
    for wv in windowed_values:
      for window in wv.windows:
        # ignore expired windows
        # (expired: the input watermark is past window.end + allowed_lateness)
        if input_watermark > window.end + self.allowed_lateness:
          continue
        windows_to_elements[window].append((wv.value, wv.timestamp))

    # First handle merging.
    if self.is_merging:
      old_windows = set(state.known_windows())
      all_windows = old_windows.union(list(windows_to_elements))

      # Merging is only attempted when this bundle introduced a window that
      # was not already known to the state.
      if all_windows != old_windows:
        # Maps each window that was merged away to the window it merged into.
        merged_away = {}

        class TriggerMergeContext(WindowFn.MergeContext):
          def merge(_, to_be_merged, merge_result):  # pylint: disable=no-self-argument
            for window in to_be_merged:
              if window != merge_result:
                merged_away[window] = merge_result
                # Clear state associated with PaneInfo since it is
                # not preserved across merges.
                state.clear_state(window, self.INDEX)
                state.clear_state(window, self.NONSPECULATIVE_INDEX)
            state.merge(to_be_merged, merge_result)
            # using the outer self argument.
            self.trigger_fn.on_merge(
                to_be_merged, merge_result, state.at(merge_result, self.clock))

        self.window_fn.merge(TriggerMergeContext(all_windows))

        # Re-bucket the new elements under their final windows. A merge
        # result may itself have been merged away later, so follow the chain.
        merged_windows_to_elements = collections.defaultdict(list)
        for window, values in windows_to_elements.items():
          while window in merged_away:
            window = merged_away[window]
          merged_windows_to_elements[window].extend(values)
        windows_to_elements = merged_windows_to_elements

        # Drop the watermark holds of every window that was merged away.
        for window in merged_away:
          state.clear_state(window, self.WATERMARK_HOLD)

    # Next handle element adding.
    for window, elements in windows_to_elements.items():
      # Windows that already emitted a finished pane take no more elements.
      if state.get_state(window, self.TOMBSTONE):
        continue
      # Add watermark hold.
      # TODO(ccy): Add late data and garbage-collection hold support.
      # Only output times at or after output_watermark are merged into the
      # hold; output_time is compared against None below to detect the case
      # where there is nothing to hold.
      output_time = self.timestamp_combiner_impl.merge(
          window,
          (
              element_output_time for element_output_time in (
                  self.timestamp_combiner_impl.assign_output_time(
                      window, timestamp) for unused_value,
                  timestamp in elements)
              if element_output_time >= output_watermark))
      if output_time is not None:
        state.add_state(window, self.WATERMARK_HOLD, output_time)

      # Buffer each value and let the trigger observe it.
      context = state.at(window, self.clock)
      for value, unused_timestamp in elements:
        state.add_state(window, self.ELEMENTS, value)
        self.trigger_fn.on_element(value, window, context)

      # Maybe fire this window.
      # Element-driven firings are evaluated in the WATERMARK time domain at
      # the input watermark, and are never reported as possibly on-time
      # (maybe_ontime=False below).
      if self.trigger_fn.should_fire(TimeDomain.WATERMARK,
                                     input_watermark,
                                     window,
                                     context):
        finished = self.trigger_fn.on_fire(input_watermark, window, context)
        yield self._output(
            window, finished, state, input_watermark, output_watermark, False)

  def process_timer(
      self,
      window_id,
      unused_name,
      time_domain,
      timestamp,
      state,
      input_watermark=None):
    """Handles a timer firing, yielding a pane if the trigger fires.

    This is a generator, so the exception below is raised when the result is
    iterated, not when process_timer is called.

    Args:
      window_id: identifier resolved to a window through state.get_window.
      unused_name: ignored.
      time_domain: TimeDomain.WATERMARK or TimeDomain.REAL_TIME.
      timestamp: passed to the trigger's should_fire/on_fire, and to _output
        as its output_watermark argument.
      state: per-window state store; wrapped in MergeableStateAdapter when
        self.is_merging is set.
      input_watermark: forwarded to _output; defaults to `timestamp` when
        None.

    Yields:
      The WindowedValue built by _output, if the trigger fires.

    Raises:
      Exception: if time_domain is neither WATERMARK nor REAL_TIME.
    """
    if input_watermark is None:
      input_watermark = timestamp

    if self.is_merging:
      state = MergeableStateAdapter(state)
    window = state.get_window(window_id)
    # Timers for windows that already emitted a finished pane are ignored.
    if state.get_state(window, self.TOMBSTONE):
      return

    if time_domain in (TimeDomain.WATERMARK, TimeDomain.REAL_TIME):
      # When merging, only windows still known to the state are considered.
      if not self.is_merging or window in state.known_windows():
        context = state.at(window, self.clock)
        if self.trigger_fn.should_fire(time_domain, timestamp, window, context):
          finished = self.trigger_fn.on_fire(timestamp, window, context)
          # Only watermark timers can produce an ON_TIME pane (last argument).
          yield self._output(
              window,
              finished,
              state,
              input_watermark,
              timestamp,
              time_domain == TimeDomain.WATERMARK)
    else:
      raise Exception('Unexpected time domain: %s' % time_domain)

  def _output(
      self,
      window,
      finished,
      state,
      input_watermark,
      output_watermark,
      maybe_ontime):
    """Output window and clean up if appropriate.

    Args:
      window: the window whose pane is being emitted.
      finished: value returned by the trigger's on_fire; when truthy the
        window is tombstoned.
      state: per-window state store.
      input_watermark: compared with window.end to decide whether the
        watermark hold is kept.
      output_watermark: compared with window.max_timestamp() to decide
        whether the pane is EARLY.
      maybe_ontime: whether the pane may be classified ON_TIME; otherwise a
        non-early pane is LATE.

    Returns:
      WindowedValue holding the buffered values, stamped with the watermark
      hold (or window.max_timestamp() when no hold is set), in `window`, with
      the computed PaneInfo.
    """
    # Pane index is the count of panes emitted before this one.
    index = state.get_state(window, self.INDEX)
    state.add_state(window, self.INDEX, 1)
    if output_watermark <= window.max_timestamp():
      # The output watermark has not passed the window: an EARLY pane, which
      # normally carries no non-speculative index (-1).
      nonspeculative_index = -1
      timing = windowed_value.PaneInfoTiming.EARLY
      # A non-zero non-speculative count here means a non-early pane was
      # already emitted for this window; keep counting and warn.
      if state.get_state(window, self.NONSPECULATIVE_INDEX):
        nonspeculative_index = state.get_state(
            window, self.NONSPECULATIVE_INDEX)
        state.add_state(window, self.NONSPECULATIVE_INDEX, 1)
        _LOGGER.warning(
            'Watermark moved backwards in time '
            'or late data moved window end forward.')
    else:
      nonspeculative_index = state.get_state(window, self.NONSPECULATIVE_INDEX)
      state.add_state(window, self.NONSPECULATIVE_INDEX, 1)
      # ON_TIME only for the first non-early pane, and only when the caller
      # allows it; every other non-early pane is LATE.
      timing = (
          windowed_value.PaneInfoTiming.ON_TIME if maybe_ontime and
          nonspeculative_index == 0 else windowed_value.PaneInfoTiming.LATE)
    # NOTE(review): positional arguments are presumably (is_first, is_last,
    # timing, index, nonspeculative_index) -- confirm against PaneInfo.
    pane_info = windowed_value.PaneInfo(
        index == 0, finished, timing, index, nonspeculative_index)

    # Read the buffered values before any of the clearing below.
    values = state.get_state(window, self.ELEMENTS)
    if finished:
      # TODO(robertwb): allowed lateness
      state.clear_state(window, self.ELEMENTS)
      state.add_state(window, self.TOMBSTONE, 1)
    elif self.accumulation_mode == AccumulationMode.DISCARDING:
      # In discarding mode the buffer is emptied after each pane; in any
      # other mode the values stay buffered for the next pane.
      state.clear_state(window, self.ELEMENTS)

    timestamp = state.get_state(window, self.WATERMARK_HOLD)
    if timestamp is None:
      # If no watermark hold was set, output at end of window.
      timestamp = window.max_timestamp()
    elif input_watermark < window.end and self.trigger_fn.has_ontime_pane():
      # Hold the watermark in case there is an empty pane that needs to be fired
      # at the end of the window.
      pass
    else:
      state.clear_state(window, self.WATERMARK_HOLD)

    return WindowedValue(values, timestamp, (window, ), pane_info)
示例#5
0
def run():
    """Build and run the PART-4 toll report pipeline.

    Reads ``tollbooth_logs.csv`` and ``squirreliwink_population.json`` from
    the ``--input`` folder, aggregates toll records per
    (license_plate, month), enriches each aggregate with the resident's
    family name and address, and writes the result as newline-delimited JSON
    to ``report*.json`` in the ``--output`` folder. Any ``report*`` files from
    a previous run are deleted first. Command line arguments not recognised
    here are forwarded to Beam's PipelineOptions.
    """
    print("Town of Squirreliwink Bureau Of Tolls and Nuts Affair\n\n[PART-4]")

    # parse command line args:
    #   - parse both beam args and known script args
    parser = argparse.ArgumentParser(
        description="Town of Squirreliwink Bureau Of Tolls and Nuts Affair")
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        default='./data/input',
                        help='Input folder')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        default='./data/output',
                        help='Output folder')
    # Skip argv[0]: argparse expects only the arguments, and passing the whole
    # of sys.argv would leak the script path into beam_args as an unknown
    # argument.
    known_args, beam_args = parser.parse_known_args(sys.argv[1:])

    # delete previous run files
    delete_files(os.path.join(known_args.output, "report*"))

    # construct pipeline and run
    options = PipelineOptions(beam_args)
    with beam.Pipeline(options=options) as pipeline:
        # create a pcollection of nut prices
        logger.info("creating nut prices side input")
        nut_prices = (pipeline
                      | beam.Create([('cornsilk', 2.0), ('slate_gray', 3.5),
                                     ('navajo_white', 7.0)]))

        # read toll records and pass in nut prices as a side_input
        # you can convert a (k, v) tuple pcollection into a {k: v} with beam.pvalue.AsDict()
        logger.info("reading toll records")
        records = (pipeline
                   | beam.io.ReadFromText(os.path.join(known_args.input,
                                                       'tollbooth_logs.csv'),
                                          skip_header_lines=1)
                   | beam.Map(parse_csv)
                   | beam.ParDo(PrepareAndAddTotalsWithSideInput(),
                                nut_prices=beam.pvalue.AsDict(nut_prices)))

        # multi-keys multi-values combiner by using beam.combiners.TupleCombineFn()
        # first normalize rows into ((license_plate, month), (1, total, cornsilk, slate gray, navajo white, total)) tuple
        # then apply a tuple of combiners over values
        records = (records
                   | beam.Map(key_by_license_plate_month)
                   | beam.CombinePerKey(
                       beam.combiners.TupleCombineFn(combine.CountCombineFn(),
                                                     sum, sum, sum, sum,
                                                     combine.MeanCombineFn())))

        # read squirreliwink population file
        # file consist of newline delimited json rows. read each json row as dict
        logger.info("reading Squirreliwink's residents file")
        residents = (pipeline
                     | "residents" >> beam.io.ReadFromText(
                         os.path.join(known_args.input,
                                      'squirreliwink_population.json'))
                     | beam.Map(lambda line: json.loads(line)))

        # key residents by their license plate
        logger.info("key residents by license_plate")
        residents_by_plate = (
            residents | beam.Map(lambda element: (element['car'], element)))

        # lookup residents by their license plate using SideInputs
        records = (
            records
            | beam.Map(
                lambda e, lookup: (
                    # add family_name and address from resident lookup to the keys tuple.
                    # Remember e[0][0] (first value in the keys tuple) should contain our license_plate info.
                    # Fields are read by name, in the order the output columns
                    # below expect, so the result does not depend on the key
                    # order of the resident JSON; a missing field becomes None
                    # instead of shifting the remaining columns.
                    e[0] + tuple(
                        lookup[e[0][0]].get(field)
                        for field in ('family_name', 'address')),
                    e[1]),
                lookup=beam.pvalue.AsDict(residents_by_plate)
            )  # pass in residents info as a SideInput
        )

        # (records | beam.Map(print))

        # output to a newline delimited json file
        logger.info("output record into json file")
        (records
         | beam.Map(
             lambda e: e[0] + e[1]
         )  # flatten ((keys), (values)) tuple into a single tuple (keys + values)
         | beam.Map(lambda t: dict(
             zip(  # stitch up the results as a dict, adding back column names
                 ('license_plate', 'month', 'family_name', 'address',
                  'visit_count', 'total', 'cornsilk', 'slate_gray',
                  'navajo_white', 'avg_total'), t)))
         | beam.Map(lambda d: json.dumps(d, ensure_ascii=False)
                    )  # json output the results
         | beam.io.WriteToText(os.path.join(known_args.output, "report"),
                               file_name_suffix='.json'))