# --- Example 1 ---
 def monitoring_infos(self, transform_id):
   # type: (str) -> Dict[FrozenSet, metrics_pb2.MonitoringInfo]
   """Returns this transform's monitoring infos, plus progress gauges.

   While an element is being processed, two extra gauges are attached:
   WORK_COMPLETED and WORK_REMAINING, taken from the current element's
   progress (absolute work counts when available, otherwise fractions).
   """
   with self.lock:
     infos = super(SdfProcessSizedElements,
                   self).monitoring_infos(transform_id)
     progress = self.current_element_progress()
     if progress:
       # Prefer absolute work amounts; fall back to fractional progress.
       if progress.completed_work:
         done = progress.completed_work
         left = progress.remaining_work
       else:
         done = progress.fraction_completed
         left = progress.fraction_remaining
       assert done is not None
       assert left is not None
       for urn, value in ((monitoring_infos.WORK_COMPLETED_URN, done),
                          (monitoring_infos.WORK_REMAINING_URN, left)):
         info = metrics_pb2.MonitoringInfo(
             urn=urn,
             type=monitoring_infos.LATEST_DOUBLES_TYPE,
             labels=monitoring_infos.create_labels(ptransform=transform_id),
             payload=coders.FloatCoder().get_impl().encode_nested(value),
             timestamp=monitoring_infos.to_timestamp_proto(time.time()))
         infos[monitoring_infos.to_key(info)] = info
   return infos
 def test_serialization(self):
   """Coder ids stay resolvable after a runner-API proto round trip."""
   original = pipeline_context.PipelineContext()
   float_id = original.coders.get_id(coders.FloatCoder())
   bytes_id = original.coders.get_id(coders.BytesCoder())
   # Serialize the context and rebuild it from the resulting proto.
   restored = pipeline_context.PipelineContext.from_runner_api(
       original.to_runner_api())
   self.assertEqual(
       coders.FloatCoder(), restored.coders.get_by_id(float_id))
   self.assertEqual(
       coders.BytesCoder(), restored.coders.get_by_id(bytes_id))
# --- Example 3 ---
  def test_common_id_assignment(self):
    """Contexts sharing a component id map hand out identical coder ids."""
    first = pipeline_context.PipelineContext()
    float_id = first.coders.get_id(coders.FloatCoder())
    bytes_id = first.coders.get_id(coders.BytesCoder())
    second = pipeline_context.PipelineContext(
        component_id_map=first.component_id_map)

    # Request the coders in the opposite order; ids must still agree.
    bytes_id_again = second.coders.get_id(coders.BytesCoder())
    float_id_again = second.coders.get_id(coders.FloatCoder())

    self.assertEqual(bytes_id, bytes_id_again)
    self.assertEqual(float_id, float_id_again)
def expand_sdf(stages, context):
  """Transforms splittable DoFns into pair+split+read stages.

  Each stage holding a splittable ParDo is replaced by three consecutive
  stages — PairWithRestriction, SplitAndSizeRestriction and
  ProcessSizedElementsAndRestrictions — connected through freshly
  registered intermediate PCollections with the appropriate KV coders.

  Args:
    stages: iterable of single-transform Stage objects.
    context: pipeline context holding the shared runner-API components.

  Yields:
    Replacement stages; non-splittable stages are passed through unchanged.
  """
  for stage in stages:
    assert len(stage.transforms) == 1
    transform = stage.transforms[0]
    if transform.spec.urn == common_urns.primitives.PAR_DO.urn:

      pardo_payload = proto_utils.parse_Bytes(
          transform.spec.payload, beam_runner_api_pb2.ParDoPayload)

      if pardo_payload.splittable:

        def copy_like(protos, original, suffix='_copy', **kwargs):
          """Registers a copy of original in protos under a fresh unique id.

          original may be an existing key into protos or a proto message;
          kwargs override fields (dict/list fields are replaced wholesale,
          'urn' targets proto.spec.urn).
          """
          # BUG FIX: 'unicode' does not exist on Python 3 and raised a
          # NameError here.  Include it only under Python 2 (where
          # str is bytes), since py2 protobuf string fields are unicode.
          string_types = (str, unicode) if str is bytes else (str,)
          if isinstance(original, string_types):
            key = original
            original = protos[original]
          else:
            key = 'component'
          new_id = unique_name(protos, key + suffix)
          protos[new_id].CopyFrom(original)
          proto = protos[new_id]
          for name, value in kwargs.items():
            if isinstance(value, dict):
              getattr(proto, name).clear()
              getattr(proto, name).update(value)
            elif isinstance(value, list):
              del getattr(proto, name)[:]
              getattr(proto, name).extend(value)
            elif name == 'urn':
              proto.spec.urn = value
            else:
              setattr(proto, name, value)
          return new_id

        def make_stage(base_stage, transform_id, extra_must_follow=()):
          """Wraps an already-registered transform in a Stage derived
          from base_stage (same side inputs and environment)."""
          transform = context.components.transforms[transform_id]
          return Stage(
              transform.unique_name,
              [transform],
              base_stage.downstream_side_inputs,
              union(base_stage.must_follow, frozenset(extra_must_follow)),
              parent=base_stage,
              environment=base_stage.environment)

        main_input_tag = only_element(tag for tag in transform.inputs.keys()
                                      if tag not in pardo_payload.side_inputs)
        main_input_id = transform.inputs[main_input_tag]
        element_coder_id = context.components.pcollections[
            main_input_id].coder_id
        # KV[element, restriction]
        paired_coder_id = context.add_or_get_coder_id(
            beam_runner_api_pb2.Coder(
                spec=beam_runner_api_pb2.SdkFunctionSpec(
                    spec=beam_runner_api_pb2.FunctionSpec(
                        urn=common_urns.coders.KV.urn)),
                component_coder_ids=[element_coder_id,
                                     pardo_payload.restriction_coder_id]))
        # KV[KV[element, restriction], double] — the double is the size.
        sized_coder_id = context.add_or_get_coder_id(
            beam_runner_api_pb2.Coder(
                spec=beam_runner_api_pb2.SdkFunctionSpec(
                    spec=beam_runner_api_pb2.FunctionSpec(
                        urn=common_urns.coders.KV.urn)),
                component_coder_ids=[
                    paired_coder_id,
                    context.add_or_get_coder_id(
                        coders.FloatCoder().to_runner_api(None),
                        'doubles_coder')
                ]))

        paired_pcoll_id = copy_like(
            context.components.pcollections,
            main_input_id,
            '_paired',
            coder_id=paired_coder_id)
        pair_transform_id = copy_like(
            context.components.transforms,
            transform,
            unique_name=transform.unique_name + '/PairWithRestriction',
            urn=common_urns.sdf_components.PAIR_WITH_RESTRICTION.urn,
            outputs={'out': paired_pcoll_id})

        split_pcoll_id = copy_like(
            context.components.pcollections,
            main_input_id,
            '_split',
            coder_id=sized_coder_id)
        split_transform_id = copy_like(
            context.components.transforms,
            transform,
            unique_name=transform.unique_name + '/SplitAndSizeRestriction',
            urn=common_urns.sdf_components.SPLIT_AND_SIZE_RESTRICTIONS.urn,
            inputs=dict(transform.inputs, **{main_input_tag: paired_pcoll_id}),
            outputs={'out': split_pcoll_id})

        process_transform_id = copy_like(
            context.components.transforms,
            transform,
            unique_name=transform.unique_name + '/Process',
            urn=
            common_urns.sdf_components.PROCESS_SIZED_ELEMENTS_AND_RESTRICTIONS
            .urn,
            inputs=dict(transform.inputs, **{main_input_tag: split_pcoll_id}))

        yield make_stage(stage, pair_transform_id)
        split_stage = make_stage(stage, split_transform_id)
        yield split_stage
        # Process must be scheduled after the split stage that feeds it.
        yield make_stage(
            stage, process_transform_id, extra_must_follow=[split_stage])

      else:
        yield stage

    else:
      yield stage
# --- Example 5 ---
 def encode_progress(value):
   # type: (float) -> bytes
   """Encodes a single progress value as a one-element float iterable."""
   return coders.IterableCoder(coders.FloatCoder()).encode([value])