def expand(self, pbegin):
  assert isinstance(pbegin, pvalue.PBegin)
  self.pipeline = pbegin.pipeline
  if not self.output_tags:
    self.output_tags = set([None])

  # For backwards compatibility, return a single PCollection.
  if len(self.output_tags) == 1:
    return pvalue.PCollection(
        self.pipeline, is_bounded=False, tag=list(self.output_tags)[0])
  return {
      tag: pvalue.PCollection(self.pipeline, is_bounded=False, tag=tag)
      for tag in self.output_tags
  }
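For context, the dict of tagged PCollections returned above mirrors what pipeline authors see from the public multi-output API. A minimal, self-contained sketch (the function and tag names are illustrative, not from the snippet):

import apache_beam as beam

def route(x):
  # Elements wrapped in TaggedOutput go to the named output;
  # everything else goes to the main output.
  if x % 2:
    yield beam.pvalue.TaggedOutput('odd', x)
  else:
    yield x

with beam.Pipeline() as p:
  results = (
      p
      | beam.Create([1, 2, 3, 4])
      | beam.ParDo(route).with_outputs('odd', main='even'))
  evens, odds = results.even, results.odd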
def start_bundle(self):
  transform = self._applied_ptransform.transform

  self._tagged_receivers = _TaggedReceivers(self._evaluation_context)
  for output_tag in self._applied_ptransform.outputs:
    output_pcollection = pvalue.PCollection(None, tag=output_tag)
    output_pcollection.producer = self._applied_ptransform
    self._tagged_receivers[output_tag] = (
        self._evaluation_context.create_bundle(output_pcollection))
    self._tagged_receivers[output_tag].tag = output_tag

  self._counter_factory = counters.CounterFactory()

  # TODO(aaltay): Consider storing the serialized form as an optimization.
  dofn = pickler.loads(pickler.dumps(transform.dofn))

  pipeline_options = self._evaluation_context.pipeline_options
  if (pipeline_options is not None
      and pipeline_options.view_as(TypeOptions).runtime_type_check):
    dofn = TypeCheckWrapperDoFn(dofn, transform.get_type_hints())

  dofn = OutputCheckWrapperDoFn(dofn, self._applied_ptransform.full_label)
  self.runner = DoFnRunner(
      dofn, transform.args, transform.kwargs,
      self._side_inputs,
      self._applied_ptransform.inputs[0].windowing,
      tagged_receivers=self._tagged_receivers,
      step_name=self._applied_ptransform.full_label,
      state=DoFnState(self._counter_factory),
      scoped_metrics_container=self.scoped_metrics_container)
  self.runner.start()
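For context, a minimal, self-contained sketch of the user-facing DoFn lifecycle that the DoFnRunner above drives (the class and pipeline are illustrative, not part of the evaluator):

import apache_beam as beam

class BundleAwareDoFn(beam.DoFn):
  def start_bundle(self):
    # Called once per bundle, before any process() call; this is the hook
    # the runner-side start_bundle() above ultimately exercises.
    self._seen = 0

  def process(self, element):
    self._seen += 1
    yield element

  def finish_bundle(self):
    # Called once per bundle, after the last process() call.
    pass

with beam.Pipeline() as p:
  _ = p | beam.Create([1, 2, 3]) | beam.ParDo(BundleAwareDoFn())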
def start_bundle(self):
  transform = self._applied_ptransform.transform

  self._tagged_receivers = _TaggedReceivers(self._evaluation_context)
  for output_tag in self._applied_ptransform.outputs:
    output_pcollection = pvalue.PCollection(None, tag=output_tag)
    output_pcollection.producer = self._applied_ptransform
    self._tagged_receivers[output_tag] = (
        self._evaluation_context.create_bundle(output_pcollection))
    self._tagged_receivers[output_tag].tag = output_tag

  self._counter_factory = counters.CounterFactory()

  # TODO(aaltay): Consider storing the serialized form as an optimization.
  dofn = (pickler.loads(pickler.dumps(transform.dofn))
          if self._perform_dofn_pickle_test else transform.dofn)

  args = transform.args if hasattr(transform, 'args') else []
  kwargs = transform.kwargs if hasattr(transform, 'kwargs') else {}

  self.runner = DoFnRunner(
      dofn, args, kwargs,
      self._side_inputs,
      self._applied_ptransform.inputs[0].windowing,
      tagged_receivers=self._tagged_receivers,
      step_name=self._applied_ptransform.full_label,
      state=DoFnState(self._counter_factory))
  self.runner.start()
def expand(self, pbegin):
  from apache_beam.options.pipeline_options import DebugOptions
  from apache_beam.transforms import util

  assert isinstance(pbegin, pvalue.PBegin)
  self.pipeline = pbegin.pipeline

  debug_options = self.pipeline._options.view_as(DebugOptions)
  if debug_options.experiments and 'beam_fn_api' in debug_options.experiments:
    source = self.source

    def split_source(unused_impulse):
      total_size = source.estimate_size()
      if total_size:
        # 1MB = 1 shard, 1GB = 32 shards, 1TB = 1000 shards, 1PB = 32k shards
        chunk_size = max(1 << 20, 1000 * int(math.sqrt(total_size)))
      else:
        chunk_size = 64 << 20  # 64MB
      return source.split(chunk_size)

    return (
        pbegin
        | core.Impulse()
        | 'Split' >> core.FlatMap(split_source)
        | util.Reshuffle()
        | 'ReadSplits' >> core.FlatMap(lambda split: split.source.read(
            split.source.get_range_tracker(
                split.start_position, split.stop_position))))
  else:
    # Treat Read itself as a primitive.
    return pvalue.PCollection(self.pipeline)
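The sizing comment above can be sanity-checked in isolation: with chunk_size = max(1 MiB, 1000 * sqrt(total_size)), the shard count grows roughly with the square root of the input size. A standalone sketch (the helper name is illustrative):

import math

def chunk_size(total_size):
  # Mirrors the heuristic above: at least 1 MiB per chunk, growing with
  # the square root of the total input size.
  return max(1 << 20, 1000 * int(math.sqrt(total_size)))

for label, size in [('1MB', 1 << 20), ('1GB', 1 << 30), ('1TB', 1 << 40)]:
  print(label, size // chunk_size(size))  # -> 1, 32, and ~1000 shards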
def apply_CombineValues(self, transform, pcoll):
  # TODO(BEAM-2937): Disable combiner lifting for fnapi. Remove this
  # restriction once this feature is supported in the Dataflow runner
  # harness.
  # Import here to avoid adding the dependency for local running scenarios.
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam.runners.dataflow.internal import apiclient
  if apiclient._use_fnapi(pcoll.pipeline._options):
    return self.apply_PTransform(transform, pcoll)
  return pvalue.PCollection(pcoll.pipeline)
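For reference, a minimal usage sketch of the CombineValues transform being special-cased above (the data is illustrative):

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([('a', [1, 2, 3]), ('b', [4, 5])])
      | beam.CombineValues(sum)  # -> ('a', 6), ('b', 9)
      | beam.Map(print))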
def start_bundle(self):
  transform = self._applied_ptransform.transform

  self._tagged_receivers = _TaggedReceivers(self._evaluation_context)
  for output_tag in self._applied_ptransform.outputs:
    output_pcollection = pvalue.PCollection(None, tag=output_tag)
    output_pcollection.producer = self._applied_ptransform
    self._tagged_receivers[output_tag] = (
        self._evaluation_context.create_bundle(output_pcollection))
    self._tagged_receivers[output_tag].tag = output_tag

  self._counter_factory = counters.CounterFactory()

  # TODO(aaltay): Consider storing the serialized form as an optimization.
  dofn = (
      pickler.loads(pickler.dumps(transform.dofn))
      if self._perform_dofn_pickle_test else transform.dofn)

  args = transform.args if hasattr(transform, 'args') else []
  kwargs = transform.kwargs if hasattr(transform, 'kwargs') else {}

  self.user_state_context = None
  self.user_timer_map = {}
  if is_stateful_dofn(dofn):
    kv_type_hint = self._applied_ptransform.inputs[0].element_type
    if kv_type_hint and kv_type_hint != Any:
      coder = coders.registry.get_coder(kv_type_hint)
      self.key_coder = coder.key_coder()
    else:
      self.key_coder = coders.registry.get_coder(Any)

    self.user_state_context = DirectUserStateContext(
        self._step_context, dofn, self.key_coder)
    _, all_timer_specs = get_dofn_specs(dofn)
    for timer_spec in all_timer_specs:
      self.user_timer_map['user/%s' % timer_spec.name] = timer_spec

  self.runner = DoFnRunner(
      dofn, args, kwargs,
      self._side_inputs,
      self._applied_ptransform.inputs[0].windowing,
      tagged_receivers=self._tagged_receivers,
      step_name=self._applied_ptransform.full_label,
      state=DoFnState(self._counter_factory),
      user_state_context=self.user_state_context)
  self.runner.setup()
  self.runner.start()
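A hedged sketch of the kind of DoFn that is_stateful_dofn() detects above: it declares a StateSpec and reads and writes per-key state in process(). The class and data are illustrative; the input must be key-value pairs, matching the key coder lookup above:

import apache_beam as beam
from apache_beam.transforms.userstate import CombiningValueStateSpec

class CountPerKey(beam.DoFn):
  COUNT = CombiningValueStateSpec('count', combine_fn=sum)

  def process(self, element, count=beam.DoFn.StateParam(COUNT)):
    key, value = element
    count.add(1)
    # Emit the running per-key count alongside the element.
    yield key, value, count.read()

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([('a', 'x'), ('a', 'y'), ('b', 'z')])
      | beam.ParDo(CountPerKey()))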
def apply_GroupByKey(self, transform, pcoll):
  # Infer coder of parent.
  #
  # TODO(ccy): make Coder inference and checking less specialized and more
  # comprehensive.
  coder = None
  parent = pcoll.producer
  if parent:
    coder = parent.transform._infer_output_coder()  # pylint: disable=protected-access
  if not coder:
    coder = self._get_coder(pcoll.element_type or typehints.Any, None)
  if not coder.is_kv_coder():
    raise ValueError(('Coder for the GroupByKey operation "%s" is not a '
                      'key-value coder: %s.') % (transform.label, coder))
  # TODO(robertwb): Update the coder itself if it changed.
  coders.registry.verify_deterministic(
      coder.key_coder(), 'GroupByKey operation "%s"' % transform.label)

  return pvalue.PCollection(pcoll.pipeline)
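The key-value coder check above surfaces to users as a simple constraint: GroupByKey only accepts keyed input. A minimal sketch:

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([('a', 1), ('b', 2), ('a', 3)])  # KV pairs: accepted
      | beam.GroupByKey())  # -> ('a', [1, 3]), ('b', [2])
  # Applying GroupByKey to non-KV elements, e.g. beam.Create([1, 2, 3]),
  # is rejected at pipeline construction time.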
def expand(self, pbegin):
  from apache_beam.options.pipeline_options import DebugOptions
  from apache_beam.transforms import util

  assert isinstance(pbegin, pvalue.PBegin)
  self.pipeline = pbegin.pipeline

  debug_options = self.pipeline._options.view_as(DebugOptions)
  if debug_options.experiments and 'beam_fn_api' in debug_options.experiments:
    NUM_SPLITS = 1000
    source = self.source
    return (
        pbegin
        | core.Impulse()
        | 'Split' >> core.FlatMap(lambda _: source.split(NUM_SPLITS))
        | util.Reshuffle()
        | 'ReadSplits' >> core.FlatMap(lambda split: split.source.read(
            split.source.get_range_tracker(
                split.start_position, split.stop_position))))
  else:
    # Treat Read itself as a primitive.
    return pvalue.PCollection(self.pipeline)
def expand(self, pbegin):
  assert isinstance(pbegin, pvalue.PBegin), (
      'Input to transform must be a PBegin but found %s' % pbegin)
  return pvalue.PCollection(pbegin.pipeline, is_bounded=False)
def apply_CombineValues(self, transform, pcoll):
  return pvalue.PCollection(pcoll.pipeline)
def expand(self, pcoll):
  return pvalue.PCollection(pcoll.pipeline)
def expand(self, pbegin):
  return pvalue.PCollection(
      self.pipeline, is_bounded=self.source.is_bounded())
def _clone_items(pipeline, to_clone):
  """Clones a dependency-sorted list of PCollections and PTransforms.

  Returns mappings of PCollection and PTransform replacements.

  Args:
    pipeline: The beam.Pipeline.
    to_clone: A dependency-sorted list of PCollections and PTransforms.

  Returns:
    pcollection_replacements: a dict mapping original to cloned PCollections.

  Raises:
    ValueError: if a clone is requested of an invalid object.
  """
  pcollection_replacements = {}
  ptransform_replacements = {}
  for item in to_clone:
    if isinstance(item, pvalue.PCollection):
      assert item not in pcollection_replacements
      copied = pvalue.PCollection(
          pipeline,
          tag=item.tag,
          element_type=item.element_type,
          windowing=item.windowing)
      copied.producer = item.producer
      # Update the copied PCollection's producer if its producer was copied
      # as well.
      if copied.producer in ptransform_replacements:
        original_producer = copied.producer
        copied.producer = ptransform_replacements[original_producer]
        # Update producer outputs.
        for tag, output in original_producer.outputs.items():
          if output == item:
            copied.producer.outputs[tag] = copied
      assert copied.producer.transform is not None
      pcollection_replacements[item] = copied
    elif isinstance(item, beam_pipeline.AppliedPTransform):
      assert item.transform is not None
      assert item not in ptransform_replacements

      # The Beam pipeline graph keeps track of composite PTransforms by having
      # AppliedPTransform.parts be a list of "children" AppliedPTransforms that
      # are part of the "parent" AppliedPTransform. None of these "composite
      # wrapper" AppliedPTransforms actually produces output independent of
      # the child non-composite transform. We therefore shouldn't ever clone
      # AppliedPTransforms with non-empty parts, since such AppliedPTransforms
      # are not reachable by tracing outputs in the pipeline graph.
      assert not item.parts, (
          'Reached invalid composite AppliedPTransform: %r.' % item)

      # Assign a new, unique label.
      new_label_prefix = item.full_label + '.Copy'
      new_label = new_label_prefix
      next_suffix = 0
      while new_label in pipeline.applied_labels:
        new_label = new_label_prefix + str(next_suffix)
        next_suffix += 1
      pipeline.applied_labels.add(new_label)

      # Update inputs.
      new_inputs = []
      for old_input in item.inputs:
        new_input = pcollection_replacements.get(old_input, old_input)
        new_inputs.append(new_input)
      new_inputs = tuple(new_inputs)

      # Create the copy. Note that in the copy, copied.outputs will start out
      # empty. Any outputs that are used will be repopulated in the
      # PCollection copy branch above.
      copied = beam_pipeline.AppliedPTransform(
          item.parent, item.transform, new_label, new_inputs)
      ptransform_replacements[item] = copied

      # Update the composite transform parent to include this copy.
      # TODO(b/111366378): Reconcile the composite PTransform nesting
      # hierarchy, especially in the case where copied PTransforms should be
      # copied in an "all-or-nothing" manner. This would allow the deep copy
      # operation to be safe in the case runners replace well-known composite
      # PTransforms in their entirety during execution.
      copied.parent.parts.append(copied)
    else:
      raise ValueError('Invalid object to clone: %s' % item)

  return pcollection_replacements
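The label-deduplication loop above can be read in isolation: the first candidate is '<label>.Copy', and on collision numeric suffixes are tried. A standalone sketch of the same logic (the helper name is illustrative):

def _unique_copy_label(full_label, applied_labels):
  # First candidate is '<label>.Copy'; on collision, try '.Copy0', '.Copy1', ...
  new_label_prefix = full_label + '.Copy'
  new_label = new_label_prefix
  next_suffix = 0
  while new_label in applied_labels:
    new_label = new_label_prefix + str(next_suffix)
    next_suffix += 1
  return new_label

assert _unique_copy_label('Read', {'Read.Copy', 'Read.Copy0'}) == 'Read.Copy1'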
def expand(self, pbegin):
  assert isinstance(pbegin, pvalue.PBegin)
  self.pipeline = pbegin.pipeline
  return pvalue.PCollection(self.pipeline, is_bounded=False)
def expand(self, pcoll):
  return pvalue.PCollection(pcoll.pipeline, is_bounded=False)
def test_expand_method_pcollection_errors(self):
  with self.assertRaises(error.TransformError):
    self.native_write.expand(None)
  with self.assertRaises(error.TransformError):
    pcoll = pvalue.PCollection(pipeline=None)
    self.native_write.expand(pcoll)
def _clone_items(pipeline, to_clone):
  """Clones a dependency-sorted list of PCollections and PTransforms.

  Returns mappings of PCollection and PTransform replacements.

  Args:
    pipeline: The beam.Pipeline.
    to_clone: A dependency-sorted list of PCollections and PTransforms.

  Returns:
    pcollection_replacements: a dict mapping original to cloned PCollections.

  Raises:
    ValueError: if a clone is requested of an invalid object.
  """
  pcollection_replacements = {}
  ptransform_replacements = {}
  for item in to_clone:
    if isinstance(item, pvalue.PCollection):
      assert item not in pcollection_replacements
      copied = pvalue.PCollection(
          pipeline,
          tag=item.tag,
          element_type=item.element_type,
          windowing=item.windowing)
      copied.producer = item.producer
      # Update the copied PCollection's producer if its producer was copied
      # as well.
      if copied.producer in ptransform_replacements:
        original_producer = copied.producer
        copied.producer = ptransform_replacements[original_producer]
        # Update producer outputs.
        for tag, output in original_producer.outputs.items():
          if output == item:
            copied.producer.outputs[tag] = copied
      assert copied.producer.transform is not None
      pcollection_replacements[item] = copied
    elif isinstance(item, beam_pipeline.AppliedPTransform):
      assert item.transform is not None
      assert item not in ptransform_replacements

      # The Beam pipeline graph keeps track of composite PTransforms by having
      # AppliedPTransform.parts be a list of "children" AppliedPTransforms that
      # are part of the "parent" AppliedPTransform. None of these "composite
      # wrapper" AppliedPTransforms actually produces output independent of
      # the child non-composite transform. We therefore shouldn't ever clone
      # AppliedPTransforms with non-empty parts, since such AppliedPTransforms
      # are not reachable by tracing outputs in the pipeline graph.
      assert not item.parts, (
          'Reached invalid composite AppliedPTransform: %r.' % item)

      # TODO(b/217271822): Implement the 'close to resources' resource hint
      # for Beam/Dataflow, since once CSE makes it to Dataflow, 'close to
      # resources' cannot be recognized. Once this is fixed, we can change the
      # tag prefix to 'beam'.
      # TODO(b/238243699): Obviate the need for setting 'close to resources'
      # hints.
      close_to_resources_available = resources.ResourceHint.is_registered(
          'close_to_resources')
      if close_to_resources_available:
        # Assign the close_to_resources resource hint to the original
        # PTransforms. This annotation prevents root Reads generated by the
        # deep copy from being merged with the originals due to common
        # subexpression elimination (CSE).
        item.resource_hints['beam:resources:close_to_resources:v1'] = (
            b'/fake/DeepCopy.Original[0]')

      # Assign a new, unique label.
      count = 0
      copy_suffix = f'Copy{count}'
      new_label = f'{item.full_label}.{copy_suffix}'
      while new_label in pipeline.applied_labels:
        count += 1
        copy_suffix = f'Copy{count}'
        new_label = f'{item.full_label}.{copy_suffix}'
      pipeline.applied_labels.add(new_label)

      # Update inputs.
      new_inputs = {
          tag: pcollection_replacements.get(old_input, old_input)
          for tag, old_input in item.main_inputs.items()
      }

      # Create the copy. Note that in the copy, copied.outputs will start out
      # empty. Any outputs that are used will be repopulated in the
      # PCollection copy branch above.
      copied = beam_pipeline.AppliedPTransform(
          item.parent, item.transform, new_label, new_inputs)

      # Add a 'close to resources' resource hint to the copied PTransforms.
      # The PTransforms generated from one deep copy all share the same unique
      # hint, so that PTransforms cloned in the same copy can be fused
      # together, but not across copies nor with the original.
      if close_to_resources_available:
        copied.resource_hints['beam:resources:close_to_resources:v1'] = (
            f'/fake/DeepCopy.{copy_suffix}[0]'.encode())

      ptransform_replacements[item] = copied

      # Update the composite transform parent to include this copy.
      # TODO(b/111366378): Reconcile the composite PTransform nesting
      # hierarchy, especially in the case where copied PTransforms should be
      # copied in an "all-or-nothing" manner. This would allow the deep copy
      # operation to be safe in the case runners replace well-known composite
      # PTransforms in their entirety during execution.
      copied.parent.parts.append(copied)
    else:
      raise ValueError('Invalid object to clone: %s' % item)

  return pcollection_replacements
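For context, resource hints can also be attached through the public PTransform API; a hedged sketch using the standard min_ram hint. The 'close_to_resources' hint above is set directly on resource_hints because it is runner-internal rather than a standard hint:

import apache_beam as beam

# The runner may use hints such as min_ram when placing work on workers.
step = beam.Map(lambda x: x * 2).with_resource_hints(min_ram='4GB')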