def expand(self, pcoll):
  # Singleton PCollection used to run the one-time initialize and finalize
  # steps of the sink.
  do_once = pcoll.pipeline | 'DoOnce' >> core.Create([None])
  init_result_coll = do_once | 'InitializeWrite' >> core.Map(
      lambda _, sink: sink.initialize_write(), self.sink)
  if getattr(self.sink, 'num_shards', 0):
    # Fixed sharding: key every element so that GroupByKey yields exactly
    # min_shards bundles, each written by _WriteKeyedBundleDoFn.
    min_shards = self.sink.num_shards
    if min_shards == 1:
      keyed_pcoll = pcoll | core.Map(lambda x: (None, x))
    else:
      keyed_pcoll = pcoll | core.ParDo(_RoundRobinKeyFn(min_shards))
    write_result_coll = (
        keyed_pcoll
        | core.WindowInto(window.GlobalWindows())
        | core.GroupByKey()
        | 'WriteBundles' >> core.ParDo(
            _WriteKeyedBundleDoFn(self.sink), AsSingleton(init_result_coll)))
  else:
    # Runner-chosen sharding: write each bundle as it arrives, then collect
    # the per-bundle write results for finalization.
    min_shards = 1
    write_result_coll = (
        pcoll
        | 'WriteBundles' >> core.ParDo(
            _WriteBundleDoFn(self.sink), AsSingleton(init_result_coll))
        | 'Pair' >> core.Map(lambda x: (None, x))
        | core.WindowInto(window.GlobalWindows())
        | core.GroupByKey()
        | 'Extract' >> core.FlatMap(lambda x: x[1]))
  return do_once | 'FinalizeWrite' >> core.FlatMap(
      _finalize_write, self.sink, AsSingleton(init_result_coll),
      AsIter(write_result_coll), min_shards)

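# _RoundRobinKeyFn is referenced above but not defined in this snippet. A
# minimal sketch, assuming it only needs to spread elements evenly across
# `count` keys (the name, fields, and random starting offset here are
# assumptions, not the actual implementation):
import random

class _RoundRobinKeyFn(core.DoFn):

  def __init__(self, count):
    self.count = count

  def start_bundle(self):
    # Start each bundle at a random offset so shards stay balanced even when
    # bundles are small.
    self.counter = random.randint(0, self.count - 1)

  def process(self, element):
    self.counter = (self.counter + 1) % self.count
    yield self.counter, element
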
def expand(self, pbegin):
  from apache_beam.options.pipeline_options import DebugOptions
  from apache_beam.transforms import util

  assert isinstance(pbegin, pvalue.PBegin)
  self.pipeline = pbegin.pipeline

  debug_options = self.pipeline._options.view_as(DebugOptions)
  if debug_options.experiments and 'beam_fn_api' in debug_options.experiments:
    source = self.source

    def split_source(unused_impulse):
      total_size = source.estimate_size()
      if total_size:
        # 1MB = 1 shard, 1GB = 32 shards, 1TB = 1000 shards, 1PB = 32k shards
        chunk_size = max(1 << 20, 1000 * int(math.sqrt(total_size)))
      else:
        chunk_size = 64 << 20  # 64mb
      return source.split(chunk_size)

    return (
        pbegin
        | core.Impulse()
        | 'Split' >> core.FlatMap(split_source)
        | util.Reshuffle()
        | 'ReadSplits' >> core.FlatMap(lambda split: split.source.read(
            split.source.get_range_tracker(
                split.start_position, split.stop_position))))
  else:
    # Treat Read itself as a primitive.
    return pvalue.PCollection(self.pipeline)

def expand(self, pbegin):
  return (
      pbegin
      | core.Impulse()
      | 'Split' >> core.FlatMap(lambda _: source.split(
          Read.get_desired_chunk_size(source.estimate_size())))
      | util.Reshuffle()
      | 'ReadSplits' >> core.FlatMap(lambda split: split.source.read(
          split.source.get_range_tracker(
              split.start_position, split.stop_position))))

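# Read.get_desired_chunk_size is not defined in this snippet. Assuming it
# factors out the same size heuristic as the inline split_source above, a
# sketch would be:
@staticmethod
def get_desired_chunk_size(total_size):
  if total_size:
    # 1MB = 1 shard, 1GB = 32 shards, 1TB = 1000 shards, 1PB = 32k shards
    return max(1 << 20, 1000 * int(math.sqrt(total_size)))
  return 64 << 20  # 64MB default when no size estimate is available
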
def expand(self, pcoll):
  # Run the main Write, then emit a success signal once its results are
  # available as a side input.
  do_once = pcoll.pipeline | 'DoOnceSuccess' >> core.Create([None])
  main_write_result = pcoll | 'MainWrite' >> Write(self.sink)
  return (do_once
          | 'SuccessWrite' >> core.FlatMap(
              self._success_write, pvalue.AsIter(main_write_result)))

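# _success_write is not shown in this snippet. A hypothetical sketch, assuming
# it only needs to wait for the main write's results and then emit a sentinel
# value (the name and the '_SUCCESS' marker are assumptions):
def _success_write(self, unused_element, write_results):
  # Iterating the side input forces the main write to complete first.
  _ = list(write_results)
  yield '_SUCCESS'
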
def expand(self, pbegin):
  from apache_beam.options.pipeline_options import DebugOptions
  from apache_beam.transforms import util

  assert isinstance(pbegin, pvalue.PBegin)
  self.pipeline = pbegin.pipeline

  debug_options = self.pipeline._options.view_as(DebugOptions)
  if debug_options.experiments and 'beam_fn_api' in debug_options.experiments:
    NUM_SPLITS = 1000
    source = self.source
    return (
        pbegin
        | core.Impulse()
        | 'Split' >> core.FlatMap(lambda _: source.split(NUM_SPLITS))
        | util.Reshuffle()
        | 'ReadSplits' >> core.FlatMap(lambda split: split.source.read(
            split.source.get_range_tracker(
                split.start_position, split.stop_position))))
  else:
    # Treat Read itself as a primitive.
    return pvalue.PCollection(self.pipeline)