def expand(self, pbegin) -> beam.PCollection[filesystem.FileMetadata]:
    """Periodically match ``self.file_pattern`` and emit the matched files.

    A ``PeriodicImpulse`` configured from ``self.start_ts``, ``self.stop_ts``
    and ``self.interval`` drives the matching: every impulse element is
    replaced by ``self.file_pattern`` and handed to ``MatchAll``.

    Optional post-processing, controlled by instance attributes:

    * ``has_deduplication`` -- key each match by its ``path`` and filter it
      through ``_RemoveOldDuplicates`` (when ``match_upd`` is truthy) or
      ``_RemoveDuplicates`` (otherwise).
    * ``apply_windowing`` -- re-window the output into fixed windows of
      ``self.interval``.

    Args:
      pbegin: the pipeline start the periodic impulse is applied to.

    Returns:
      The PCollection produced by the last applied stage.
    """
    # Invoke the periodic impulse.
    ticks = pbegin | PeriodicImpulse(
        start_timestamp=self.start_ts,
        stop_timestamp=self.stop_ts,
        fire_interval=self.interval)

    # Match the file pattern once per impulse; the impulse value itself is
    # discarded in favour of the pattern.
    patterns = ticks | 'GetFilePattern' >> beam.Map(lambda _: self.file_pattern)
    matched = patterns | MatchAll()

    # Apply the deduplication strategy if required.
    if self.has_deduplication:
        # Make a key/value pair so that each file has its own state.
        matched = matched | 'ToKV' >> beam.Map(lambda meta: (meta.path, meta))
        if self.match_upd:
            label, dedup_fn = 'RemoveOldAlreadyRead', _RemoveOldDuplicates()
        else:
            label, dedup_fn = 'RemoveAlreadyRead', _RemoveDuplicates()
        matched = matched | label >> beam.ParDo(dedup_fn)

    # Windowing is applied last because deduplication relies on the global
    # window.
    if self.apply_windowing:
        matched = matched | beam.WindowInto(FixedWindows(self.interval))

    return matched
def side_input_slow_update(
        src_file_pattern,
        first_timestamp,
        last_timestamp,
        interval,
        sample_main_input_elements,
        main_input_windowing_interval):
    """Build the example pipeline joining a main input with a periodic side input.

    The side input is produced by a ``PeriodicImpulse``: every impulse value
    is appended (as a string) to ``src_file_pattern`` and the resulting name
    is read with ``ReadAllFromText``. The main input is created from
    ``sample_main_input_elements``, each element being used as its own
    timestamp, and is placed into fixed windows of
    ``main_input_windowing_interval``. Both are combined with a cross join.

    Args:
      src_file_pattern: prefix the impulse value is appended to in order to
        form the name of the file to read.
      first_timestamp: first positional argument given to ``PeriodicImpulse``.
      last_timestamp: second positional argument given to ``PeriodicImpulse``.
      interval: third positional argument given to ``PeriodicImpulse``.
      sample_main_input_elements: elements of the main input; each one is
        also used as its own timestamp.
      main_input_windowing_interval: size of the fixed windows applied to
        the main input.

    Returns:
      A ``(pipeline, result)`` tuple; the pipeline is not run here.
    """
    # [START SideInputSlowUpdateSnip1]
    from apache_beam.transforms.periodicsequence import PeriodicImpulse
    from apache_beam.transforms.window import TimestampedValue
    from apache_beam.transforms import window

    # from apache_beam.utils.timestamp import MAX_TIMESTAMP
    # last_timestamp = MAX_TIMESTAMP to go on indefinitely

    # Any user-defined function.
    # cross join is used as an example.
    def cross_join(left, rights):
        for right in rights:
            yield (left, right)

    # Create pipeline.
    options = PipelineOptions()
    pipeline = beam.Pipeline(options=options)

    # NOTE(review): the trailing positional ``True`` is presumably
    # ``apply_windowing`` -- confirm against the PeriodicImpulse signature.
    impulses = pipeline | 'PeriodicImpulse' >> PeriodicImpulse(
        first_timestamp, last_timestamp, interval, True)
    file_names = impulses | 'MapToFileName' >> beam.Map(
        lambda tick: src_file_pattern + str(tick))
    side_input = file_names | 'ReadFromFile' >> beam.io.ReadAllFromText()

    # Each main-input element doubles as its own event timestamp.
    raw_main = pipeline | 'MpImpulse' >> beam.Create(sample_main_input_elements)
    stamped_main = raw_main | 'MapMpToTimestamped' >> beam.Map(
        lambda elem: TimestampedValue(elem, elem))
    main_input = stamped_main | 'WindowMpInto' >> beam.WindowInto(
        window.FixedWindows(main_input_windowing_interval))

    result = main_input | 'ApplyCrossJoin' >> beam.FlatMap(
        cross_join, rights=beam.pvalue.AsIter(side_input))
    # [END SideInputSlowUpdateSnip1]

    return pipeline, result
def test_periodicimpulse_default_start(self):
    """Exercise ``PeriodicImpulse`` starting from its default start timestamp.

    The default ``start_timestamp`` is read from the constructor signature,
    checked to be an instance of the type of the default ``stop_timestamp``,
    and then used as the start of a one-unit run with a 0.5 interval. The
    pipeline output must equal ``start + i * interval`` for each step.
    """
    signature_params = inspect.signature(PeriodicImpulse).parameters
    start = signature_params["start_timestamp"].default
    duration = 1
    stop = start + duration
    interval = 0.5

    # Check default `stop_timestamp` is the same type `start_timestamp`
    stop_default_type = type(signature_params["stop_timestamp"].default)
    assert isinstance(start, stop_default_type), (
        "'start_timestamp' and 'stop_timestamp' have different type")

    with TestPipeline() as p:
        result = p | 'PeriodicImpulse' >> PeriodicImpulse(start, stop, interval)

        step_count = int(duration / interval)
        expected = [start + step * interval for step in range(step_count)]
        assert_that(result, equal_to(expected))
def expand(self, pcoll):
    """Periodically match ``self.file_pattern`` and emit the matched files.

    A ``PeriodicImpulse`` configured from ``self.start_ts``, ``self.stop_ts``
    and ``self.interval`` is applied to ``pcoll``; each impulse element is
    replaced by ``self.file_pattern`` and passed to ``MatchAll``. When
    ``self.has_deduplication`` is truthy the matches are keyed by ``path``
    and filtered through ``_RemoveDuplicates``.

    Args:
      pcoll: the input the periodic impulse is applied to.

    Returns:
      The matched files, deduplicated if requested.
    """
    ticks = pcoll | PeriodicImpulse(
        start_timestamp=self.start_ts,
        stop_timestamp=self.stop_ts,
        fire_interval=self.interval)

    # The impulse value is ignored; only the pattern is forwarded.
    patterns = ticks | 'GetFilePattern' >> beam.Map(lambda _: self.file_pattern)
    matched = patterns | MatchAll()

    if not self.has_deduplication:
        return matched

    # Make a key/value pair so that each file has its own state.
    keyed = matched | 'ToKV' >> beam.Map(lambda meta: (meta.path, meta))
    return keyed | 'RemoveAlreadyRead' >> beam.ParDo(_RemoveDuplicates())
def test_periodicimpulse_windowing_on_si(self):
    """Group ``PeriodicImpulse`` output by key and expect one value per group.

    The impulse covers the 15 seconds leading up to "now" with a 5 second
    interval. Every emitted value is keyed with ``'key'``, grouped and
    sorted; the expected output is one ``('key', [timestamp])`` pair per
    interval step.
    """
    start_offset = -15
    begin = time.time() + start_offset
    duration = 15
    end = begin + duration
    interval = 5

    with TestPipeline() as p:
        # NOTE(review): the trailing positional ``True`` is presumably
        # ``apply_windowing`` -- confirm against the PeriodicImpulse
        # signature.
        impulses = p | 'PeriodicImpulse' >> PeriodicImpulse(
            begin, end, interval, True)
        keyed = impulses | 'AddKey' >> beam.Map(lambda value: ('key', value))
        grouped = keyed | 'GBK' >> beam.GroupByKey()
        si = grouped | 'SortGBK' >> beam.MapTuple(
            lambda key, values: (key, sorted(values)))

        step_count = int(duration / interval)
        expected = [
            ('key', [begin + step * interval]) for step in range(step_count)
        ]
        assert_that(si, equal_to(expected))