def load(events, metadata=None, pipeline_options=None):
  return (
      events
      | 'query10_shard_events' >> beam.ParDo(ShardEventsDoFn())
      # The composite trigger fires when each sub-trigger (executed in order)
      # fires:
      #   first, repeatedly 1. after at least max_log_events elements in pane
      #                     2. or finally when the watermark passes the end of
      #                        the window;
      #   then, repeatedly 1. after at least max_log_events elements in pane
      #                    2. or when processing time passes the first element
      #                       in pane + delay.
      | 'query10_fix_window' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          trigger=trigger.AfterEach(
              trigger.OrFinally(
                  trigger.Repeatedly(
                      trigger.AfterCount(metadata.get('max_log_events'))),
                  trigger.AfterWatermark()),
              trigger.Repeatedly(
                  trigger.AfterAny(
                      trigger.AfterCount(metadata.get('max_log_events')),
                      trigger.AfterProcessingTime(LATE_BATCHING_PERIOD)))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          # Use a 1-day allowed lateness so that any forgotten hold will stall
          # the pipeline for that period and be very noticeable.
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk' >> beam.GroupByKey()
      | 'query10_write_event' >> beam.ParDo(WriteEventDoFn(), pipeline_options)
      | 'query10_window_log_files' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk_2' >> beam.GroupByKey()
      | 'query10_write_index' >> beam.ParDo(WriteIndexDoFn(), pipeline_options))
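# Hedged sketch (not part of the query10 pipeline above): the same
# Repeatedly(AfterCount(...)) early-firing idea in isolation on a toy
# in-memory source, so the composite trigger above is easier to read.
# The element values, the count of 3, and the use of GlobalWindows are
# illustrative assumptions.
import apache_beam as beam
from apache_beam.transforms import trigger, window


def composite_trigger_sketch():
    with beam.Pipeline() as p:
        _ = (
            p
            | beam.Create([('k', i) for i in range(10)])
            | beam.WindowInto(
                window.GlobalWindows(),
                # Fires repeatedly after every 3 elements in the pane; since
                # Repeatedly never finishes, it is safe with DISCARDING mode.
                trigger=trigger.Repeatedly(trigger.AfterCount(3)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | beam.GroupByKey()
            | beam.Map(print))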
def test_multi_triggered_gbk_side_input(self):
  """Test a GBK sideinput, with multiple triggering."""
  # TODO(BEAM-9322): Remove use of this experiment.
  # This flag is only necessary when using the multi-output TestStream b/c
  # it relies on using the PCollection output tags as the PCollection output
  # ids.
  p = TestPipeline(additional_pipeline_args=[
      '--experiments=passthrough_pcollection_output_ids'
  ])

  test_stream = (
      p
      | 'Mixed TestStream' >> TestStream()
      .advance_watermark_to(3, tag='main')
      .add_elements(['a1'], tag='main')
      .advance_watermark_to(8, tag='main')
      .add_elements(['a2'], tag='main')
      .add_elements([window.TimestampedValue(('k', 100), 2)], tag='side')
      .add_elements([window.TimestampedValue(('k', 400), 7)], tag='side')
      .advance_watermark_to_infinity(tag='main')
      .advance_watermark_to_infinity(tag='side'))

  main_data = (
      test_stream['main']
      | 'Main windowInto' >> beam.WindowInto(
          window.FixedWindows(5),
          accumulation_mode=trigger.AccumulationMode.DISCARDING))

  side_data = (
      test_stream['side']
      | 'Side windowInto' >> beam.WindowInto(
          window.FixedWindows(5),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.CombinePerKey(sum)
      | 'Values' >> Map(lambda k_vs: k_vs[1]))

  class RecordFn(beam.DoFn):
    def process(self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
      yield (elm, ts, side)

  records = (
      main_data | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_data)))

  expected_window_to_elements = {
      window.IntervalWindow(0, 5): [
          ('a1', Timestamp(3), [100, 0]),
      ],
      window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
  }

  assert_that(
      records,
      equal_to_per_window(expected_window_to_elements),
      use_global_window=False,
      label='assert per window')
  p.run()
def test_different_fixed_windows(self):
  self.run_windowed_side_inputs(
      [1, 2, 11, 21, 31],
      window.FixedWindows(10),
      window.FixedWindows(20),
      expected=[(1, [1, 2, 11]), (2, [1, 2, 11]), (11, [1, 2, 11]),
                (21, [21, 31]), (31, [21, 31])])
def test_pardo_side_inputs(self):
  def cross_product(elem, sides):
    for side in sides:
      yield elem, side

  with self.create_pipeline() as p:
    main = p | 'main' >> beam.Create(['a', 'b', 'c'])
    side = p | 'side' >> beam.Create(['x', 'y'])
    assert_that(
        main | beam.FlatMap(cross_product, beam.pvalue.AsList(side)),
        equal_to([('a', 'x'), ('b', 'x'), ('c', 'x'),
                  ('a', 'y'), ('b', 'y'), ('c', 'y')]))

    # Now with some windowing.
    pcoll = p | beam.Create(list(range(10))) | beam.Map(
        lambda t: window.TimestampedValue(t, t))
    # Intentionally choosing non-aligned windows to highlight the transition.
    main = pcoll | 'WindowMain' >> beam.WindowInto(window.FixedWindows(5))
    side = pcoll | 'WindowSide' >> beam.WindowInto(window.FixedWindows(7))
    res = main | beam.Map(
        lambda x, s: (x, sorted(s)), beam.pvalue.AsList(side))
    assert_that(
        res,
        equal_to([
            # The window [0, 5) maps to the window [0, 7).
            (0, list(range(7))),
            (1, list(range(7))),
            (2, list(range(7))),
            (3, list(range(7))),
            (4, list(range(7))),
            # The window [5, 10) maps to the window [7, 14).
            (5, list(range(7, 10))),
            (6, list(range(7, 10))),
            (7, list(range(7, 10))),
            (8, list(range(7, 10))),
            (9, list(range(7, 10)))
        ]),
        label='windowed')
def test_pardo_windowed_side_inputs(self):
  with self.create_pipeline() as p:
    # Now with some windowing.
    pcoll = p | beam.Create(list(range(10))) | beam.Map(
        lambda t: window.TimestampedValue(t, t))
    # Intentionally choosing non-aligned windows to highlight the transition.
    main = pcoll | 'WindowMain' >> beam.WindowInto(window.FixedWindows(5))
    side = pcoll | 'WindowSide' >> beam.WindowInto(window.FixedWindows(7))
    res = main | beam.Map(
        lambda x, s: (x, sorted(s)), beam.pvalue.AsList(side))
    assert_that(
        res,
        equal_to([
            # The window [0, 5) maps to the window [0, 7).
            (0, list(range(7))),
            (1, list(range(7))),
            (2, list(range(7))),
            (3, list(range(7))),
            (4, list(range(7))),
            # The window [5, 10) maps to the window [7, 14).
            (5, list(range(7, 10))),
            (6, list(range(7, 10))),
            (7, list(range(7, 10))),
            (8, list(range(7, 10))),
            (9, list(range(7, 10)))
        ]),
        label='windowed')
def test_multi_triggered_gbk_side_input(self):
  """Test a GBK sideinput, with multiple triggering."""
  options = StandardOptions(streaming=True)
  p = TestPipeline(options=options)

  test_stream = (
      p
      | 'Mixed TestStream' >> TestStream()
      .advance_watermark_to(3, tag='main')
      .add_elements(['a1'], tag='main')
      .advance_watermark_to(8, tag='main')
      .add_elements(['a2'], tag='main')
      .add_elements([window.TimestampedValue(('k', 100), 2)], tag='side')
      .add_elements([window.TimestampedValue(('k', 400), 7)], tag='side')
      .advance_watermark_to_infinity(tag='main')
      .advance_watermark_to_infinity(tag='side'))

  main_data = (
      test_stream['main']
      | 'Main windowInto' >> beam.WindowInto(
          window.FixedWindows(5),
          accumulation_mode=trigger.AccumulationMode.DISCARDING))

  side_data = (
      test_stream['side']
      | 'Side windowInto' >> beam.WindowInto(
          window.FixedWindows(5),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.CombinePerKey(sum)
      | 'Values' >> Map(lambda k_vs: k_vs[1]))

  class RecordFn(beam.DoFn):
    def process(self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
      yield (elm, ts, side)

  records = (
      main_data | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_data)))

  expected_window_to_elements = {
      window.IntervalWindow(0, 5): [
          ('a1', Timestamp(3), [100, 0]),
      ],
      window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
  }

  assert_that(
      records,
      equal_to_per_window(expected_window_to_elements),
      use_global_window=False,
      label='assert per window')
  p.run()
def test_basic_execution_sideinputs_fixed_windows(self):
  options = PipelineOptions()
  options.view_as(DebugOptions).add_experiment(
      'passthrough_pcollection_output_ids')
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)

  test_stream = (p | TestStream()
                 .advance_watermark_to(12, tag='side')
                 .add_elements([window.TimestampedValue('s1', 10)], tag='side')
                 .advance_watermark_to(20, tag='side')
                 .add_elements([window.TimestampedValue('s2', 20)], tag='side')
                 .advance_watermark_to(9, tag='main')
                 .add_elements(['a1', 'a2', 'a3', 'a4'], tag='main')
                 .add_elements(['b'], tag='main')
                 .advance_watermark_to(18, tag='main')
                 .add_elements('c', tag='main')
                 )  # yapf: disable

  main_stream = (
      test_stream['main']
      | 'main windowInto' >> beam.WindowInto(window.FixedWindows(1)))
  side_stream = (
      test_stream['side']
      | 'side windowInto' >> beam.WindowInto(window.FixedWindows(3)))

  class RecordFn(beam.DoFn):
    def process(
        self,
        elm=beam.DoFn.ElementParam,
        ts=beam.DoFn.TimestampParam,
        side=beam.DoFn.SideInputParam):
      yield (elm, ts, side)

  records = (
      main_stream  # pylint: disable=unused-variable
      | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream)))

  # assert per window
  expected_window_to_elements = {
      window.IntervalWindow(9, 10): [
          ('a1', Timestamp(9), ['s1']),
          ('a2', Timestamp(9), ['s1']),
          ('a3', Timestamp(9), ['s1']),
          ('a4', Timestamp(9), ['s1']),
          ('b', Timestamp(9), ['s1']),
      ],
      window.IntervalWindow(18, 19): [('c', Timestamp(18), ['s2'])],
  }

  assert_that(
      records,
      equal_to_per_window(expected_window_to_elements),
      label='assert per window')

  p.run()
def test_basic_execution_sideinputs_fixed_windows(self):
  # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
  global result  # pylint: disable=global-variable-undefined
  result = []

  def recorded_elements(elem):
    result.append(elem)
    return elem

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)

  main_stream = (p
                 | 'main TestStream' >> TestStream()
                 .advance_watermark_to(9)
                 .add_elements(['a1', 'a2', 'a3', 'a4'])
                 .add_elements(['b'])
                 .advance_watermark_to(18)
                 .add_elements('c')
                 | 'main windowInto' >> beam.WindowInto(window.FixedWindows(1)))
  side_stream = (p
                 | 'side TestStream' >> TestStream()
                 .advance_watermark_to(12)
                 .add_elements([window.TimestampedValue('s1', 10)])
                 .advance_watermark_to(20)
                 .add_elements([window.TimestampedValue('s2', 20)])
                 | 'side windowInto' >> beam.WindowInto(window.FixedWindows(3)))

  class RecordFn(beam.DoFn):
    def process(self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
      yield (elm, ts, side)

  records = (main_stream  # pylint: disable=unused-variable
             | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_stream))
             | beam.Map(recorded_elements))

  p.run()

  # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
  self.assertEqual([('a1', Timestamp(9), ['s1']),
                    ('a2', Timestamp(9), ['s1']),
                    ('a3', Timestamp(9), ['s1']),
                    ('a4', Timestamp(9), ['s1']),
                    ('b', Timestamp(9), ['s1']),
                    ('c', Timestamp(18), ['s2'])], result)
def run(argv=None):
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--topic", type=str,
                        help='Pub/Sub topic to read from')
    parser.add_argument("--output_bucket",
                        help='Output GCS bucket for batched files.')
    parser.add_argument('--output_bigquery', default='IoTData.engine',
                        help=('Output BigQuery table: '
                              'PROJECT:DATASET.TABLE '
                              'or DATASET.TABLE.'))
    parser.add_argument('--output_bigquery_avg',
                        default='DeviceData.engine_avr',
                        help=('Output BigQuery table for averages: '
                              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    args, pipeline_args = parser.parse_known_args(argv)

    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    p = beam.Pipeline(options=options)

    pubsub_stream = (
        p
        | 'Read from PubSub' >> beam.io.ReadFromPubSub(topic=args.topic))

    records = (pubsub_stream
               | 'Parse JSON to Dict' >> beam.Map(json.loads)
               | 'Add timestamp' >> beam.ParDo(AddTimestampToDict()))

    # Stream raw records to BigQuery.
    (records | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
        args.output_bigquery,
        schema=Schema.get_bigquery_schema(),
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    # Compute per-device averages over fixed 60-second windows.
    (records
     | "Window for avg" >> beam.WindowInto(window.FixedWindows(60))
     | 'Add deviceId Key' >> beam.ParDo(AddKeyToDict())
     | 'Group by Key' >> beam.GroupByKey()
     | 'Count average' >> beam.ParDo(CountAverages())
     | 'Write Avg to BigQuery' >> beam.io.WriteToBigQuery(
         args.output_bigquery_avg,
         schema=Schema.get_bigquery_avg_schema(),
         create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=BigQueryDisposition.WRITE_APPEND))

    # Batch records per window under a dummy key and write each batch to GCS.
    (records
     | "Window for bucket" >> beam.WindowInto(window.FixedWindows(60))
     | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
     | "Group by Dummy Key" >> beam.GroupByKey()
     | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
     | "Write to GCS" >> beam.ParDo(WriteBatchesToGCS(args.output_bucket)))

    result = p.run()
    result.wait_until_finish()
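# Hedged alternative sketch for the averaging branch above: the same
# per-device, per-60s-window average via the built-in Mean combiner instead
# of GroupByKey plus a custom CountAverages DoFn. The field names 'deviceId'
# and 'value' are assumptions about the parsed record dict.
import apache_beam as beam
from apache_beam.transforms import window


def average_per_device(records):
    """records: a PCollection of parsed, timestamped dicts (as in run())."""
    return (
        records
        | 'Window for mean' >> beam.WindowInto(window.FixedWindows(60))
        | 'Key by device' >> beam.Map(lambda r: (r['deviceId'], r['value']))
        | 'Mean per device' >> beam.combiners.Mean.PerKey())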
def test_windowed_singleton(self):
  self.run_windowed_side_inputs(
      [1, 2, 11],
      window.FixedWindows(10),
      side_input_type=beam.pvalue.AsSingleton,
      combine_fn=sum,
      expected=[(1, 3), (2, 3), (11, 11)])
def run(argv=None):
  """Build and run the pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--input_topic', required=True,
                      help=('Input PubSub topic of the form '
                            '"projects/<PROJECT>/topics/<TOPIC>".'))
  parser.add_argument('--output_topic', required=True,
                      help=('Output PubSub topic of the form '
                            '"projects/<PROJECT>/topics/<TOPIC>".'))
  known_args, pipeline_args = parser.parse_known_args(argv)
  options = PipelineOptions(pipeline_args)
  options.view_as(StandardOptions).streaming = True

  with beam.Pipeline(options=options) as p:
    # Read from PubSub into a PCollection.
    lines = p | beam.io.ReadStringsFromPubSub(known_args.input_topic)

    # Count the occurrences of each word in fixed 15-second windows.
    transformed = (
        lines
        # Use a pre-defined function that imports the re package.
        | 'Split' >> (beam.FlatMap(split_fn).with_output_types(unicode))
        | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
        | beam.WindowInto(window.FixedWindows(15, 0))
        | 'Group' >> beam.GroupByKey()
        | 'Count' >> beam.Map(
            lambda word_ones: (word_ones[0], sum(word_ones[1])))
        | 'Format' >> beam.Map(lambda tup: '%s: %d' % tup))

    # Write to PubSub.
    # pylint: disable=expression-not-assigned
    transformed | beam.io.WriteStringsToPubSub(known_args.output_topic)
def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    offer_stat_pipeline_options = pipeline_options.view_as(
        OfferStatPipelineOptions)

    p = beam.Pipeline(options=pipeline_options)

    (p
     | "Read account offer from PS" >> beam.io.ReadFromPubSub(
         topic=offer_stat_pipeline_options.account_offers_topic)
     | "Parse message" >> beam.ParDo(PubsubMessageParser())
     # Fixed 60s windows with speculative early panes roughly every 20s of
     # processing time; ACCUMULATING means each pane re-emits everything
     # seen so far in the window.
     | "Windowing" >> beam.WindowInto(
         window.FixedWindows(60),
         trigger=trigger.AfterWatermark(
             early=trigger.AfterProcessingTime(20)),
         accumulation_mode=AccumulationMode.ACCUMULATING)
     | "WithKeys" >> beam.Map(
         lambda account_offer: (account_offer['offer_id'], account_offer))
     | beam.GroupByKey()
     | 'Count distinct accounts' >> beam.ParDo(DistinctAccountCount())
     | 'Map to BQ row' >> beam.ParDo(ConvertStatToBQRow())
     | 'Writing offers to BQ' >> beam.io.WriteToBigQuery(
         table=offer_stat_pipeline_options.offer_stat_bq_table,
         create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=BigQueryDisposition.WRITE_APPEND,
         schema=OFFER_STAT_BQ_SCHEMA))

    result = p.run()
    result.wait_until_finish()
def main_without_pubsub(options):
    from rillbeam.transforms import SleepFn

    with beam.Pipeline(options=options) as pipe:
        # FIXME: still can't "fake" timestamp data like we get from pubsub...
        graph = (
            pipe
            | 'start' >> beam.Create([(k, k) for k in range(5)])
            # The purpose of the WindowInto transform is to establish a
            # FixedWindows windowing function for the PCollection.
            # It does not bucket elements into windows since the timestamps
            # from Create are not spaced 5 ms apart and very likely they all
            # fall into the same window.
            | 'w' >> beam.WindowInto(window.FixedWindows(5))
            # Generate timestamped values using the values as timestamps.
            # Now there are values 5 ms apart and since Map propagates the
            # windowing function from input to output the output PCollection
            # will have elements falling into different 5ms windows.
            | beam.Map(lambda x_t2: window.TimestampedValue(x_t2[0], x_t2[1])))

        b1 = (graph
              | 'AsInt' >> beam.Map(int)
              | 'LogInt' >> Log())
        b2 = (graph
              | 'AsStr' >> beam.Map(str)
              | 'LogStr' >> Log())
        b3 = (b1
              | 'Sleep' >> beam.ParDo(SleepFn(), duration=0.2)
              | 'AsFloat' >> beam.Map(float)
              | 'LogFloat' >> Log())

        ((b1, b2, b3)
         | Sync()
         | 'SyncLog' >> Log())
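# Hedged sketch expanding on the comments above: because yielding a
# TimestampedValue re-applies the window function of the output PCollection,
# a downstream step can observe the FixedWindows(5) window each element
# landed in via DoFn.WindowParam. Names and values are illustrative.
import apache_beam as beam
from apache_beam.transforms import window


def show_window_assignment():
    with beam.Pipeline() as p:
        _ = (
            p
            | beam.Create([(k, k) for k in range(5)])
            | beam.WindowInto(window.FixedWindows(5))
            # Re-timestamp; elements are re-assigned to 5-unit fixed windows.
            | beam.Map(lambda x_t: window.TimestampedValue(x_t[0], x_t[1]))
            # DoFn params work as lambda default arguments with Map.
            | beam.Map(lambda x, w=beam.DoFn.WindowParam: (x, w))
            | beam.Map(print))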
def run(argv=None):
  """Build and run the pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input_topic', required=True,
      help='Input PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".')
  parser.add_argument(
      '--output_topic', required=True,
      help='Output PubSub topic of the form "/topics/<PROJECT>/<TOPIC>".')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = beam.Pipeline(argv=pipeline_args)

  # Read from PubSub into a PCollection.
  lines = p | beam.io.Read(
      'read', beam.io.PubSubSource(known_args.input_topic))

  # Count the occurrences of each word in fixed 15-second windows.
  transformed = (
      lines
      | 'Split' >> (
          beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
          .with_output_types(unicode))
      | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
      | beam.WindowInto(window.FixedWindows(15, 0))
      | 'Group' >> beam.GroupByKey()
      | 'Count' >> beam.Map(
          lambda word_ones: (word_ones[0], sum(word_ones[1])))
      | 'Format' >> beam.Map(lambda tup: '%s: %d' % tup))

  # Write to PubSub.
  # pylint: disable=expression-not-assigned
  transformed | beam.io.Write(
      'pubsub_write', beam.io.PubSubSink(known_args.output_topic))

  p.run().wait_until_finish()
def test_timer_output_timestamp_and_window(self):
  class TimerEmittingStatefulDoFn(DoFn):
    EMIT_TIMER_1 = TimerSpec('emit1', TimeDomain.WATERMARK)

    def process(self, element, timer1=DoFn.TimerParam(EMIT_TIMER_1)):
      timer1.set(10)

    @on_timer(EMIT_TIMER_1)
    def emit_callback_1(self,
                        window=DoFn.WindowParam,
                        ts=DoFn.TimestampParam,
                        key=DoFn.KeyParam):
      yield ('timer1-{key}'.format(key=key),
             int(ts),
             int(window.start),
             int(window.end))

  pipeline_options = PipelineOptions()
  with TestPipeline(options=pipeline_options) as p:
    test_stream = TestStream().advance_watermark_to(10).add_elements([1])
    (p
     | test_stream
     | beam.Map(lambda x: ('mykey', x))
     | "window_into" >> beam.WindowInto(
         window.FixedWindows(5),
         accumulation_mode=trigger.AccumulationMode.DISCARDING)
     | beam.ParDo(TimerEmittingStatefulDoFn())
     | beam.ParDo(self.record_dofn()))

  self.assertEqual([('timer1-mykey', 10, 10, 15)],
                   sorted(StatefulDoFnOnDirectRunnerTest.all_records))
def test_stateful_set_state_clean_portably(self):
  class SetStateClearingStatefulDoFn(beam.DoFn):
    SET_STATE = SetStateSpec('buffer', VarIntCoder())
    EMIT_TIMER = TimerSpec('emit_timer', TimeDomain.WATERMARK)

    def process(self,
                element,
                set_state=beam.DoFn.StateParam(SET_STATE),
                emit_timer=beam.DoFn.TimerParam(EMIT_TIMER)):
      _, value = element
      set_state.add(value)

      all_elements = [element for element in set_state.read()]

      if len(all_elements) == 5:
        set_state.clear()
        set_state.add(100)
        emit_timer.set(1)

    @on_timer(EMIT_TIMER)
    def emit_values(self, set_state=beam.DoFn.StateParam(SET_STATE)):
      yield sorted(set_state.read())

  with TestPipeline() as p:
    values = p | beam.Create([('key', 1), ('key', 2), ('key', 3), ('key', 4),
                              ('key', 5)])
    actual_values = (
        values
        | beam.Map(lambda t: window.TimestampedValue(t, 1))
        | beam.WindowInto(window.FixedWindows(1))
        | beam.ParDo(SetStateClearingStatefulDoFn()))

    assert_that(actual_values, equal_to([[100]]))
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        required=True,
                        help='Input Pub/Sub subscription to read from.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output BigQuery table to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module
    # level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    p = beam.Pipeline(options=pipeline_options)

    # Read messages from Pub/Sub, count words per fixed 5-second window, and
    # write the counts to BigQuery.
    (p
     | 'read' >> ReadFromPubSub(subscription=known_args.input)
     | 'extract words' >> beam.FlatMap(extract_words)
     | 'transform to kv' >> beam.Map(lambda x: (x, 1))
     | 'window into fixed intervals' >> beam.WindowInto(
         window.FixedWindows(5),
         trigger=trigger.AfterProcessingTime(delay=10),
         accumulation_mode=trigger.AccumulationMode.DISCARDING)
     | 'group by words' >> beam.GroupByKey()
     | 'count ones' >> beam.Map(count_ones)
     | 'format for bq' >> beam.Map(format_for_bigquery)
     | 'write to bigquery' >> WriteToBigQuery(table=known_args.output))

    result = p.run()
    result.wait_until_finish()
def expand(self, p):
    return (
        p
        | 'window' >> beam.WindowInto(window.FixedWindows(self.duration))
        | 'filter_spammers' >> beam.ParDo(FilterSpammers(),
                                          spammers=self.spammers)
        | 'extract_team_score' >> ExtractAndSumScore('team'))
def test_combiner_latest(self):
  """Test TimestampCombiner with LATEST."""
  options = PipelineOptions(streaming=True)
  with TestPipeline(options=options) as p:
    result = (
        p
        | TestStream()
        .add_elements([window.TimestampedValue(('k', 100), 2)])
        .add_elements([window.TimestampedValue(('k', 400), 7)])
        .advance_watermark_to_infinity()
        | beam.WindowInto(
            window.FixedWindows(10),
            timestamp_combiner=TimestampCombiner.OUTPUT_AT_LATEST)
        | beam.CombinePerKey(sum))

    records = (
        result
        | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))

    # KV pairs with the same key are combined per window, and the output is
    # emitted at the LATEST timestamp of the combined inputs.
    expected_window_to_elements = {
        window.IntervalWindow(0, 10): [
            (('k', 500), Timestamp(7)),
        ],
    }

    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        use_global_window=False,
        label='assert per window')
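# Hedged companion sketch to the test above: with OUTPUT_AT_EARLIEST the same
# two inputs would still combine to ('k', 500) in window [0, 10), but the
# output would carry Timestamp(2), the earliest input timestamp, instead of
# Timestamp(7). `combine_at_earliest` is an illustrative helper name.
import apache_beam as beam
from apache_beam.transforms import window
from apache_beam.transforms.timeutil import TimestampCombiner


def combine_at_earliest(pcoll):
    return (
        pcoll
        | beam.WindowInto(
            window.FixedWindows(10),
            timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
        | beam.CombinePerKey(sum))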
def test_fixed_global_window(self):
  self.run_windowed_side_inputs(
      [1, 2, 11],
      window.FixedWindows(10),
      window.GlobalWindows(),
      expected=[(1, [1, 2, 11]), (2, [1, 2, 11]), (11, [1, 2, 11])])
def expand(self, pcoll):
    return (
        pcoll
        # Assigns window info to each Pub/Sub message based on its publish
        # timestamp.
        | "window_into" >> beam.WindowInto(
            window.FixedWindows(self.window_size))
        | "parse_message" >> beam.Map(ProcessMessages.transform))
def run(argv=None):
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_subscription',
        required=True,
        help=('Input PubSub subscription of the form '
              '"projects/<project>/subscriptions/<subscription_name>".'))
    parser.add_argument(
        '--output_table',
        required=True,
        help=('Output BigQuery table for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        # Read the text from PubSub messages.
        lines = p | beam.io.ReadFromPubSub(
            subscription=known_args.input_subscription)

        transformed = (
            lines
            | 'Split' >> beam.FlatMap(find_msg)
            | 'window' >> beam.WindowInto(window.FixedWindows(60))
            | 'append' >> beam.CombineGlobally(
                ToListCombineFn()).without_defaults()
            | 'Format' >> beam.ParDo(FormDoFn()))

        transformed | 'Write' >> beam.io.WriteToBigQuery(
            known_args.output_table)
def test_serialize_windowing_strategy(self):
  # This just tests the basic path; more complete tests
  # are in window_test.py.
  strategy = Windowing(window.FixedWindows(10))
  self.assertEqual(
      strategy,
      DataflowRunner.deserialize_windowing_strategy(
          DataflowRunner.serialize_windowing_strategy(strategy, None)))
def apply_window(self, data):
    """Apply a window to the data.

    Currently this is always FIXED, since OHLCV data is calculated over a
    fixed window.

    :param data: PCollection being processed
    :return: PCollection with a fixed window of ``window_size`` applied.
    """
    return data | 'Resampler - Divide data to windows' >> beam.WindowInto(
        window.FixedWindows(self.window_size))
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('input_topic',
                        type=str,
                        help="Input Pub/Sub topic name.")
    parser.add_argument(
        'output_table',
        type=str,
        help="Output BigQuery table name. Example: project.db.name")
    parser.add_argument('--model_project',
                        type=str,
                        help="Google Project ID with model.")
    parser.add_argument('--model_name',
                        type=str,
                        help="Name of the Google AI Platform model.")
    parser.add_argument('--model_region',
                        type=str,
                        help="AI Platform region name.")
    parser.add_argument('--model_version',
                        type=str,
                        help="AI Platform model version.")
    known_args, pipeline_args = parser.parse_known_args(argv)

    _topic_comp = known_args.input_topic.split('/')
    if len(_topic_comp) != 4 or _topic_comp[0] != 'projects' or _topic_comp[
            2] != 'topics':
        raise ValueError("Topic name has inappropriate format.")

    if len(known_args.output_table.split('.')) != 2:
        raise ValueError("Table name has inappropriate format.")

    inf_args = [
        known_args.model_project, known_args.model_name,
        known_args.model_region, known_args.model_version
    ]
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    p = Pipeline(options=options)
    _ = (p
         | 'read from pub/sub' >> ReadFromPubSub(
             known_args.input_topic).with_output_types(bytes)
         | 'windowing' >> WindowInto(window.FixedWindows(10, 0))
         | 'convert to dict' >> Map(json.loads)
         | 'pre processing' >> PreProcessing()
         | 'make inference' >> ParDo(MakeRemoteInferenceDoFn(*inf_args))
         | 'format message' >> Map(formatter)
         | 'write to BQ' >> WriteToBigQuery(
             table=known_args.output_table,
             schema=build_bq_schema(),
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=BigQueryDisposition.WRITE_APPEND))

    if os.environ.get('DEPLOY'):
        # Use p.run() instead of the `with Pipeline() as p:` context manager
        # because the process needs to exit after submitting the job.
        p.run()
    else:
        p.run().wait_until_finish()
def run(argv=None):
  """Build and run the pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--output_topic', required=True,
                      help=('Output PubSub topic of the form '
                            '"projects/<PROJECT>/topics/<TOPIC>".'))
  group = parser.add_mutually_exclusive_group(required=True)
  group.add_argument('--input_topic',
                     help=('Input PubSub topic of the form '
                           '"projects/<PROJECT>/topics/<TOPIC>".'))
  group.add_argument(
      '--input_subscription',
      help=('Input PubSub subscription of the form '
            '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(StandardOptions).streaming = True
  p = beam.Pipeline(options=pipeline_options)

  # Read from PubSub into a PCollection.
  if known_args.input_subscription:
    lines = p | beam.io.ReadFromPubSub(
        subscription=known_args.input_subscription)
  else:
    lines = p | beam.io.ReadFromPubSub(topic=known_args.input_topic)

  # Count the occurrences of each word.
  def count_ones(word_ones):
    (word, ones) = word_ones
    return (word, sum(ones))

  counts = (lines
            | 'split' >> (beam.ParDo(WordExtractingDoFn())
                          .with_output_types(six.text_type))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | beam.WindowInto(window.FixedWindows(15, 0))
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(count_ones))

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    (word, count) = word_count
    return '%s: %d' % (word, count)

  output = counts | 'format' >> beam.Map(format_result)

  # Write to PubSub.
  # pylint: disable=expression-not-assigned
  output | beam.io.WriteToPubSub(known_args.output_topic)

  result = p.run()
  result.wait_until_finish()
def run(argv=None, save_main_session=True):
  """Build and run the pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--output_topic',
      required=True,
      help=(
          'Output PubSub topic of the form '
          '"projects/<PROJECT>/topics/<TOPIC>".'))
  parser.add_argument(
      '--input_topic',
      required=True,
      help=(
          'Input PubSub topic of the form '
          '"projects/<PROJECT>/topics/<TOPIC>".'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  pipeline_options.view_as(StandardOptions).streaming = True

  with beam.Pipeline(options=pipeline_options) as p:
    messages = (
        p
        | beam.io.ReadFromPubSub(
            topic=known_args.input_topic).with_output_types(bytes))

    lines = messages | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))

    # Count the occurrences of each word.
    def count_ones(word_ones):
      (word, ones) = word_ones
      return (word, sum(ones))

    counts = (
        lines
        | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | beam.WindowInto(window.FixedWindows(15, 0))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
      (word, count) = word_count
      return '%s: %d' % (word, count)

    output = (
        counts
        | 'format' >> beam.Map(format_result)
        | 'encode' >>
        beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes))

    # Write to PubSub.
    output | beam.io.WriteToPubSub(known_args.output_topic)
def expand(self, pcoll):
    return (
        pcoll
        | 'Add Timestamps' >> beam.Map(
            lambda x: beam.window.TimestampedValue(x, time.time()))
        | "Window into Fixed Intervals" >> beam.WindowInto(
            window.FixedWindows(self.window_size))
        | "Groupby" >> beam.GroupByKey()
        | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val))
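# Hedged usage sketch for the expand() above. `GroupIntoFixedBatches` is a
# hypothetical name for the enclosing PTransform, and the keyed toy input is
# an assumption: the GroupByKey step requires (key, value) pairs, so elements
# sharing a key within one window come out as a single batch.
import time

import apache_beam as beam
from apache_beam.transforms import window


class GroupIntoFixedBatches(beam.PTransform):  # hypothetical wrapper name
    def __init__(self, window_size):
        super().__init__()
        self.window_size = window_size

    def expand(self, pcoll):
        return (
            pcoll
            | beam.Map(lambda x: window.TimestampedValue(x, time.time()))
            | beam.WindowInto(window.FixedWindows(self.window_size))
            | beam.GroupByKey()
            | beam.MapTuple(lambda _, val: val))


def batching_example():
    with beam.Pipeline() as p:
        _ = (
            p
            | beam.Create([(None, i) for i in range(4)])
            | GroupIntoFixedBatches(window_size=10)
            | beam.Map(print))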
def run(argv=None):
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--output',
        required=True,
        help=('Output BigQuery table for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    parser.add_argument(
        '--input_subscription',
        required=True,
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=pipeline_options)

    # Read from PubSub into a PCollection.
    # messages = (p
    #             | beam.io.ReadFromPubSub(
    #                 subscription=known_args.input_subscription)
    #             .with_output_types(bytes))
    messages = p | beam.io.ReadFromText(messages_path)

    lines = messages | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))
    tweets = lines | 'extract tweets' >> beam.ParDo(JSONToTweetDoFn())
    tweets_with_ts = tweets | 'set timestamp' >> beam.ParDo(AddTimestampFn())
    # records = tweets | 'tweets to records' >> beam.Map(
    #     tweet_to_bqrecord.tweet_to_bqrecord)

    def count(element):
        (w, ones) = element
        print((w, sum(ones)))
        return (w, sum(ones))

    languages = (
        tweets_with_ts
        | 'extract language' >> beam.Map(lambda x: (x.language, 1))
        | beam.WindowInto(window.FixedWindows(1, 0))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(count))

    # records | 'write' >> beam.io.Write(
    #     beam.io.BigQuerySink(
    #         known_args.output,
    #         schema=tweet_schema.table_schema,
    #         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    #         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    result = p.run()
    result.wait_until_finish()
def _run_pardo_state_timers(self, windowed):
  state_spec = userstate.BagStateSpec('state', beam.coders.StrUtf8Coder())
  timer_spec = userstate.TimerSpec('timer', userstate.TimeDomain.WATERMARK)
  elements = list('abcdefgh')
  buffer_size = 3

  class BufferDoFn(beam.DoFn):
    def process(self,
                kv,
                ts=beam.DoFn.TimestampParam,
                timer=beam.DoFn.TimerParam(timer_spec),
                state=beam.DoFn.StateParam(state_spec)):
      _, element = kv
      state.add(element)
      buffer = state.read()
      # For real use, we'd keep track of this size separately.
      if len(list(buffer)) >= 3:
        state.clear()
        yield buffer
      else:
        timer.set(ts + 1)

    @userstate.on_timer(timer_spec)
    def process_timer(self, state=beam.DoFn.StateParam(state_spec)):
      buffer = state.read()
      state.clear()
      yield buffer

  def is_buffered_correctly(actual):
    # Pickling self in the closure for asserts gives errors (only on jenkins).
    self = FnApiRunnerTest('__init__')
    # Actual should be a grouping of the inputs into batches of size
    # at most buffer_size, but the actual batching is nondeterministic
    # based on ordering and trigger firing timing.
    self.assertEqual(sorted(sum((list(b) for b in actual), [])), elements)
    self.assertEqual(max(len(list(buffer)) for buffer in actual), buffer_size)
    if windowed:
      # Elements were assigned to windows based on their parity.
      # Assert that each grouping consists of elements belonging to the
      # same window to ensure states and timers were properly partitioned.
      for b in actual:
        parity = set(ord(e) % 2 for e in b)
        self.assertEqual(1, len(parity), b)

  with self.create_pipeline() as p:
    actual = (
        p
        | beam.Create(elements)
        # Send even and odd elements to different windows.
        | beam.Map(lambda e: window.TimestampedValue(e, ord(e) % 2))
        | beam.WindowInto(
            window.FixedWindows(1) if windowed else window.GlobalWindows())
        | beam.Map(lambda x: ('key', x))
        | beam.ParDo(BufferDoFn()))

    assert_that(actual, is_buffered_correctly)