def test_equal_to_per_window_fail_unmatched_window(self):
  with self.assertRaises(BeamAssertException):
    expected = {
        window.IntervalWindow(50, 100): [('k', [1])],
    }
    with TestPipeline(options=StandardOptions(streaming=True)) as p:
      assert_that(
          (p
           | Create([1])
           | beam.WindowInto(
               FixedWindows(20),
               trigger=trigger.AfterWatermark(),
               accumulation_mode=trigger.AccumulationMode.DISCARDING)
           | beam.Map(lambda x: ('k', x))
           | beam.GroupByKey()),
          equal_to_per_window(expected),
          reify_windows=True)

def run(argv=None): """Build and run the pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--stream', type=str, help='Pub/Sub topic to read from') parser.add_argument( '--sink', help=('Output BigQuery table for windowed averages specified as: ' 'PROJECT:DATASET.TABLE or DATASET.TABLE.')) args, pipeline_args = parser.parse_known_args(argv) options = PipelineOptions(pipeline_args) options.view_as(SetupOptions).save_main_session = True options.view_as(StandardOptions).streaming = True p = beam.Pipeline(options=options) records = (p | 'Read from PubSub' >> beam.io.ReadFromPubSub(topic=args.stream) | 'Parse JSON to Dict' >> beam.Map(json.loads)) """ # Write to the warehouse table records | 'Write to BigQuery' >> beam.io.WriteToBigQuery( args.sink, schema=Schema.get_warehouse_schema(), create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND) """ # Compute average in a sliding window and write to BQ average table (records | 'Add timestamp' >> beam.ParDo(AddTimestampToDict()) | 'Window' >> beam.WindowInto(beam.window.SlidingWindows(10, 1, offset=0)) | 'Dict to KeyValue' >> beam.ParDo(AddKeyToDict()) | 'Group by Key' >> beam.GroupByKey() | 'Average' >> beam.ParDo(CountAverages()) | 'Write Avg to BigQuery' >> beam.io.WriteToBigQuery( args.sink, schema=Schema.get_warehouse_schema(), create_disposition=BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=BigQueryDisposition.WRITE_APPEND)) result = p.run() result.wait_until_finish()
def test_setting_session_windows(self):
  with TestPipeline() as p:
    unkeyed_items = p | beam.Create([2, 11, 16, 27])
    items = (unkeyed_items
             | 'key' >> beam.Map(
                 lambda x: beam.window.TimestampedValue(('k', x), x)))
    # [START setting_session_windows]
    from apache_beam import window
    session_windowed_items = (
        items | 'window' >> beam.WindowInto(window.Sessions(10)))
    # [END setting_session_windows]
    summed = (session_windowed_items
              | 'group' >> beam.GroupByKey()
              | 'combine' >> beam.CombineValues(sum))
    unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
    assert_that(unkeyed, equal_to([29, 27]))

def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
  outputs = (destination_data_kv_pc
             | beam.ParDo(
                 WriteRecordsToFile(
                     max_files_per_bundle=self.max_files_per_bundle,
                     max_file_size=self.max_file_size,
                     coder=self.coder),
                 file_prefix=file_prefix_pcv).with_outputs(
                     WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
                     WriteRecordsToFile.WRITTEN_FILE_TAG))

  # A PCollection of (destination, file) tuples. It lists files with records,
  # and the destination each file is meant to be imported into.
  destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

  # A PCollection of (destination, record) tuples. These are later sharded,
  # grouped, and all records for each destination-shard are written to files.
  # This PCollection is necessary because not all records can be written into
  # files in ``WriteRecordsToFile``.
  unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

  more_destination_files_kv_pc = (
      unwritten_records_pc
      | beam.ParDo(_ShardDestinations())
      | "GroupShardedRows" >> beam.GroupByKey()
      | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
      | "WriteGroupedRecordsToFile" >> beam.ParDo(
          WriteGroupedRecordsToFile(coder=self.coder),
          file_prefix=file_prefix_pcv))

  all_destination_file_pairs_pc = (
      (destination_files_kv_pc, more_destination_files_kv_pc)
      | "DestinationFilesUnion" >> beam.Flatten())

  if self.is_streaming_pipeline:
    # Apply the user's trigger back before we start triggering load jobs
    all_destination_file_pairs_pc = (
        all_destination_file_pairs_pc
        | "ApplyUserTrigger" >> beam.WindowInto(
            beam.window.GlobalWindows(),
            trigger=trigger.Repeatedly(
                trigger.AfterAll(
                    trigger.AfterProcessingTime(self.triggering_frequency),
                    trigger.AfterCount(1))),
            accumulation_mode=trigger.AccumulationMode.DISCARDING))
  return all_destination_file_pairs_pc

def test_reshuffle_global_window(self):
  pipeline = TestPipeline()
  data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
  expected_data = [(1, [1, 2, 4]), (2, [1, 2]), (3, [1])]
  before_reshuffle = (pipeline
                      | beam.Create(data)
                      | beam.WindowInto(GlobalWindows())
                      | beam.GroupByKey()
                      | beam.MapTuple(lambda k, vs: (k, sorted(vs))))
  assert_that(before_reshuffle, equal_to(expected_data),
              label='before_reshuffle')
  after_reshuffle = before_reshuffle | beam.Reshuffle()
  assert_that(after_reshuffle, equal_to(expected_data),
              label='after reshuffle')
  pipeline.run()

def test_after_count(self):
  p = Pipeline('DirectRunner')
  result = (p
            | beam.Create([1, 2, 3, 4, 5, 10, 11])
            | beam.FlatMap(lambda t: [('A', t), ('B', t + 5)])
            | beam.MapTuple(lambda k, t: TimestampedValue((k, t), t))
            | beam.WindowInto(FixedWindows(10), trigger=AfterCount(3),
                              accumulation_mode=AccumulationMode.DISCARDING)
            | beam.GroupByKey()
            | beam.MapTuple(lambda k, v: ('%s-%s' % (k, len(v)), set(v))))
  assert_that(result, equal_to(
      {
          'A-5': {1, 2, 3, 4, 5},
          # A-10, A-11 never emitted due to AfterCount(3) never firing.
          'B-4': {6, 7, 8, 9},
          'B-3': {10, 15, 16},
      }.items()))

def test_reshuffle_streaming_global_window(self):
  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  with TestPipeline(options=options) as pipeline:
    data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
    expected_data = [(1, [1, 2, 4]), (2, [1, 2]), (3, [1])]
    before_reshuffle = (
        pipeline
        | beam.Create(data)
        | beam.WindowInto(GlobalWindows())
        | beam.GroupByKey()
        | beam.MapTuple(lambda k, vs: (k, sorted(vs))))
    assert_that(
        before_reshuffle, equal_to(expected_data), label='before_reshuffle')
    after_reshuffle = before_reshuffle | beam.Reshuffle()
    assert_that(
        after_reshuffle, equal_to(expected_data), label='after reshuffle')

def test_setting_fixed_windows(self):
  with TestPipeline() as p:
    unkeyed_items = p | beam.Create([22, 33, 55, 100, 115, 120])
    items = (unkeyed_items
             | 'key' >> beam.Map(
                 lambda x: beam.window.TimestampedValue(('k', x), x)))
    # [START setting_fixed_windows]
    from apache_beam import window
    fixed_windowed_items = (
        items | 'window' >> beam.WindowInto(window.FixedWindows(60)))
    # [END setting_fixed_windows]
    summed = (fixed_windowed_items
              | 'group' >> beam.GroupByKey()
              | 'combine' >> beam.CombineValues(sum))
    unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
    assert_that(unkeyed, equal_to([110, 215, 120]))

def load(events, metadata=None):
  return (
      events
      | nexmark_query_util.JustBids()
      | 'query12_extract_bidder' >> beam.Map(lambda bid: bid.bidder)
      # windowing with processing time trigger, currently not supported in
      # batch
      | beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(
              trigger.AfterProcessingTime(metadata.get('window_size_sec'))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=0)
      | 'query12_bid_count' >> beam.combiners.Count.PerElement()
      | 'query12_output' >> beam.Map(
          lambda t: {
              ResultNames.BIDDER_ID: t[0], ResultNames.BID_COUNT: t[1]
          }))

def test_setting_global_window(self):
  p = TestPipeline()
  unkeyed_items = p | beam.Create([2, 11, 16, 27])
  items = (unkeyed_items
           | 'key' >> beam.Map(
               lambda x: beam.window.TimestampedValue(('k', x), x)))
  # [START setting_global_window]
  from apache_beam import window
  global_windowed_items = (
      items | 'window' >> beam.WindowInto(window.GlobalWindows()))
  # [END setting_global_window]
  summed = (global_windowed_items
            | 'group' >> beam.GroupByKey()
            | 'combine' >> beam.CombineValues(sum))
  unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
  assert_that(unkeyed, equal_to([56]))
  p.run()

def test_setting_sliding_windows(self):
  p = TestPipeline()
  unkeyed_items = p | beam.Create([2, 16, 23])
  items = (unkeyed_items
           | 'key' >> beam.Map(
               lambda x: beam.window.TimestampedValue(('k', x), x)))
  # [START setting_sliding_windows]
  from apache_beam import window
  sliding_windowed_items = (
      items | 'window' >> beam.WindowInto(window.SlidingWindows(30, 5)))
  # [END setting_sliding_windows]
  summed = (sliding_windowed_items
            | 'group' >> beam.GroupByKey()
            | 'combine' >> beam.CombineValues(sum))
  unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
  assert_that(unkeyed,
              equal_to([2, 2, 2, 18, 23, 39, 39, 39, 41, 41]))
  p.run()

def side_input_slow_update(
    src_file_pattern,
    first_timestamp,
    last_timestamp,
    interval,
    sample_main_input_elements,
    main_input_windowing_interval):
  # [START SideInputSlowUpdateSnip1]
  from apache_beam.transforms.periodicsequence import PeriodicImpulse
  from apache_beam.transforms.window import TimestampedValue
  from apache_beam.transforms import window

  # from apache_beam.utils.timestamp import MAX_TIMESTAMP
  # last_timestamp = MAX_TIMESTAMP to go on indefinitely

  # Any user-defined function.
  # cross join is used as an example.
  def cross_join(left, rights):
    for x in rights:
      yield (left, x)

  # Create pipeline.
  pipeline_options = PipelineOptions()
  p = beam.Pipeline(options=pipeline_options)
  side_input = (
      p
      | 'PeriodicImpulse' >> PeriodicImpulse(
          first_timestamp, last_timestamp, interval, True)
      | 'MapToFileName' >> beam.Map(lambda x: src_file_pattern + str(x))
      | 'ReadFromFile' >> beam.io.ReadAllFromText())

  main_input = (
      p
      | 'MpImpulse' >> beam.Create(sample_main_input_elements)
      | 'MapMpToTimestamped' >> beam.Map(lambda src: TimestampedValue(src, src))
      | 'WindowMpInto' >> beam.WindowInto(
          window.FixedWindows(main_input_windowing_interval)))

  result = (
      main_input
      | 'ApplyCrossJoin' >> beam.FlatMap(
          cross_join, rights=beam.pvalue.AsIter(side_input)))
  # [END SideInputSlowUpdateSnip1]
  return p, result

def test_gbk_execution_no_triggers(self):
  test_stream = (TestStream()
                 .advance_watermark_to(10)
                 .add_elements(['a', 'b', 'c'])
                 .advance_watermark_to(20)
                 .add_elements(['d'])
                 .add_elements(['e'])
                 .advance_processing_time(10)
                 .advance_watermark_to(300)
                 .add_elements([TimestampedValue('late', 12)])
                 .add_elements([TimestampedValue('last', 310)])
                 .advance_watermark_to_infinity())

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  records = (p
             | test_stream
             | beam.WindowInto(FixedWindows(15))
             | beam.Map(lambda x: ('k', x))
             | beam.GroupByKey())

  # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
  # respect the TimestampCombiner. The test below should also verify the
  # timestamps of the outputted elements once this is implemented.

  # assert per window
  expected_window_to_elements = {
      window.IntervalWindow(0, 15): [
          ('k', ['a', 'b', 'c']),
          ('k', ['late']),
      ],
      window.IntervalWindow(15, 30): [
          ('k', ['d', 'e']),
      ],
      window.IntervalWindow(300, 315): [
          ('k', ['last']),
      ],
  }
  assert_that(
      records,
      equal_to_per_window(expected_window_to_elements),
      use_global_window=False,
      label='assert per window')

  p.run()

def expand(self, pcoll):
  return (
      pcoll
      # Assigns window info to each Pub/Sub message based on its
      # publish timestamp.
      | "Window into Fixed Intervals" >> beam.WindowInto(
          window.FixedWindows(self.window_size))
      | "Add timestamps to messages" >> beam.ParDo(AddTimestamps())
      # Use a dummy key to group the elements in the same window.
      # Note that all the elements in one window must fit into memory
      # for this. If the windowed elements do not fit into memory,
      # please consider using `beam.util.BatchElements`.
      # https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.util.html#apache_beam.transforms.util.BatchElements
      | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
      | "Groupby" >> beam.GroupByKey()
      | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val))

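# A minimal sketch, not part of the transform above, of the alternative its
# comment points to: batching with beam.BatchElements instead of the dummy-key
# GroupByKey, so one window's elements never have to be held as a single list.
# The class name BatchMessagesByFixedWindows and the max_batch_size value are
# hypothetical illustrations, not taken from the original code.
import apache_beam as beam
from apache_beam.transforms import window


class BatchMessagesByFixedWindows(beam.PTransform):
  def __init__(self, window_size, max_batch_size=500):
    self.window_size = window_size
    self.max_batch_size = max_batch_size

  def expand(self, pcoll):
    return (
        pcoll
        | "Window into Fixed Intervals" >> beam.WindowInto(
            window.FixedWindows(self.window_size))
        # BatchElements buffers elements and emits them as lists, so no dummy
        # key or GroupByKey is needed; list sizes are capped by max_batch_size.
        | "Batch" >> beam.BatchElements(max_batch_size=self.max_batch_size))
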
def run(argv=None):
  # Use Python argparse module to parse custom arguments
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://rim-bucket/market.txt',
                      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      # CHANGE 1/5: The Google Cloud Storage path is required
      # for outputting the results.
      default='gs://rim-bucket/output/',
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p_options = PipelineOptions(pipeline_args)
  google_cloud_options = p_options.view_as(GoogleCloudOptions)
  google_cloud_options.region = 'europe-west1'
  google_cloud_options.project = 'smartlive'
  '''google_cloud_options.job_name = 'dataflow-job-{}'.format(
      datetime.datetime.now().strftime("%Y-%m-%d%H%M%S")
  )'''
  google_cloud_options.staging_location = 'gs://rim-bucket/binaries'
  google_cloud_options.temp_location = 'gs://rim-bucket/temp'

  p_options.view_as(StandardOptions).runner = 'DirectRunner'
  p_options.view_as(SetupOptions).save_main_session = True
  p_options.view_as(StandardOptions).streaming = True
  p_options.view_as(WorkerOptions).subnetwork = (
      'regions/europe-west1/subnetworks/test')

  p = beam.Pipeline(options=p_options)

  lines = (p
           | 'receive_data' >> beam.io.ReadFromText(known_args.input)
           | 'window' >> beam.WindowInto(window.GlobalWindows())
           | 'jsonload' >> beam.Map(lambda x: json.loads(x))
           | 'count' >> beam.Map(lambda x: len(x))
           | 'printnbrarticles' >> beam.ParDo(PrintFn()))

  # ----- Fixed window + AfterWatermark trigger + accumulating mode ----- #
  (lines | 'CountGlobally' >> beam.CombineGlobally(
      beam.combiners.CountCombineFn()).without_defaults())

  p.run().wait_until_finish()

def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  self._log_startup(input_dict, output_dict, exec_properties)

  example_uris = {}
  for example in input_dict['examples']:
    for split in artifact_utils.decode_split_names(example.split_names):
      example_uris[split] = os.path.join(example.uri, split)

  model = artifact_utils.get_single_instance(input_dict['model'])
  model_path = path_utils.serving_model_path(model.uri)
  absl.logging.info('Using {} as current model.'.format(model_path))

  output_uri = os.path.join(
      artifact_utils.get_single_uri(output_dict['output_data']), 'pred.csv')

  with self._make_beam_pipeline() as pipeline:
    test_data = []
    for split, example_uri in example_uris.items():
      test_data.append(
          pipeline
          | 'ReadFromTFRecord_{}'.format(split) >> beam.io.ReadFromTFRecord(
              file_pattern=io_utils.all_files_pattern(example_uri)))
    (test_data
     | 'Flattern' >> beam.Flatten()
     | 'ParseToExample' >> beam.Map(tf.train.Example.FromString)
     | 'Prediction' >> beam.ParDo(
         RunModel(model_path, 'serving_default', 'PassengerId'))
     | 'ParseToKVPair' >> beam.Map(lambda x: ParseResultToKV(x))
     | 'AddSameKey' >> beam.Map(lambda x: (1, x))
     | 'Window' >> beam.WindowInto(beam.window.GlobalWindows())
     | 'GroupByKey' >> beam.GroupByKey()
     | 'Sort' >> beam.Map(
         lambda group_data: sorted(group_data[1], key=lambda x: x[0]))
     | 'Flatten' >> beam.FlatMap(lambda x: x)
     | 'ToStr' >> beam.Map(
         lambda x: '{},{}'.format(x[0], '0' if x[1] < 0.5 else '1'))
     | 'WriteToFile' >> beam.io.WriteToText(
         output_uri,
         num_shards=1,
         shard_name_template='',
         header='PassengerId,Survived'))

  absl.logging.info('TestPredComponent result written to %s.', output_uri)

def test_reshuffle_window_fn_preserved(self):
  pipeline = TestPipeline()
  data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
  expected_windows = [
      TestWindowedValue(v, t, [w])
      for (v, t, w) in [((1, 1), 1.0, IntervalWindow(1.0, 3.0)),
                        ((2, 1), 1.0, IntervalWindow(1.0, 3.0)),
                        ((3, 1), 1.0, IntervalWindow(1.0, 3.0)),
                        ((1, 2), 2.0, IntervalWindow(2.0, 4.0)),
                        ((2, 2), 2.0, IntervalWindow(2.0, 4.0)),
                        ((1, 4), 4.0, IntervalWindow(4.0, 6.0))]
  ]
  expected_merged_windows = [
      TestWindowedValue(v, t - .001, [w])
      for (v, t, w) in [((1, contains_in_any_order([2, 1])), 4.0,
                         IntervalWindow(1.0, 4.0)),
                        ((2, contains_in_any_order([2, 1])), 4.0,
                         IntervalWindow(1.0, 4.0)),
                        ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
                        ((1, [4]), 6.0, IntervalWindow(4.0, 6.0))]
  ]
  before_reshuffle = (
      pipeline
      | 'start' >> beam.Create(data)
      | 'add_timestamp' >> beam.Map(lambda v: TimestampedValue(v, v[1]))
      | 'window' >> beam.WindowInto(Sessions(gap_size=2)))
  assert_that(before_reshuffle, equal_to(expected_windows),
              label='before_reshuffle', reify_windows=True)
  after_reshuffle = before_reshuffle | beam.Reshuffle()
  assert_that(after_reshuffle, equal_to(expected_windows),
              label='after_reshuffle', reify_windows=True)
  after_group = after_reshuffle | beam.GroupByKey()
  assert_that(after_group, equal_to(expected_merged_windows),
              label='after_group', reify_windows=True)
  pipeline.run()

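# The test above relies on a `contains_in_any_order` helper defined elsewhere
# in its test module. A rough, hypothetical sketch of such a matcher follows:
# an object that compares equal to any iterable holding the same items
# regardless of order. The actual implementation in the source may differ.
import collections


class contains_in_any_order(object):
  def __init__(self, iterable):
    self._counter = collections.Counter(iterable)

  def __eq__(self, other):
    # Equal when `other` yields the same items with the same multiplicities.
    return self._counter == collections.Counter(other)

  def __ne__(self, other):
    return not self == other

  def __hash__(self):
    return hash(frozenset(self._counter.items()))

  def __repr__(self):
    return 'contains_in_any_order(%r)' % list(self._counter.elements())
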
def test_equal_to_per_window_fail_unexpected_element(self):
  with self.assertRaises(BeamAssertException):
    start = int(MIN_TIMESTAMP.micros // 1e6) - 5
    end = start + 20
    expected = {
        window.IntervalWindow(start, end): [('k', [1])],
    }
    with TestPipeline(options=StandardOptions(streaming=True)) as p:
      assert_that(
          (p
           | Create([1, 2])
           | beam.WindowInto(
               FixedWindows(20),
               trigger=trigger.AfterWatermark(),
               accumulation_mode=trigger.AccumulationMode.DISCARDING)
           | beam.Map(lambda x: ('k', x))
           | beam.GroupByKey()),
          equal_to_per_window(expected),
          reify_windows=True)

def test_reshuffle_sliding_window(self):
  pipeline = TestPipeline()
  data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
  window_size = 2
  expected_data = [(1, [1, 2, 4]), (2, [1, 2]), (3, [1])] * window_size
  before_reshuffle = (pipeline
                      | beam.Create(data)
                      | beam.WindowInto(
                          SlidingWindows(size=window_size, period=1))
                      | beam.GroupByKey())
  assert_that(before_reshuffle, equal_to(expected_data),
              label='before_reshuffle')
  after_reshuffle = before_reshuffle | beam.Reshuffle()
  # If Reshuffle applies the sliding window function a second time there
  # should be extra values for each key.
  assert_that(after_reshuffle, equal_to(expected_data),
              label='after reshuffle')
  pipeline.run()

def build_read_pipeline(self, pipeline):
  _ = (pipeline
       | 'ReadFromKafka' >> ReadFromKafka(
           consumer_config={
               'bootstrap.servers': self.bootstrap_servers,
               'auto.offset.reset': 'earliest'
           },
           topics=[self.topic],
           expansion_service=self.expansion_service)
       | 'Windowing' >> beam.WindowInto(
           beam.window.FixedWindows(300),
           trigger=beam.transforms.trigger.AfterProcessingTime(60),
           accumulation_mode=beam.transforms.trigger.AccumulationMode
           .DISCARDING)
       | 'DecodingValue' >> beam.Map(lambda elem: int(elem[1].decode()))
       | 'CombineGlobally' >> beam.CombineGlobally(sum).without_defaults()
       | 'SetSumCounter' >> beam.Map(self.sum_counter.inc))

def test_reshuffle_streaming_global_window(self):
  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  pipeline = TestPipeline(options=options)
  data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
  expected_data = [(1, [1, 2, 4]), (2, [1, 2]), (3, [1])]
  before_reshuffle = (pipeline
                      | 'start' >> beam.Create(data)
                      | 'window' >> beam.WindowInto(GlobalWindows())
                      | 'group_by_key' >> beam.GroupByKey())
  assert_that(before_reshuffle, equal_to(expected_data),
              label='before_reshuffle')
  after_reshuffle = (before_reshuffle
                     | 'reshuffle' >> beam.Reshuffle())
  assert_that(after_reshuffle, equal_to(expected_data),
              label='after reshuffle')
  pipeline.run()

def expand(self, pcoll):
  events = pcoll | beam.WindowInto(self.auction_or_bid_windowFn)

  auction_by_id = (
      events
      | nexmark_query_util.JustAuctions()
      | 'auction_by_id' >> beam.ParDo(nexmark_query_util.AuctionByIdFn()))
  bids_by_auction_id = (
      events
      | nexmark_query_util.JustBids()
      | 'bid_by_auction' >> beam.ParDo(nexmark_query_util.BidByAuctionIdFn()))

  return ({
      nexmark_query_util.AUCTION_TAG: auction_by_id,
      nexmark_query_util.BID_TAG: bids_by_auction_id
  }
          | beam.CoGroupByKey()
          | beam.ParDo(JoinAuctionBidFn()))

def pardo_dofn_params(test=None):
  # [START pardo_dofn_params]
  import apache_beam as beam

  # pylint: disable=line-too-long
  class AnalyzeElement(beam.DoFn):
    def process(
        self,
        elem,
        timestamp=beam.DoFn.TimestampParam,
        window=beam.DoFn.WindowParam):
      yield '\n'.join([
          '# timestamp',
          'type(timestamp) -> ' + repr(type(timestamp)),
          'timestamp.micros -> ' + repr(timestamp.micros),
          'timestamp.to_rfc3339() -> ' + repr(timestamp.to_rfc3339()),
          'timestamp.to_utc_datetime() -> ' + repr(timestamp.to_utc_datetime()),
          '',
          '# window',
          'type(window) -> ' + repr(type(window)),
          'window.start -> {} ({})'.format(
              window.start, window.start.to_utc_datetime()),
          'window.end -> {} ({})'.format(
              window.end, window.end.to_utc_datetime()),
          'window.max_timestamp() -> {} ({})'.format(
              window.max_timestamp(),
              window.max_timestamp().to_utc_datetime()),
      ])
  # pylint: enable=line-too-long

  with beam.Pipeline() as pipeline:
    dofn_params = (
        pipeline
        | 'Create a single test element' >> beam.Create([':)'])
        | 'Add timestamp (Spring equinox 2020)' >> beam.Map(
            lambda elem: beam.window.TimestampedValue(elem, 1584675660))
        | 'Fixed 30sec windows' >> beam.WindowInto(
            beam.window.FixedWindows(30))
        | 'Analyze element' >> beam.ParDo(AnalyzeElement())
        | beam.Map(print))
    # [END pardo_dofn_params]
    if test:
      test(dofn_params)

def test_gbk_execution_no_triggers(self):
  test_stream = (TestStream()
                 .advance_watermark_to(10)
                 .add_elements(['a', 'b', 'c'])
                 .advance_watermark_to(20)
                 .add_elements(['d'])
                 .add_elements(['e'])
                 .advance_processing_time(10)
                 .advance_watermark_to(300)
                 .add_elements([TimestampedValue('late', 12)])
                 .add_elements([TimestampedValue('last', 310)]))

  # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
  global result  # pylint: disable=global-variable-undefined
  result = []

  def fired_elements(elem):
    result.append(elem)
    return elem

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  records = (p
             | test_stream
             | beam.WindowInto(FixedWindows(15))
             | beam.Map(lambda x: ('k', x))
             | beam.GroupByKey()
             | beam.Map(fired_elements))
  # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
  # respect the TimestampCombiner. The test below should also verify the
  # timestamps of the outputted elements once this is implemented.
  assert_that(records, equal_to([
      ('k', ['a', 'b', 'c']),
      ('k', ['d', 'e']),
      ('k', ['late']),
      ('k', ['last'])]))

  p.run()
  # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
  self.assertEqual([
      ('k', ['a', 'b', 'c']),
      ('k', ['d', 'e']),
      ('k', ['late']),
      ('k', ['last'])], result)

def run(argv=None):
  from apache_beam.transforms.window import TimestampedValue, FixedWindows

  pubsub_input_topic = 'projects/professionaldataengineercourse/topics/faces_on_images'

  with beam.Pipeline(options=get_pipeline_options()) as pipeline:
    logging.info("pubsub_input_topic = {}".format(pubsub_input_topic))

    json_messages = (
        pipeline
        | 'ReadFromPubSubTopic' >> beam.io.ReadFromPubSub(
            topic=pubsub_input_topic).with_output_types(bytes)
        | 'DecodeMessagesFromPubSub' >> beam.Map(decode_message))

    window_size_s = 30
    allowed_lateness_s = 60
    high_confidence_faces_grouped_by_emotion_count_per_window = (
        json_messages
        | 'ParseJsonMessage' >> beam.Map(parse_jsons)
        | 'FilterHighFaceConfidence' >> beam.ParDo(
            FilterHighConfidenceFacesDoFn())
        | 'FlatMapFAcesWithHighEmotionLikelihood' >> beam.FlatMap(
            get_faces_with_high_emotion_likelihood)
        | 'UseCustomTimestamp' >> beam.Map(
            lambda face_info: TimestampedValue(
                face_info, face_info['ts_seconds']))
        | 'WindowFaceInfo' >> beam.WindowInto(
            FixedWindows(window_size_s, 0),
            trigger=AfterWatermark(
                early=AfterAny(AfterCount(5), AfterProcessingTime(10)),
                late=AfterAll(AfterCount(2), AfterProcessingTime(20))),
            allowed_lateness=allowed_lateness_s,
            accumulation_mode=AccumulationMode.DISCARDING)
        | 'PairEmotionWithFace' >> beam.Map(
            lambda face_info: (face_info['emotion'], face_info))
        | 'GroupByEmotion' >> beam.GroupByKey()
        | 'FormatOutputForBigQuery' >> beam.ParDo(FormatFaceInfoPerWindow()))

    log_p_collection(
        high_confidence_faces_grouped_by_emotion_count_per_window,
        "OutputToBigQuery")

    (high_confidence_faces_grouped_by_emotion_count_per_window
     | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
         bq_faces_windowed_table_name,
         schema={"fields": bq_faces_windowed_table_schema},
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    pipeline_result = pipeline.run()
    pipeline_result.wait_until_finish()

def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--output_topic',
      default='projects/complete-rush-206308/topics/salesstream',
      help=('Output PubSub topic of the form '
            '"projects/<PROJECT>/topic/<TOPIC>".'))
  parser.add_argument(
      '--input_topic',
      default='projects/complete-rush-206308/topics/salesstream',
      help=('Input PubSub topic of the form '))
  parser.add_argument(
      '--input_subscription',
      default='projects/complete-rush-206308/subscriptions/salesReceiver',
      help=('Input PubSub subscription of the form '))
  parser.add_argument('--output',
                      dest='output',
                      default='gs://sales_bkt/output/',
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_args.extend([
      '--runner=DataflowRunner',
      '--project=complete-rush-206308',
      '--staging_location=gs://sales_bkt/stg',
      '--temp_location=gs://sales_bkt/tmp',
      '--job_name=myslaesprostream',
  ])

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(StandardOptions).streaming = True

  with beam.Pipeline(options=pipeline_options) as p:
    output = (
        p
        #| beam.io.ReadStringsFromPubSub(subscription=known_args.input_subscription)
        | beam.io.ReadStringsFromPubSub(topic=known_args.input_topic)
        #| beam.FlatMap(parse_record, filtered)
        | beam.ParDo(ParseRecordDoFn())
        | beam.WindowInto(window.FixedWindows(15, 0))
        | beam.CombinePerKey(sum))

    output | beam.io.WriteStringsToPubSub(known_args.output_topic)
    #output | WriteToText(known_args.output)

  print(start_time)

def run(bootstrap_servers, topic, pipeline_args):
  # bootstrap_servers = '123.45.67.89:123:9092'
  # topic = 'kafka_taxirides_realtime'
  # pipeline_args = ['--project', 'my-project',
  #                  '--runner', 'DataflowRunner',
  #                  '--temp_location', 'my-temp-location',
  #                  '--region', 'my-region',
  #                  '--num_workers', 'my-num-workers',
  #                  '--experiments', 'use_runner_v2']
  pipeline_options = PipelineOptions(
      pipeline_args, save_main_session=True, streaming=True)
  window_size = 15  # size of the Window in seconds.

  def log_ride(ride_bytes):
    # Converting bytes record from Kafka to a dictionary.
    import ast
    ride = ast.literal_eval(ride_bytes.decode("UTF-8"))
    logging.info(
        'Found ride at latitude %r and longitude %r with %r '
        'passengers', ride['latitude'], ride['longitude'],
        ride['passenger_count'])

  with beam.Pipeline(options=pipeline_options) as pipeline:
    _ = (
        pipeline
        | beam.io.ReadFromPubSub(
            topic='projects/pubsub-public-data/topics/taxirides-realtime').
        with_output_types(bytes)
        | beam.Map(lambda x: (b'', x)).with_output_types(
            typing.Tuple[bytes, bytes])  # Kafka write transforms expect KVs.
        | beam.WindowInto(beam.window.FixedWindows(window_size))
        | WriteToKafka(
            producer_config={'bootstrap.servers': bootstrap_servers},
            topic=topic))

    _ = (
        pipeline
        | ReadFromKafka(
            consumer_config={'bootstrap.servers': bootstrap_servers},
            topics=[topic])
        | beam.FlatMap(lambda kv: log_ride(kv[1])))

def expand(self, pcoll):
  ret = (
      pcoll
      | beam.WindowInto(beam.window.GlobalWindows())

      # First get the initial timing information. This will be used to start
      # the periodic timers which will generate processing time and watermark
      # advancements every `sample_resolution_sec`.
      | 'initial timing' >> PairWithTiming()

      # Next, map every element to the same key so that only a single timer is
      # started for this given ReverseTestStream.
      | 'first key' >> beam.Map(lambda x: (0, x))

      # Next, pass-through each element which will be paired with its timing
      # info in the next step. Also, start the periodic timers. We use timers
      # in this situation to capture watermark advancements that occur when
      # there are no elements being produced upstream.
      | beam.ParDo(
          _TimingEventGenerator(
              output_tag=self._output_tag,
              sample_resolution_sec=self._sample_resolution_sec))

      # Next, retrieve the timing information for watermark events that were
      # generated in the previous step. This is because elements generated
      # through the timers don't have their timing information yet.
      | 'timing info for watermarks' >> PairWithTiming()

      # Re-key to the same key to keep global state.
      | 'second key' >> beam.Map(lambda x: (0, x))

      # Format the events properly.
      | beam.ParDo(_TestStreamFormatter(self._coder, self._output_format)))

  if self._output_format == OutputFormat.SERIALIZED_TEST_STREAM_FILE_RECORDS:

    def serializer(e):
      return e.SerializeToString()

    ret = ret | 'serializer' >> beam.Map(serializer)

  return ret

def run():
  pipeline_options = PipelineOptions(streaming=True)
  resolution = pipeline_options.view_as(MyOptions).resolution.get()

  with beam.Pipeline(options=pipeline_options) as p:
    subscription_id = (
        'projects/iex-stream/subscriptions/iex-aggregate-' + str(resolution))
    lines = (p
             | beam.io.ReadFromPubSub(
                 subscription=subscription_id).with_output_types(bytes)
             | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))
             | beam.Map(json.loads))

    schema = ('symbol:STRING,latest_price:FLOAT,window_end:TIMESTAMP,'
              'event_time:TIMESTAMP,resolution_minutes:INTEGER')

    (lines
     | 'CreateWindow' >> beam.WindowInto(
         SlidingWindows(60 * resolution, 10, 5))
     | 'AddWindowEndTimestamp' >> beam.ParDo(
         AddTimestamp(resolution=resolution))
     | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
         'iex.quote', schema=schema))

def load(events, metadata=None):
  return (
      events
      | nexmark_query_util.JustBids()
      | 'query5_sliding_window' >> beam.WindowInto(
          window.SlidingWindows(
              metadata.get('window_size_sec'),
              metadata.get('window_period_sec')))
      # project out only the auction id for each bid
      | 'extract_bid_auction' >> beam.Map(lambda bid: bid.auction)
      | 'bid_count_per_auction' >> beam.combiners.Count.PerElement()
      | 'bid_max_count' >> beam.CombineGlobally(
          MostBidCombineFn()).without_defaults()
      # TODO(leiyiz): fanout with sliding window produces duplicated results,
      #   uncomment after it is fixed [BEAM-10617]
      # .with_fanout(metadata.get('fanout'))
      | beam.FlatMap(
          lambda auc_count: [{
              ResultNames.AUCTION_ID: auction, ResultNames.NUM: auc_count[1]
          } for auction in auc_count[0]]))