def expand(self, pvalue):
  return (
      pvalue
      | 'ExpandIntoRanges' >> ParDo(
          _ExpandIntoRanges(
              self._splittable,
              self._compression_type,
              self._desired_bundle_size,
              self._min_bundle_size))
      | 'Reshard' >> Reshuffle()
      | 'ReadRange' >> ParDo(_ReadRange(self._source_from_file)))
def expand(self, pcoll):
  class ReifyTimestamps(DoFn):
    def process(self, element, timestamp=DoFn.TimestampParam):
      yield element[0], TimestampedValue(element[1], timestamp)

  class RestoreTimestamps(DoFn):
    def process(self, element, window=DoFn.WindowParam):
      # Pass the current window since _IdentityWindowFn wouldn't know how
      # to generate it.
      yield windowed_value.WindowedValue(
          (element[0], element[1].value), element[1].timestamp, [window])

  windowing_saved = pcoll.windowing
  # The linter is confused.
  # pylint: disable=abstract-class-instantiated
  result = (
      pcoll
      | ParDo(ReifyTimestamps())
      | 'IdentityWindow' >> WindowInto(
          _IdentityWindowFn(windowing_saved.windowfn.get_window_coder()),
          trigger=AfterCount(1),
          accumulation_mode=AccumulationMode.DISCARDING,
          timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST,
      )
      | GroupByKey()
      | 'ExpandIterable' >> FlatMap(
          lambda e: [(e[0], value) for value in e[1]])
      | ParDo(RestoreTimestamps()))
  result._windowing = windowing_saved
  return result
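A minimal usage sketch for the expansion above, assuming it backs the public apache_beam.Reshuffle transform; the file names and fan-out count are hypothetical, chosen only to show where a reshard helps break fusion:

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(['a.txt', 'b.txt'])  # hypothetical inputs
      # Fan out each element; without a reshard, the step below would stay
      # fused to this one and run on the same workers.
      | beam.FlatMap(lambda name: ((name, i) for i in range(1000)))
      | beam.Reshuffle()  # redistribute elements across workers
      | beam.Map(lambda kv: kv))  # stand-in for expensive per-element work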
def test_fixed_after_count_accumulating(self):
  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([('k1', 1), ('k1', 1), ('k2', 1), ('k2', 1)])
      .add_elements([('k1', 1), ('k1', 1)])
      .advance_watermark_to(2)
      .add_elements([('k1', 2), ('k2', 2)])  # These values are discarded.
      .advance_watermark_to_infinity())
  # yapf: enable

  # Fixed, two-second windows with a trigger that fires repeatedly after
  # every two elements, accumulating the fired panes.
  windowing = Windowing(
      FixedWindows(2),
      triggerfn=Repeatedly(AfterCount(2)),
      accumulation_mode=AccumulationMode.ACCUMULATING)
  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(0, 2), [1, 1]),
            ('k2', IntervalWindow(0, 2), [1, 1]),
            ('k1', IntervalWindow(0, 2), [1, 1, 1, 1]),
        ]))
def expand(self, pcoll):
  if getattr(pcoll.pipeline.runner, 'is_streaming', False):
    raise NotImplementedError("Requires stateful processing (BEAM-2687)")
  elif pcoll.windowing.is_default():
    # This is the same logic as _WindowAwareBatchingDoFn, but optimized
    # for the simpler global-windows case.
    return pcoll | ParDo(
        _GlobalWindowsBatchingDoFn(self._batch_size_estimator))
  else:
    return pcoll | ParDo(_WindowAwareBatchingDoFn(self._batch_size_estimator))
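A minimal usage sketch, assuming the expand above backs apache_beam's BatchElements transform; batch sizes are tuned at runtime, so the grouping shown in the comment is illustrative, not guaranteed:

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(range(10))
      | beam.BatchElements(min_batch_size=2, max_batch_size=4)
      | beam.Map(print))  # e.g. [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]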
def expand(self, pvalue):
  pvalue = (
      pvalue
      | 'ExpandIntoRanges' >> ParDo(
          _ExpandIntoRanges(
              self._splittable,
              self._compression_type,
              self._desired_bundle_size,
              self._min_bundle_size)))
  if self._is_reshuffle:
    pvalue = pvalue | 'Reshard' >> Reshuffle()
  return (
      pvalue
      | 'ReadRange' >> ParDo(
          _ReadRange(
              self._source_from_file, with_filename=self._with_filename)))
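A minimal usage sketch: file-reading composites built on this kind of expansion, such as apache_beam.io.textio.ReadAllFromText, consume a PCollection of file patterns. The bucket path below is hypothetical:

import apache_beam as beam
from apache_beam.io.textio import ReadAllFromText

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(['gs://my-bucket/logs/*.txt'])  # hypothetical pattern
      | ReadAllFromText()  # expands patterns into ranges, then reads them
      | beam.Map(print))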
def expand(self, pcoll):
  sdf = self._ptransform.fn
  signature = DoFnSignature(sdf)
  restriction_coder = signature.get_restriction_coder()
  element_coder = typecoders.registry.get_coder(pcoll.element_type)

  keyed_elements = (
      pcoll
      | 'pair' >> ParDo(PairWithRestrictionFn(sdf))
      | 'split' >> ParDo(SplitRestrictionFn(sdf))
      | 'explode' >> ParDo(ExplodeWindowsFn())
      | 'random' >> ParDo(RandomUniqueKeyFn()))

  return keyed_elements | ProcessKeyedElements(
      sdf,
      element_coder,
      restriction_coder,
      pcoll.windowing,
      self._ptransform.args,
      self._ptransform.kwargs,
      self._ptransform.side_inputs)
def expand(self, pcoll):
  input_coder = coders.registry.get_coder(pcoll)
  return pcoll | ParDo(
      _pardo_group_into_batches(
          input_coder,
          self.params.batch_size,
          self.params.max_buffering_duration_secs,
          self.clock))
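A minimal usage sketch, assuming this is the expansion of apache_beam's GroupIntoBatches; the input must be keyed, and batches are emitted per key and window:

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([('a', 1), ('a', 2), ('a', 3), ('b', 4)])
      | beam.GroupIntoBatches(batch_size=2)
      | beam.Map(print))  # e.g. ('a', [1, 2]), ('a', [3]), ('b', [4])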
def expand(self, pcoll):
  # Imported here to avoid circular dependencies.
  # pylint: disable=wrong-import-order, wrong-import-position
  from apache_beam.coders import typecoders

  input_type = pcoll.element_type
  if input_type is not None:
    # Initialize type-hints used below to enforce type-checking and to
    # pass downstream to further PTransforms.
    key_type, value_type = trivial_inference.key_value_types(input_type)
    # Enforce that the input to a GBK has a KV element type.
    pcoll.element_type = typehints.typehints.coerce_to_kv_type(
        pcoll.element_type)
    typecoders.registry.verify_deterministic(
        typecoders.registry.get_coder(key_type),
        'GroupByKey operation "%s"' % self.label)

    reify_output_type = typehints.KV[
        key_type, typehints.WindowedValue[value_type]]  # type: ignore[misc]
    gbk_input_type = (
        typehints.KV[
            key_type,
            typehints.Iterable[typehints.WindowedValue[  # type: ignore[misc]
                value_type]]])
    gbk_output_type = typehints.KV[key_type, typehints.Iterable[value_type]]

    # pylint: disable=bad-continuation
    return (
        pcoll
        | 'ReifyWindows' >> (
            ParDo(beam.GroupByKey.ReifyWindows()).with_output_types(
                reify_output_type))
        | 'GroupByKey' >> (
            _GroupByKeyOnly().with_input_types(
                reify_output_type).with_output_types(gbk_input_type))
        | (
            'GroupByWindow' >>
            _GroupAlsoByWindow(pcoll.windowing).with_input_types(
                gbk_input_type).with_output_types(gbk_output_type)))
  else:
    # The input_type is None; run the default, untyped path.
    return (
        pcoll
        | 'ReifyWindows' >> ParDo(beam.GroupByKey.ReifyWindows())
        | 'GroupByKey' >> _GroupByKeyOnly()
        | 'GroupByWindow' >> _GroupAlsoByWindow(pcoll.windowing))
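A minimal usage sketch of the GroupByKey expansion above; grouped iterables are unordered, so they are sorted here before printing:

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([('k', 1), ('k', 2), ('j', 3)])
      | beam.GroupByKey()
      | beam.MapTuple(lambda k, vs: (k, sorted(vs)))
      | beam.Map(print))  # ('j', [3]) and ('k', [1, 2])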
def expand(self, pcoll):
  if reify_windows:
    pcoll = pcoll | ParDo(ReifyTimestampWindow())

  keyed_singleton = pcoll.pipeline | Create([(None, None)])
  keyed_actual = (
      pcoll
      | WindowInto(custom_windowing or window.GlobalWindows())
      | "ToVoidKey" >> Map(lambda v: (None, v)))
  plain_actual = (
      (keyed_singleton, keyed_actual)
      | "Group" >> CoGroupByKey()
      | "Unkey" >> Map(lambda k_values: k_values[1][1]))

  if custom_windowing:
    plain_actual = plain_actual | "AddWindow" >> ParDo(AddWindow())

  plain_actual = plain_actual | "Match" >> Map(matcher)
def expand(self, pcoll):
  sdf = self._ptransform.fn
  signature = DoFnSignature(sdf)
  invoker = DoFnInvoker.create_invoker(signature, process_invocation=False)
  element_coder = typecoders.registry.get_coder(pcoll.element_type)
  restriction_coder = invoker.invoke_restriction_coder()

  keyed_elements = (
      pcoll
      | 'pair' >> ParDo(PairWithRestrictionFn(sdf))
      | 'split' >> ParDo(SplitRestrictionFn(sdf))
      | 'explode' >> ParDo(ExplodeWindowsFn())
      | 'random' >> ParDo(RandomUniqueKeyFn()))

  return keyed_elements | ProcessKeyedElements(
      sdf,
      element_coder,
      restriction_coder,
      pcoll.windowing,
      self._ptransform.args,
      self._ptransform.kwargs)
def inner(fn):
  sentry_init(default_integrations=False, integrations=[BeamIntegration()])
  # Little hack to avoid having to run the whole pipeline.
  pardo = ParDo(fn)
  signature = pardo._signature
  output_processor = _OutputProcessor()
  return DoFnInvoker.create_invoker(
      signature, output_processor, DoFnContext("test"))
def test_fixed_windows_simple_watermark(self):
  def tsv(key, value, ts):
    return TimestampedValue((key, value), timestamp=ts)

  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([tsv('k1', 1, 0), tsv('k2', 1, 0),
                     tsv('k1', 2, 0), tsv('k2', 2, 0)])
      .add_elements([tsv('k1', 3, 0), tsv('k2', 3, 0)])
      .add_elements([tsv('k1', 4, 1), tsv('k2', 4, 1)])
      .add_elements([tsv('k1', 5, 1), tsv('k2', 5, 1)])
      .advance_watermark_to(1)
      .add_elements([tsv('k1', 6, 0)])
      .advance_watermark_to_infinity())
  # yapf: enable

  # Fixed, one-second windows with DefaultTrigger (after watermark)
  windowing = Windowing(
      FixedWindows(1), allowed_lateness=MAX_TIMESTAMP.seconds())
  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
            ('k2', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
            ('k1', IntervalWindow(1, 2), [4, 5]),  # On the watermark
            ('k2', IntervalWindow(1, 2), [4, 5]),  # On the watermark
            ('k1', IntervalWindow(0, 1), [6]),  # After the watermark
        ]))
def test_sessions_and_complex_trigger_accumulating(self):
  def tsv(key, value, ts):
    return TimestampedValue((key, value), timestamp=ts)

  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([tsv('k1', 1, 1), tsv('k1', 2, 15),
                     tsv('k1', 3, 7), tsv('k1', 4, 30)])
      .advance_watermark_to(50)
      .add_elements([tsv('k1', -3, 1), tsv('k1', -2, 2)])
      .add_elements([tsv('k1', -1, 21)])
      .advance_watermark_to_infinity())
  # yapf: enable

  # Session windows with a 10-second gap and an early/late-firing
  # AfterWatermark trigger, accumulating fired panes.
  windowing = Windowing(
      Sessions(10),
      triggerfn=AfterWatermark(early=AfterCount(2), late=AfterCount(1)),
      accumulation_mode=AccumulationMode.ACCUMULATING,
      allowed_lateness=MAX_TIMESTAMP.seconds())
  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], set(v.value for v in elm[1]))))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # early
            ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # on time
            ('k1', IntervalWindow(30, 40), {4}),  # on time
            ('k1', IntervalWindow(1, 25), {1, 2, 3, -3, -2}),  # late
            ('k1', IntervalWindow(1, 40), {1, 2, 3, 4, -3, -2, -1}),  # late
        ]))
def test_sliding_windows_simple_watermark(self):
  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([('k1', 1), ('k2', 1), ('k1', 1), ('k2', 1)])
      .add_elements([('k1', 1), ('k2', 1)])
      .advance_watermark_to(1)
      .add_elements([('k1', 2), ('k2', 2)])
      .add_elements([('k1', 2), ('k2', 2)])
      .advance_watermark_to(2)
      .add_elements([('k1', 3), ('k2', 3)])
      .add_elements([('k1', 3), ('k2', 3)])
      .advance_watermark_to_infinity())
  # yapf: enable

  # Sliding windows of two seconds, sliding every second, with the
  # DefaultTrigger (after watermark).
  windowing = Windowing(SlidingWindows(2, 1))
  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(-1, 1), [1, 1, 1]),
            ('k2', IntervalWindow(-1, 1), [1, 1, 1]),
            ('k1', IntervalWindow(0, 2), [1, 1, 1, 2, 2]),
            ('k2', IntervalWindow(0, 2), [1, 1, 1, 2, 2]),
            ('k1', IntervalWindow(1, 3), [2, 2, 3, 3]),
            ('k2', IntervalWindow(1, 3), [2, 2, 3, 3]),
            ('k1', IntervalWindow(2, 4), [3, 3]),
            ('k2', IntervalWindow(2, 4), [3, 3]),
        ]))
def expand(self, pcoll):
  if reify_windows:
    pcoll = pcoll | ParDo(ReifyTimestampWindow())

  keyed_singleton = pcoll.pipeline | Create([(None, None)])

  if use_global_window:
    pcoll = pcoll | WindowInto(window.GlobalWindows())

  keyed_actual = pcoll | "ToVoidKey" >> Map(lambda v: (None, v))

  # This is a CoGroupByKey so that the matcher always runs, even if the
  # PCollection is empty.
  plain_actual = (
      (keyed_singleton, keyed_actual)
      | "Group" >> CoGroupByKey()
      | "Unkey" >> Map(lambda k_values: k_values[1][1]))

  if not use_global_window:
    plain_actual = plain_actual | "AddWindow" >> ParDo(AddWindow())

  plain_actual = plain_actual | "Match" >> Map(matcher)
def test_with_trigger_window_that_finish(self):
  def tsv(key, value, ts):
    return TimestampedValue((key, value), timestamp=ts)

  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([tsv('k1', 1, 0), tsv('k1', 2, 0)])
      .add_elements([tsv('k1', 3, 0)])
      .advance_watermark_to(2)
      .add_elements([tsv('k1', 6, 0)])  # This value is discarded.
      .advance_watermark_to_infinity())
  # yapf: enable

  # Fixed, one-second windows with an AfterWatermark trigger and zero
  # allowed lateness, so the window finishes and late data is dropped.
  windowing = Windowing(
      FixedWindows(1),
      triggerfn=AfterWatermark(),
      allowed_lateness=0,
      accumulation_mode=AccumulationMode.DISCARDING)
  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
        ]))
def expand(self, pcoll):
  if reify_windows:
    pcoll = pcoll | ParDo(ReifyTimestampWindow())

  # We must have at least a single element to ensure the matcher
  # code gets run even if the input pcollection is empty.
  keyed_singleton = pcoll.pipeline | Create([(None, None)])
  keyed_actual = (
      pcoll
      | WindowInto(window.GlobalWindows())
      | "ToVoidKey" >> Map(lambda v: (None, v)))
  _ = (
      (keyed_singleton, keyed_actual)
      | "Group" >> CoGroupByKey()
      | "Unkey" >> Map(lambda k___actual_values: k___actual_values[1][1])
      | "Match" >> Map(matcher))
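A minimal usage sketch of the matcher plumbing above, via the public assert_that / equal_to helpers from apache_beam.testing.util:

import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to

with beam.Pipeline() as p:
  pcoll = p | beam.Create([1, 2, 3])
  # Internally this keys everything to None and CoGroupByKeys with a
  # singleton, so the matcher runs even when the input is empty.
  assert_that(pcoll, equal_to([1, 2, 3]))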
def expand(self, pcoll):
  return pcoll | ParDo(self.add_timestamp_info)
def expand(self, pcoll):
  return pcoll | ParDo(self.add_window_info)
def expand(self, pcoll):
  input_coder = coders.registry.get_coder(pcoll)
  return pcoll | ParDo(
      _pardo_group_into_batches(self.batch_size, input_coder))
def run(argv=None): """Build and run the pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--output_topic', required=True, help=('Output PubSub topic of the form ' '"projects/<PROJECT>/topic/<TOPIC>".')) group = parser.add_mutually_exclusive_group(required=True) group.add_argument( '--input_topic', help=('Input PubSub topic of the form ' '"projects/<PROJECT>/topics/<TOPIC>".')) group.add_argument( '--input_subscription', help=('Input PubSub subscription of the form ' '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."')) known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True pipeline_options.view_as(StandardOptions).streaming = True with beam.Pipeline(options=pipeline_options) as p: # Read from PubSub into a PCollection. if known_args.input_subscription: lines = p | beam.io.ReadFromPubSub( subscription=known_args.input_subscription) else: lines = p | beam.io.ReadFromPubSub(topic=known_args.input_topic) # Count the occurrences of each word. def count_ones(word_ones): (word, ones) = word_ones return (word, sum(ones)) counts = (lines | 'AddTimestampFn' >> beam.ParDo(AddTimestampFn()) | 'After AddTimestampFn' >> ParDo(PrintFn('After AddTimestampFn')) | 'Split' >> (beam.ParDo(WordExtractingDoFn()) .with_output_types(unicode)) | 'PairWithOne' >> beam.Map(lambda x: (x, 1)) | beam.WindowInto(window.FixedWindows(5, 0)) | 'GroupByKey' >> beam.GroupByKey() | 'CountOnes' >> beam.Map(count_ones)) # Format the counts into a PCollection of strings. def format_result(word_count): (word, count) = word_count return '%s: %d' % (word, count) output = counts | 'format' >> beam.Map(format_result) # Write to PubSub. # pylint: disable=expression-not-assigned output | beam.io.WriteStringsToPubSub(known_args.output_topic) def check_gbk_format(): # A matcher that checks that the output of GBK is of the form word: count. def matcher(elements): # pylint: disable=unused-variable actual_elements_in_window, window = elements for elm in actual_elements_in_window: assert re.match(r'\S+:\s+\d+', elm) is not None return matcher # Check that the format of the output is correct. assert_that( output, check_gbk_format(), use_global_window=False, label='Assert word:count format.') # Check also that elements are ouput in the right window. # This expects exactly 1 occurrence of any subset of the elements # 150, 151, 152, 153, 154 in the window [150, 155) # or exactly 1 occurrence of any subset of the elements # 210, 211, 212, 213, 214 in the window [210, 215). expected_window_to_elements = { window.IntervalWindow(150, 155): [ ('150: 1'), ('151: 1'), ('152: 1'), ('153: 1'), ('154: 1'), ], window.IntervalWindow(210, 215): [ ('210: 1'), ('211: 1'), ('212: 1'), ('213: 1'), ('214: 1'), ], } # To pass, publish numbers in [150-155) or [210-215) with no repeats. # To fail, publish a repeated number in the range above range. # For example: '210 213 151 213' assert_that( output, equal_to_per_window(expected_window_to_elements), use_global_window=False, label='Assert correct streaming windowing.')
def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default=default_input,
      help='Input file to process.')
  parser.add_argument(
      '--table',
      dest='table',
      default=default_table,
      help='Table to upload.')
  parser.add_argument(
      '--dataset',
      dest='dataset',
      default=default_dataset,
      help='Dataset where the table is stored. Needs to exist beforehand.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_args.extend(['--project={}'.format(project)])
  pipeline_options = PipelineOptions(pipeline_args)

  with beam.Pipeline(options=pipeline_options) as p:

    def QuestionAPI(tags):
      import datetime
      import pandas as pd
      import requests

      # Calculate the Unix timestamps for today and yesterday.
      today = datetime.date.today() - datetime.date(1970, 1, 1)
      yesterday = today - datetime.timedelta(1)
      from_date = int(yesterday.total_seconds())
      to_date = int(today.total_seconds())

      logging.info('Calling API for tag: "{}"'.format(tags))
      api_url = (
          'http://api.stackexchange.com/2.2/search/advanced'
          '?fromdate={0}&todate={1}&order=desc&sort=activity'
          '&tagged={2}&site=stackoverflow').format(from_date, to_date, tags)
      api_call = requests.get(api_url)
      # Parse the JSON response (safer than eval on the raw content).
      api_call_dict = api_call.json()

      # Create a DataFrame to simplify data processing.
      try:
        so_api_call_DF = pd.DataFrame(api_call_dict['items'])
        if so_api_call_DF.empty:
          logging.info('Tag "{}" does not have questions'.format(tags))
          return []
        else:
          so_api_call_DF = so_api_call_DF[[
              'creation_date',
              'question_id',
              'title',
              'link',
              'tags',
              'is_answered'
          ]]
          # Fix the tags field: flatten the list into a comma-separated
          # string.
          so_api_call_DF['tags'] = so_api_call_DF.tags.apply(
              lambda x: ', '.join(x).replace('[', '').replace(']', ''))
          so_api_call_dict = so_api_call_DF.to_dict('records')
          return so_api_call_dict
      except Exception:
        logging.warning(
            'Unexpected API request output: \n {}'.format(api_call_dict))
        return []

    schema = (
        'creation_date:timestamp,question_id:integer,is_answered:boolean,'
        'title:string,tags:string,link:string')

    api_call = (
        p
        | 'read' >> ReadFromText(known_args.input)
        | 'API call for each Tag' >> ParDo(QuestionAPI)
        | 'Writing to BQ' >> WriteToBigQuery(
            table=known_args.table,
            dataset=known_args.dataset,
            project=project,
            schema=schema,
            create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=BigQueryDisposition.WRITE_APPEND))