def test_flatmap_builtin(self):
    """Exercise FlatMap with a lambda and with the builtin ``set`` callable."""
    with TestPipeline() as p:
        source = p | 'label1' >> Create([1, 2, 3])
        assert_that(source, equal_to([1, 2, 3]))

        # Each element becomes a one-item list, which FlatMap flattens back
        # into individual elements.
        shifted = source | 'do' >> FlatMap(lambda x: [x + 10])
        assert_that(shifted, equal_to([11, 12, 13]), label='pcoll2')

        # Map (unlike FlatMap) keeps the produced lists intact.
        paired = shifted | 'm1' >> Map(lambda x: [x, 12])
        expected_pairs = [[11, 12], [12, 12], [13, 12]]
        assert_that(paired, equal_to(expected_pairs), label='pcoll3')

        # Passing ``set`` de-duplicates within each list before flattening,
        # so [12, 12] contributes a single 12.
        flattened = paired | 'do2' >> FlatMap(set)
        assert_that(
            flattened, equal_to([11, 12, 12, 12, 13]), label='pcoll4')
def expand(self, p):
    """Run the PreProcessing text-cleaning steps over ``p``, one after another.

    The final step additionally receives ``self.tok`` as an extra argument.
    """
    no_html = p | 'clean from HTML' >> Map(PreProcessing.clean_html)
    no_mentions = no_html | 'remove mentions and links' >> Map(
        PreProcessing.remove_mentions_and_links)
    lowered = no_mentions | 'lowercase' >> Map(PreProcessing.make_lower)
    no_negations = lowered | 'remove negations' >> Map(
        PreProcessing.remove_negations)
    letters = no_negations | 'letter only' >> Map(PreProcessing.letter_only)
    return letters | 'remove small words' >> Map(
        PreProcessing.remove_small_words, self.tok)
def test_window_param(self):
    """Check that a NewDoFn receives its window through ``WindowParam``."""

    class TestDoFn(NewDoFn):
        def process(self, element, window=NewDoFn.WindowParam):
            bounds = (float(window.start), float(window.end))
            yield (element, bounds)

    pipeline = TestPipeline()
    # Each value doubles as its own timestamp.
    timestamped = (
        pipeline
        | Create([1, 7])
        | Map(lambda x: TimestampedValue(x, x)))
    windowed = timestamped | WindowInto(windowfn=SlidingWindows(10, 5))
    pcoll = windowed | ParDo(TestDoFn())

    expected = [
        (1, (-5, 5)),
        (1, (0, 10)),
        (7, (0, 10)),
        (7, (5, 15)),
    ]
    assert_that(pcoll, equal_to(expected))
    pipeline.run()
def test_rewindow(self):
    """Rewindowing must keep the per-window duplicates made by sliding windows."""
    p = TestPipeline()
    result = (
        p
        | Create([(k, k) for k in range(10)])
        # Index into the (value, timestamp) pair instead of declaring a tuple
        # parameter: ``lambda (x, t): ...`` is a SyntaxError on Python 3
        # (tuple parameter unpacking was removed by PEP 3113).
        | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
        | 'window' >> WindowInto(SlidingWindows(period=2, size=6))
        # Per the model, each element is now duplicated across
        # three windows. Rewindowing must preserve this duplication.
        | 'rewindow' >> WindowInto(FixedWindows(5))
        | 'rewindow2' >> WindowInto(FixedWindows(5))
        | Map(lambda v: ('key', v))
        | GroupByKey())
    assert_that(
        result,
        equal_to([
            ('key', sorted([0, 1, 2, 3, 4] * 3)),
            ('key', sorted([5, 6, 7, 8, 9] * 3)),
        ]))
    p.run()
def test_incomparable_default(self):
    """A lambda default that cannot be compared or hashed must still work."""

    class IncomparableType(object):
        """Raises on every equality, inequality and hash attempt."""

        def __eq__(self, other):
            raise RuntimeError()

        def __ne__(self, other):
            raise RuntimeError()

        def __hash__(self):
            raise RuntimeError()

    # Ensure that we don't use default values in a context where they must be
    # comparable (see BEAM-8301).
    incomparable = IncomparableType()
    with TestPipeline() as pipeline:
        source = pipeline | beam.Create([None])
        pcoll = source | Map(
            lambda e, x=incomparable: (e, type(x).__name__))
        expected = [(None, 'IncomparableType')]
        assert_that(pcoll, equal_to(expected))
def test_window_param(self):
    """Check ``DoFn.WindowParam`` on a first ParDo and on a chained second one."""

    class TestDoFn(DoFn):
        def process(self, element, window=DoFn.WindowParam):
            bounds = (float(window.start), float(window.end))
            yield (element, bounds)

    with TestPipeline() as pipeline:
        # Each value doubles as its own timestamp.
        timestamped = (
            pipeline
            | Create([1, 7])
            | Map(lambda x: TimestampedValue(x, x)))
        windowed = timestamped | WindowInto(windowfn=SlidingWindows(10, 5))
        pcoll = windowed | ParDo(TestDoFn())

        first_pass = [
            (1, (-5, 5)),
            (1, (0, 10)),
            (7, (0, 10)),
            (7, (5, 15)),
        ]
        assert_that(pcoll, equal_to(first_pass))

        # Applying the same DoFn again pairs every first-pass output with
        # the same window bounds it already carries.
        pcoll2 = pcoll | 'Again' >> ParDo(TestDoFn())
        second_pass = [
            ((1, (-5, 5)), (-5, 5)),
            ((1, (0, 10)), (0, 10)),
            ((7, (0, 10)), (0, 10)),
            ((7, (5, 15)), (5, 15)),
        ]
        assert_that(pcoll2, equal_to(second_pass), label='doubled windows')
def expand(self, pcoll):
    """Add ``self.suffix`` onto every element of ``pcoll``."""
    append_suffix = Map(lambda x: x + self.suffix)
    return pcoll | append_suffix
def expand(self, pcoll):
    """Map elements through ``self._mutation_fn`` and pass them to the write DoFn.

    The write step is a ParDo over ``_Mutate.DatastoreWriteFn`` constructed
    with ``self._project``.
    """
    mutations = pcoll | 'Convert to Mutation' >> Map(self._mutation_fn)
    write_fn = _Mutate.DatastoreWriteFn(self._project)
    return mutations | 'Write Mutation to Datastore' >> ParDo(write_fn)
def timestamped_key_values(self, pipeline, key, *timestamps):
    """Create one ``(key, t)`` WindowedValue per timestamp ``t``.

    Each value uses ``t`` as its timestamp and is placed in a single
    GlobalWindow.
    """
    stamps = pipeline | 'start' >> Create(timestamps)
    return stamps | Map(
        lambda x: WindowedValue((key, x), x, [GlobalWindow()]))
def test_eager_pipeline(self):
    """The 'EagerRunner' pipeline result compares equal to a plain list."""
    p = Pipeline('EagerRunner')
    squares = p | Create([1, 2, 3]) | Map(lambda x: x * x)
    self.assertEqual([1, 4, 9], squares)
def test_multi_triggered_gbk_side_input(self):
    """Test a GBK sideinput, with multiple triggering."""
    # TODO(BEAM-9322): Remove use of this experiment.
    # This flag is only necessary when using the multi-output TestStream b/c
    # it relies on using the PCollection output tags as the PCollection output
    # ids.
    options = StandardOptions(streaming=True)
    debug_options = options.view_as(DebugOptions)
    debug_options.add_experiment('passthrough_pcollection_output_ids')

    p = TestPipeline(options=options)

    # Script the stream: two 'main' elements at watermarks 3 and 8, then two
    # timestamped 'side' elements, then both watermarks advance to infinity.
    events = TestStream()
    events = events.advance_watermark_to(3, tag='main')
    events = events.add_elements(['a1'], tag='main')
    events = events.advance_watermark_to(8, tag='main')
    events = events.add_elements(['a2'], tag='main')
    events = events.add_elements(
        [window.TimestampedValue(('k', 100), 2)], tag='side')
    events = events.add_elements(
        [window.TimestampedValue(('k', 400), 7)], tag='side')
    events = events.advance_watermark_to_infinity(tag='main')
    events = events.advance_watermark_to_infinity(tag='side')
    test_stream = p | 'Mixed TestStream' >> events

    main_data = test_stream['main'] | 'Main windowInto' >> beam.WindowInto(
        window.FixedWindows(5),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)

    side_windowed = test_stream['side'] | 'Side windowInto' >> beam.WindowInto(
        window.FixedWindows(5),
        trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
    side_sums = side_windowed | beam.CombinePerKey(sum)
    # Drop the key and keep only the combined value.
    side_data = side_sums | 'Values' >> Map(lambda k_vs: k_vs[1])

    class RecordFn(beam.DoFn):
        """Emits each element with its timestamp and the side input it saw."""

        def process(
                self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
            yield (elm, ts, side)

    side_view = beam.pvalue.AsList(side_data)
    records = main_data | beam.ParDo(RecordFn(), side_view)

    first_window = window.IntervalWindow(0, 5)
    second_window = window.IntervalWindow(5, 10)
    expected_window_to_elements = {
        first_window: [('a1', Timestamp(3), [100, 0])],
        second_window: [('a2', Timestamp(8), [400, 0])],
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        use_global_window=False,
        label='assert per window')

    p.run()
def expand(self, pcoll):
    """Convert elements with ``_message_to_proto_str`` and apply ``self._source``.

    The intermediate collection is marked as ``bytes`` before
    ``self._source`` is applied.
    """
    serialized = pcoll | Map(WriteToPubSubLite._message_to_proto_str)
    serialized.element_type = bytes
    return serialized | self._source
def expand(self, pvalue):
    """Read from ``self._source`` and decode each element as UTF-8 text."""
    raw = pvalue.pipeline | Read(self._source)
    raw.element_type = bytes

    decoded = raw | 'DecodeString' >> Map(lambda b: b.decode('utf-8'))
    decoded.element_type = unicode
    return decoded
from apache_beam.transforms.window import IntervalWindow
from apache_beam.transforms.window import Sessions
from apache_beam.transforms.window import SlidingWindows
from apache_beam.transforms.window import TimestampCombiner
from apache_beam.transforms.window import TimestampedValue
from apache_beam.transforms.window import WindowedValue
from apache_beam.transforms.window import WindowFn
from apache_beam.utils.timestamp import MAX_TIMESTAMP
from apache_beam.utils.timestamp import MIN_TIMESTAMP


def context(element, timestamp):
    """Build a ``WindowFn.AssignContext`` from an element and its timestamp.

    Note the argument order is swapped when forwarding: the timestamp is
    passed first, the element second.
    """
    assign_context = WindowFn.AssignContext(timestamp, element)
    return assign_context


# Replaces the values of each (key, values) pair with a sorted list.
sort_values = Map(lambda k_vs: (k_vs[0], sorted(k_vs[1])))


class ReifyWindowsFn(core.DoFn):
    """Folds the current window into the key of each (key, values) element."""

    def process(self, element, window=core.DoFn.WindowParam):
        k, vs = element
        windowed_key = "%s @ %s" % (k, window)
        yield windowed_key, vs


reify_windows = core.ParDo(ReifyWindowsFn())


class WindowTest(unittest.TestCase):

    def test_timestamped_value_cmp(self):
        """Two TimestampedValues with equal value and timestamp are equal."""
        left = TimestampedValue('a', 2)
        right = TimestampedValue('a', 2)
        self.assertEqual(left, right)
def expand(self, pvalue):
    """Read via ``ReadFromPubSub`` and decode each element as UTF-8 text.

    ``ReadFromPubSub`` is given ``self.topic``, ``self.subscription`` and
    ``self.id_label``.
    """
    messages = pvalue.pipeline | ReadFromPubSub(
        self.topic, self.subscription, self.id_label)
    decoded = messages | 'DecodeString' >> Map(lambda b: b.decode('utf-8'))
    decoded.element_type = basestring
    return decoded
def expand(self, pcoll):
    """Encode each element as UTF-8 and write the result to ``self._sink``."""
    encoded = pcoll | 'EncodeString' >> Map(lambda s: s.encode('utf-8'))
    encoded.element_type = bytes
    return encoded | Write(self._sink)
def expand(self, pvalue):
    """Apply ``self._source`` and deserialize elements into SequencedMessages."""
    raw = pvalue.pipeline | self._source
    raw.element_type = bytes

    messages = raw | Map(pubsublite.SequencedMessage.deserialize)
    messages.element_type = pubsublite.SequencedMessage
    return messages