def test_timestamped_with_combiners(self):
  p = TestPipeline()
  result = (p
            # Create some initial test values.
            | 'start' >> Create([(k, k) for k in range(10)])
            # The purpose of the WindowInto transform is to establish a
            # FixedWindows windowing function for the PCollection.
            # It does not bucket elements into windows since the timestamps
            # from Create are not spaced 5 ms apart and very likely they all
            # fall into the same window.
            | 'w' >> WindowInto(FixedWindows(5))
            # Generate timestamped values using the values as timestamps.
            # Now there are values 5 ms apart and since Map propagates the
            # windowing function from input to output the output PCollection
            # will have elements falling into different 5ms windows.
            | Map(lambda (x, t): TimestampedValue(x, t))
            # We add a 'key' to each value representing the index of the
            # window. This is important since there is no guarantee of
            # order for the elements of a PCollection.
            | Map(lambda v: (v / 5, v)))

  # Sum all elements associated with a key and window. Although it
  # is called CombinePerKey it is really CombinePerKeyAndWindow the
  # same way GroupByKey is really GroupByKeyAndWindow.
  sum_per_window = result | CombinePerKey(sum)
  # Compute mean per key and window.
  mean_per_window = result | combiners.Mean.PerKey()

  assert_that(sum_per_window, equal_to([(0, 10), (1, 35)]),
              label='assert:sum')
  assert_that(mean_per_window, equal_to([(0, 2.0), (1, 7.0)]),
              label='assert:mean')
  p.run()
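# A minimal standalone sketch (not part of the test above, and not Beam code)
# of how the asserted values arise, assuming elements 0..9 fall into two 5 ms
# FixedWindows and are keyed by window index. The test relies on Python 2
# integer division (v / 5); the hypothetical helper below uses // explicitly.
def _expected_per_window_aggregates():
  groups = {}
  for v in range(10):
    groups.setdefault(v // 5, []).append(v)
  sums = {k: sum(vs) for k, vs in groups.items()}                     # {0: 10, 1: 35}
  means = {k: sum(vs) / float(len(vs)) for k, vs in groups.items()}   # {0: 2.0, 1: 7.0}
  return sums, means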
def test_create(self):
  pipeline = TestPipeline()
  pcoll = pipeline | 'label1' >> Create([1, 2, 3])
  assert_that(pcoll, equal_to([1, 2, 3]))

  # Test if initial value is an iterator object.
  pcoll2 = pipeline | 'label2' >> Create(iter((4, 5, 6)))
  pcoll3 = pcoll2 | 'do' >> FlatMap(lambda x: [x + 10])
  assert_that(pcoll3, equal_to([14, 15, 16]), label='pcoll3')
  pipeline.run()
def test_reuse_cloned_custom_transform_instance(self):
  pipeline = TestPipeline()
  pcoll1 = pipeline | 'pc1' >> Create([1, 2, 3])
  pcoll2 = pipeline | 'pc2' >> Create([4, 5, 6])
  transform = PipelineTest.CustomTransform()
  result1 = pcoll1 | transform
  result2 = pcoll2 | 'new_label' >> transform
  assert_that(result1, equal_to([2, 3, 4]), label='r1')
  assert_that(result2, equal_to([5, 6, 7]), label='r2')
  pipeline.run()
def test_pardo_side_outputs(self):
  def tee(elem, *tags):
    for tag in tags:
      if tag in elem:
        yield beam.pvalue.TaggedOutput(tag, elem)

  with self.create_pipeline() as p:
    xy = (p
          | 'Create' >> beam.Create(['x', 'y', 'xy'])
          | beam.FlatMap(tee, 'x', 'y').with_outputs())
    assert_that(xy.x, equal_to(['x', 'xy']), label='x')
    assert_that(xy.y, equal_to(['y', 'xy']), label='y')
def test_combine_globally_with_default_side_input(self):
  class CombineWithSideInput(PTransform):
    def expand(self, pcoll):
      side = pcoll | CombineGlobally(sum).as_singleton_view()
      main = pcoll.pipeline | Create([None])
      return main | Map(lambda _, s: s, side)

  p = TestPipeline()
  result1 = p | 'i1' >> Create([]) | 'c1' >> CombineWithSideInput()
  result2 = p | 'i2' >> Create([1, 2, 3, 4]) | 'c2' >> CombineWithSideInput()
  assert_that(result1, equal_to([0]), label='r1')
  assert_that(result2, equal_to([10]), label='r2')
  p.run()
def test_flatmap_builtin(self):
  pipeline = TestPipeline()
  pcoll = pipeline | 'label1' >> Create([1, 2, 3])
  assert_that(pcoll, equal_to([1, 2, 3]))

  pcoll2 = pcoll | 'do' >> FlatMap(lambda x: [x + 10])
  assert_that(pcoll2, equal_to([11, 12, 13]), label='pcoll2')

  pcoll3 = pcoll2 | 'm1' >> Map(lambda x: [x, 12])
  assert_that(pcoll3, equal_to([[11, 12], [12, 12], [13, 12]]),
              label='pcoll3')

  pcoll4 = pcoll3 | 'do2' >> FlatMap(set)
  assert_that(pcoll4, equal_to([11, 12, 12, 12, 13]), label='pcoll4')
  pipeline.run()
def test_group_by_key(self):
  with self.create_pipeline() as p:
    res = (p
           | beam.Create([('a', 1), ('a', 2), ('b', 3)])
           | beam.GroupByKey()
           | beam.Map(lambda (k, vs): (k, sorted(vs))))
    assert_that(res, equal_to([('a', [1, 2]), ('b', [3])]))
def test_run_direct(self):
  file_name = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')
  pipeline = TestPipeline()
  pcoll = pipeline | beam.io.Read(LineSource(file_name))
  assert_that(pcoll, equal_to(['aaaa', 'bbbb', 'cccc', 'dddd']))
  pipeline.run()
def test_read(self):
  with tempfile.NamedTemporaryFile() as temp_file:
    temp_file.write('a\nb\nc')
    temp_file.flush()
    with self.create_pipeline() as p:
      assert_that(p | beam.io.ReadFromText(temp_file.name),
                  equal_to(['a', 'b', 'c']))
def test_pardo(self):
  with self.create_pipeline() as p:
    res = (p
           | beam.Create(['a', 'bc'])
           | beam.Map(lambda e: e * 2)
           | beam.Map(lambda e: e + 'x'))
    assert_that(res, equal_to(['aax', 'bcbcx']))
def run_pipeline(self, count_implementation, factor=1):
  p = TestPipeline()
  words = p | beam.Create(['CAT', 'DOG', 'CAT', 'CAT', 'DOG'])
  result = words | count_implementation
  assert_that(
      result, equal_to([('CAT', (3 * factor)), ('DOG', (2 * factor))]))
  p.run()
def test_dataflow_single_file(self):
  file_name, expected_data = write_data(5)
  assert len(expected_data) == 5
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(file_name)
  assert_that(pcoll, equal_to(expected_data))
  pipeline.run()
def test_read_gzip_empty_file(self):
  file_name = self._create_temp_file()
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(
      file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
  assert_that(pcoll, equal_to([]))
  pipeline.run()
def test_dataflow_file_pattern(self):
  pattern, expected_data = write_pattern([5, 3, 12, 8, 8, 4])
  assert len(expected_data) == 40
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(pattern)
  assert_that(pcoll, equal_to(expected_data))
  pipeline.run()
def test_top_shorthands(self):
  pipeline = TestPipeline()

  pcoll = pipeline | 'start' >> Create([6, 3, 1, 1, 9, 1, 5, 2, 0, 6])
  result_top = pcoll | 'top' >> beam.CombineGlobally(combine.Largest(5))
  result_bot = pcoll | 'bot' >> beam.CombineGlobally(combine.Smallest(4))
  assert_that(result_top, equal_to([[9, 6, 6, 5, 3]]), label='assert:top')
  assert_that(result_bot, equal_to([[0, 1, 1, 1]]), label='assert:bot')

  pcoll = pipeline | 'start-perkey' >> Create(
      [('a', x) for x in [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]])
  result_ktop = pcoll | 'top-perkey' >> beam.CombinePerKey(combine.Largest(5))
  result_kbot = pcoll | 'bot-perkey' >> beam.CombinePerKey(
      combine.Smallest(4))
  assert_that(result_ktop, equal_to([('a', [9, 6, 6, 5, 3])]), label='k:top')
  assert_that(result_kbot, equal_to([('a', [0, 1, 1, 1])]), label='k:bot')
  pipeline.run()
def test_compute_points(self):
  p = TestPipeline()
  records = p | 'create' >> beam.Create(self.SAMPLE_RECORDS)
  result = (records
            | 'points' >> beam.FlatMap(coders.compute_points)
            | beam.CombinePerKey(sum))
  assert_that(result,
              equal_to([('Italy', 0), ('Brasil', 6), ('Germany', 3)]))
  p.run()
def test_default_value_singleton_side_input(self):
  pipeline = self.create_pipeline()
  pcol = pipeline | 'start' >> beam.Create([1, 2])
  side = pipeline | 'side' >> beam.Create([])  # 0 values in side input.
  result = pcol | beam.FlatMap(
      lambda x, s: [x * s], beam.pvalue.AsSingleton(side, 10))
  assert_that(result, equal_to([10, 20]))
  pipeline.run()
def test_read_gzip_empty_file(self):
  filename = tempfile.NamedTemporaryFile(
      delete=False, prefix=tempfile.template).name
  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(
      filename, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
  assert_that(pcoll, equal_to([]))
  pipeline.run()
def test_windowing(self):
  with self.create_pipeline() as p:
    res = (p
           | beam.Create([1, 2, 100, 101, 102])
           | beam.Map(lambda t: TimestampedValue(('k', t), t))
           | beam.WindowInto(beam.transforms.window.Sessions(10))
           | beam.GroupByKey()
           | beam.Map(lambda (k, vs): (k, sorted(vs))))
    assert_that(res, equal_to([('k', [1, 2]), ('k', [100, 101, 102])]))
def test_element(self):
  class TestDoFn(DoFn):
    def process(self, element):
      yield element + 10

  pipeline = TestPipeline()
  pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
  assert_that(pcoll, equal_to([11, 12]))
  pipeline.run()
def test_tuple_combine_fn(self):
  p = TestPipeline()
  result = (p
            | Create([('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)])
            | beam.CombineGlobally(combine.TupleCombineFn(
                max, combine.MeanCombineFn(), sum)).without_defaults())
  assert_that(result, equal_to([('c', 111.0 / 3, 99.0)]))
  p.run()
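# Column-wise check (a hypothetical helper, plain arithmetic rather than Beam
# code) of the expected output above: TupleCombineFn applies max to the first
# fields, a mean to the second fields, and sum to the third fields.
def _tuple_combine(rows):
  firsts, seconds, thirds = zip(*rows)
  return (max(firsts), sum(seconds) / float(len(seconds)), sum(thirds))
# _tuple_combine([('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)])
#   -> ('c', 111.0 / 3, 99.0)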
def test_timestamp_param(self):
  class TestDoFn(DoFn):
    def process(self, element, timestamp=DoFn.TimestampParam):
      yield timestamp

  pipeline = TestPipeline()
  pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
  assert_that(pcoll, equal_to([MIN_TIMESTAMP, MIN_TIMESTAMP]))
  pipeline.run()
def test_context_param(self):
  class TestDoFn(DoFn):
    def process(self, element, context=DoFn.ContextParam):
      yield context.element + 10

  pipeline = TestPipeline()
  pcoll = pipeline | 'Create' >> Create([1, 2]) | 'Do' >> ParDo(TestDoFn())
  assert_that(pcoll, equal_to([11, 12]))
  pipeline.run()
def test_iterable_side_input(self):
  pipeline = self.create_pipeline()
  pcol = pipeline | 'start' >> beam.Create([1, 2])
  side = pipeline | 'side' >> beam.Create([3, 4])  # 2 values in side input.
  result = pcol | 'compute' >> beam.FlatMap(
      lambda x, s: [x * y for y in s], beam.pvalue.AsIter(side))
  assert_that(result, equal_to([3, 4, 6, 8]))
  pipeline.run()
def test_tuple_combine_fn(self):
  p = TestPipeline()
  result = (
      p
      | Create([('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)])
      | beam.CombineGlobally(combine.TupleCombineFn(
          max, combine.MeanCombineFn(), sum)).without_defaults())
  assert_that(result, equal_to([('c', 111.0 / 3, 99.0)]))
  p.run()
def test_metrics_in_source(self):
  pipeline = TestPipeline()
  pcoll = pipeline | Read(FakeSource([1, 2, 3, 4, 5, 6]))
  assert_that(pcoll, equal_to([1, 2, 3, 4, 5, 6]))
  res = pipeline.run()
  metric_results = res.metrics().query()
  outputs_counter = metric_results['counters'][0]
  self.assertEqual(outputs_counter.key.step, 'Read')
  self.assertEqual(outputs_counter.key.metric.name, 'outputs')
  self.assertEqual(outputs_counter.committed, 6)
def test_sink_transform(self):
  with tempfile.NamedTemporaryFile() as dst:
    path = dst.name
    with TestPipeline() as p:
      # pylint: disable=expression-not-assigned
      p | beam.Create(self.RECORDS) | avroio.WriteToAvro(path, self.SCHEMA)
    with TestPipeline() as p:
      # json used for stable sortability
      readback = p | avroio.ReadFromAvro(path + '*') | beam.Map(json.dumps)
      assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
def test_tuple_combine_fn_without_defaults(self):
  p = TestPipeline()
  result = (
      p
      | Create([1, 1, 2, 3])
      | beam.CombineGlobally(
          combine.TupleCombineFn(min, combine.MeanCombineFn(), max)
          .with_common_input()).without_defaults())
  assert_that(result, equal_to([(1, 7.0 / 4, 3)]))
  p.run()
def test_read_auto_bzip2(self):
  _, lines = write_data(15)
  file_name = self._create_temp_file(suffix='.bz2')
  with bz2.BZ2File(file_name, 'wb') as f:
    f.write('\n'.join(lines))

  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(file_name)
  assert_that(pcoll, equal_to(lines))
  pipeline.run()
def test_flattened_side_input(self):
  pipeline = self.create_pipeline()
  main_input = pipeline | 'main input' >> beam.Create([None])
  side_input = (
      pipeline | 'side1' >> beam.Create(['a']),
      pipeline | 'side2' >> beam.Create(['b'])) | beam.Flatten()
  results = main_input | beam.FlatMap(
      lambda _, ab: ab, beam.pvalue.AsList(side_input))
  assert_that(results, equal_to(['a', 'b']))
  pipeline.run()
def test_run_concat_direct(self):
  source = ConcatSource([RangeSource(0, 10),
                         RangeSource(10, 100),
                         RangeSource(100, 1000),
                        ])
  pipeline = TestPipeline()
  pcoll = pipeline | beam.Read(source)
  assert_that(pcoll, equal_to(range(1000)))
  pipeline.run()
def test_tuple_combine_fn_without_defaults(self):
  p = TestPipeline()
  result = (p
            | Create([1, 1, 2, 3])
            | beam.CombineGlobally(
                combine.TupleCombineFn(
                    min, combine.MeanCombineFn(), max)
                .with_common_input()).without_defaults())
  assert_that(result, equal_to([(1, 7.0 / 4, 3)]))
  p.run()
def test_deterministic_key(self):
  p = TestPipeline()
  lines = (p | beam.Create(
      ['banana,fruit,3', 'kiwi,fruit,2', 'kiwi,fruit,2', 'zucchini,veg,3']))

  # For pickling
  global Player  # pylint: disable=global-variable-not-assigned

  # [START type_hints_deterministic_key]
  class Player(object):
    def __init__(self, team, name):
      self.team = team
      self.name = name

  class PlayerCoder(beam.coders.Coder):
    def encode(self, player):
      return '%s:%s' % (player.team, player.name)

    def decode(self, s):
      return Player(*s.split(':'))

    def is_deterministic(self):
      return True

  beam.coders.registry.register_coder(Player, PlayerCoder)

  def parse_player_and_score(csv):
    name, team, score = csv.split(',')
    return Player(team, name), int(score)

  totals = (
      lines
      | beam.Map(parse_player_and_score)
      | beam.CombinePerKey(sum).with_input_types(
          beam.typehints.Tuple[Player, int]))
  # [END type_hints_deterministic_key]

  assert_that(
      totals | beam.Map(lambda (k, v): (k.name, v)),
      equal_to([('banana', 3), ('kiwi', 4), ('zucchini', 3)]))
  p.run()
def test_timestamped_value(self):
  p = TestPipeline()
  result = (p
            | 'start' >> Create([(k, k) for k in range(10)])
            | Map(lambda (x, t): TimestampedValue(x, t))
            | 'w' >> WindowInto(FixedWindows(5))
            | Map(lambda v: ('key', v))
            | GroupByKey())
  assert_that(result, equal_to([('key', [0, 1, 2, 3, 4]),
                                ('key', [5, 6, 7, 8, 9])]))
  p.run()
def test_sliding_windows(self):
  p = TestPipeline()
  pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3)
  result = (pcoll
            | 'w' >> WindowInto(SlidingWindows(period=2, size=4))
            | GroupByKey()
            | reify_windows)
  expected = [('key @ [-2.0, 2.0)', [1]),
              ('key @ [0.0, 4.0)', [1, 2, 3]),
              ('key @ [2.0, 6.0)', [2, 3])]
  assert_that(result, equal_to(expected))
  p.run()
def test_builtin_combines(self):
  pipeline = TestPipeline()

  vals = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
  mean = sum(vals) / float(len(vals))
  size = len(vals)

  # First for global combines.
  pcoll = pipeline | 'start' >> Create(vals)
  result_mean = pcoll | 'mean' >> combine.Mean.Globally()
  result_count = pcoll | 'count' >> combine.Count.Globally()
  assert_that(result_mean, equal_to([mean]), label='assert:mean')
  assert_that(result_count, equal_to([size]), label='assert:size')

  # Again for per-key combines.
  pcoll = pipeline | 'start-perkey' >> Create([('a', x) for x in vals])
  result_key_mean = pcoll | 'mean-perkey' >> combine.Mean.PerKey()
  result_key_count = pcoll | 'count-perkey' >> combine.Count.PerKey()
  assert_that(result_key_mean, equal_to([('a', mean)]), label='key:mean')
  assert_that(result_key_count, equal_to([('a', size)]), label='key:size')
  pipeline.run()
def test_flattened_side_input(self):
  pipeline = self.create_pipeline()
  main_input = pipeline | 'main input' >> beam.Create([None])
  side_input = (
      pipeline | 'side1' >> beam.Create(['a']),
      pipeline | 'side2' >> beam.Create(['b'])) | beam.Flatten()
  results = main_input | beam.FlatMap(
      lambda _, ab: ab, beam.pvalue.AsList(side_input))
  assert_that(results, equal_to(['a', 'b']))
  pipeline.run()
def test_window_param(self):
  class TestDoFn(DoFn):
    def process(self, element, window=DoFn.WindowParam):
      yield (element, (float(window.start), float(window.end)))

  pipeline = TestPipeline()
  pcoll = (pipeline
           | Create([1, 7])
           | Map(lambda x: TimestampedValue(x, x))
           | WindowInto(windowfn=SlidingWindows(10, 5))
           | ParDo(TestDoFn()))
  assert_that(pcoll, equal_to([(1, (-5, 5)), (1, (0, 10)),
                               (7, (0, 10)), (7, (5, 15))]))
  pcoll2 = pcoll | 'Again' >> ParDo(TestDoFn())
  assert_that(
      pcoll2,
      equal_to([
          ((1, (-5, 5)), (-5, 5)),
          ((1, (0, 10)), (0, 10)),
          ((7, (0, 10)), (0, 10)),
          ((7, (5, 15)), (5, 15))]),
      label='doubled windows')
  pipeline.run()
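# A rough illustration (a hypothetical helper using plain arithmetic, not
# Beam's window assignment code) of why SlidingWindows(10, 5) above assigns
# timestamp 1 to windows [-5, 5) and [0, 10), and timestamp 7 to [0, 10) and
# [5, 15): a window of size 10 starts every 5 units and contains t whenever
# start <= t < start + size.
def _sliding_window_starts(t, size=10, period=5):
  last_start = (t // period) * period
  return [start
          for start in range(last_start - size + period,
                             last_start + period, period)
          if start <= t < start + size]
# _sliding_window_starts(1) -> [-5, 0]; _sliding_window_starts(7) -> [0, 5]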
def test_read_gzip_with_skip_lines(self):
  _, lines = write_data(15)
  file_name = self._create_temp_file()
  with gzip.GzipFile(file_name, 'wb') as f:
    f.write('\n'.join(lines))

  pipeline = TestPipeline()
  pcoll = pipeline | 'Read' >> ReadFromText(
      file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder(),
      skip_header_lines=2)
  assert_that(pcoll, equal_to(lines[2:]))
  pipeline.run()
def test_sessions(self):
  p = TestPipeline()
  pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3, 20, 35, 27)
  result = (pcoll
            | 'w' >> WindowInto(Sessions(10))
            | GroupByKey()
            | sort_values
            | reify_windows)
  expected = [('key @ [1.0, 13.0)', [1, 2, 3]),
              ('key @ [20.0, 45.0)', [20, 27, 35])]
  assert_that(result, equal_to(expected))
  p.run()
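# A hedged sketch (plain Python, not Beam internals) of why Sessions(10)
# yields the two windows asserted above: each timestamp opens a window
# [t, t + 10), and overlapping windows merge, so 1, 2, 3 merge into [1, 13)
# while 20, 27, 35 chain together into [20, 45).
def _merge_sessions(timestamps, gap=10):
  # Hypothetical helper illustrating gap-based session merging.
  sessions = []
  for t in sorted(timestamps):
    if sessions and t < sessions[-1][1]:
      sessions[-1][1] = max(sessions[-1][1], t + gap)
    else:
      sessions.append([t, t + gap])
  return [tuple(s) for s in sessions]
# _merge_sessions([1, 2, 3, 20, 35, 27]) -> [(1, 13), (20, 45)]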