Example #1
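Verifies that WriteToKeyPartitionedFiles writes each key's records to its own shard file, checking the generated file names both with the default shard name template and with an explicit one.
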
    def test_expected_shard_files(self, temp_dir, stringify, key, count,
                                  shard_name_template):
        file_path_base = temp_dir
        file_name_prefix = 'shard'
        file_path_prefix = pp.join(file_path_base, file_name_prefix)
        file_name_suffix = '.json'

        messages = list(
            self._sample_data(stringify=stringify, key=key, count=count))

        with _TestPipeline() as p:
            messages = p | beam.Create(messages)

            result = messages | WriteToKeyPartitionedFiles(
                key,
                file_path_prefix,
                file_name_suffix,
                shard_name_template=shard_name_template)

            if shard_name_template is None:
                expected = [
                    '%s%s%s' %
                    (pp.join(file_path_base, str(i), file_name_prefix),
                     '-00000-of-00001', file_name_suffix) for i in range(count)
                ]
            else:
                # Of the form ..../shardN.json
                expected = [
                    '%s%s%s' % (pp.join(file_path_base, file_name_prefix),
                                str(i), file_name_suffix) for i in range(count)
                ]

            assert_that(result, equal_to(expected))
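
For orientation, here is how the transform under test might be used outside of a test harness. This is a hypothetical sketch: it assumes only the WriteToKeyPartitionedFiles signature exercised above (key, file_path_prefix, file_name_suffix), the sample records and output path are invented, and the import of WriteToKeyPartitionedFiles is omitted because its module path is not shown here.

import apache_beam as beam

# Hypothetical usage sketch (assumes the signature shown in the test above).
# Records are partitioned by the 'region' key, so with the default template
# output lands under out/<region>/shard-00000-of-00001.json.
with beam.Pipeline() as p:
    (p
     | beam.Create([
         {'region': '1', 'value': 1.0},
         {'region': '2', 'value': 2.0},
     ])
     | WriteToKeyPartitionedFiles('region', 'out/shard', '.json'))
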
Example #2
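Runs WriteToDatePartitionedFiles end to end and checks that one shard file per date and shard number is produced, with names built from the sink's shard name format.
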
    def test_as_pipeline(self, temp_dir, shards_per_day):

        file_path_base = temp_dir
        file_name_prefix = 'shard'
        file_path_prefix = pp.join(file_path_base, file_name_prefix)
        file_name_suffix = '.json'

        messages = list(self._sample_data())
        dates = {
            datetimeFromTimestamp(msg['timestamp']).strftime(
                DatePartitionedFileSink.DATE_FORMAT)
            for msg in messages
        }

        with _TestPipeline() as p:
            writer = WriteToDatePartitionedFiles(
                file_path_prefix, file_name_suffix,
                shards_per_day=shards_per_day)
            messages = (
                p
                | beam.Create(messages)
                | beam.Map(lambda msg: TimestampedValue(msg, msg['timestamp']))
            )

            result = messages | writer

            expected = []
            for date in dates:
                for shard_num in range(shards_per_day):
                    shard = writer._sink.shard_name_format % dict(
                        shard_num=shard_num, num_shards=shards_per_day)
                    expected.append(
                        pp.join(file_path_base, date, file_name_prefix)
                        + shard + file_name_suffix)

            assert_that(result, equal_to(expected))
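
Similarly, a minimal non-test sketch of the date-partitioned writer. This is hypothetical: it assumes the signature used above (file_path_prefix, file_name_suffix, shards_per_day) and that elements must carry event timestamps via TimestampedValue so the sink can bucket them by date; the sample record is invented and the import of WriteToDatePartitionedFiles is omitted because its module path is not shown here.

import apache_beam as beam
from apache_beam.transforms.window import TimestampedValue

msgs = [{'timestamp': 1500000000.0, 'value': 1}]  # invented sample record

with beam.Pipeline() as p:
    (p
     | beam.Create(msgs)
     # attach event-time timestamps so the sink can partition by date
     | beam.Map(lambda m: TimestampedValue(m, m['timestamp']))
     | WriteToDatePartitionedFiles('out/shard', '.json', shards_per_day=2))
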
Example #3
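Round-trips a list of generated messages through GCPSink and compares the newline-delimited JSON shards it writes against the input.
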
    def test_gcp_sink(self, temp_dir):
        messages = list(MessageGenerator().messages())
        dest = pp.join(temp_dir, 'messages.json')

        with _TestPipeline() as p:
            (p | beam.Create(messages) | GCPSink(dest))
        # no explicit p.run() needed: the with block runs the pipeline on exit

        with open_shards('%s*' % dest) as output:
            assert sorted(messages) == sorted(nlj.load(output))
Example #4
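Writes newline-delimited JSON to a file, reads it back through GCPSource, and asserts that the pipeline output matches what was written.
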
    def test_gcp_source(self, temp_dir):
        expected = list(MessageGenerator().messages())
        source = pp.join(temp_dir, 'messages.json')
        with open(source, 'w') as f:
            nlj.dump(expected, f, json_lib=ujson)

        with _TestPipeline() as p:
            messages = p | GCPSource(source)
            # assert_that must be attached while the pipeline is being built;
            # the with block then runs the pipeline on exit
            assert_that(messages, equal_to(expected))
Example #5
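Checks the output type hint that beam.Create infers from a MessageGenerator, then runs a trivial pipeline to make sure the hint does not break execution.
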
    def test_type_hints(self):

        messages = MessageGenerator()

        source = beam.Create(messages)
        assert source.get_output_type() == Dict[six.binary_type, Union[float, int]]

        with _TestPipeline() as p:
            # exercise the hint end to end; the with block runs the pipeline
            # on exit, so no explicit p.run() is needed
            result = p | beam.Create(messages)
Example #6
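A stricter variant of the GCPSink round trip: keys are normalized with fix_keys and both sides are sorted by timestamp before comparison.
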
    def test_gcp_sink(self, temp_dir):
        messages = list(MessageGenerator().messages())
        dest = pp.join(temp_dir, 'messages.json')

        with _TestPipeline() as p:
            (p | beam.Create(messages) | GCPSink(dest))
        # no explicit p.run() needed: the with block runs the pipeline on exit

        with open_shards('%s*' % dest) as output:
            expected = sorted(messages, key=lambda x: x[b'timestamp'])
            actual = sorted([fix_keys(d) for d in nlj.load(output)],
                            key=lambda x: x[b'timestamp'])
            assert expected == actual
Example #7
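Checks that beam.Create infers the JSONDict output type and that JSONDictCoder is registered for that type in the coder registry.
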
    def test_type_hints(self):

        messages = MessageGenerator()

        source = beam.Create(messages)
        assert source.get_output_type() == JSONDict
        assert typecoders.registry._coders.get(
            source.get_output_type()) == JSONDictCoder

        with _TestPipeline() as p:
            # exercise the registered coder end to end; the with block runs
            # the pipeline on exit, so no explicit p.run() is needed
            result = p | beam.Create(messages)
Example #8
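Assembles the Segment transform into a complete pipeline: messages are read from JSON, keyed and grouped, segmented, and both output streams are written to single-shard files and compared against expected fixtures.
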
    def test_Pipeline_parts(self, test_data_dir, temp_dir):
        source = pp.join(test_data_dir, 'input.json')
        messages_sink = pp.join(temp_dir, 'messages')
        segments_sink = pp.join(temp_dir, 'segments')
        expected_messages = pp.join(test_data_dir, 'expected_messages.json')
        expected_segments = pp.join(test_data_dir, 'expected_segments.json')

        with _TestPipeline() as p:
            messages = (
                p
                | beam.io.ReadFromText(file_pattern=source,
                                       coder=JSONDictCoder())
                | "MessagesAddKey" >> beam.Map(SegmentPipeline.groupby_fn)
                | "MessagesGroupByKey" >> beam.GroupByKey())
            segments = p | beam.Create([])
            segmented = messages | Segment(segments)

            messages = segmented[Segment.OUTPUT_TAG_MESSAGES]
            (messages
             | "WriteToMessagesSink" >> beam.io.WriteToText(
                 file_path_prefix=messages_sink,
                 num_shards=1,
                 coder=JSONDictCoder()))

            segments = segmented[Segment.OUTPUT_TAG_SEGMENTS]
            (segments
             | "WriteToSegmentsSink" >> beam.io.WriteToText(
                 file_path_prefix=segments_sink,
                 num_shards=1,
                 coder=JSONDictCoder()))

            p.run()
            with nlj.open(expected_messages) as expected:
                with open_shards('%s*' % messages_sink) as output:
                    assert sorted(expected) == sorted(nlj.load(output))

            with nlj.open(expected_segments) as expected_output:
                with open_shards('%s*' % segments_sink) as actual_output:
                    for expected, actual in zip(
                            sorted(expected_output, key=lambda x: x['seg_id']),
                            sorted(nlj.load(actual_output),
                                   key=lambda x: x['seg_id'])):
                        assert set(expected.items()).issubset(
                            set(actual.items()))
Example #9
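A shared helper rather than a test: it pushes messages and segments through the Segment transform, writes both outputs, reads the shards back, sanity-checks the messages against the input, and returns both results for the caller to assert on.
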
    def _run_segment(self, messages_in, segments_in, temp_dir):
        messages_file = pp.join(temp_dir, '_run_segment', 'messages')
        segments_file = pp.join(temp_dir, '_run_segment', 'segments')

        with _TestPipeline() as p:
            messages = (
                p | 'CreateMessages' >> beam.Create(messages_in)
                | 'AddKeyMessages' >> beam.Map(self.groupby_fn)
                | "MessagesGroupByKey" >> beam.GroupByKey()
            )
            segments = (
                p | 'CreateSegments' >> beam.Create(segments_in)
                | 'AddKeySegments' >> beam.Map(self.groupby_fn)
                | "SegmentsGroupByKey" >> beam.GroupByKey()
            )
            segmented = (
                messages
                | "Segment" >> Segment(segments)
            )
            messages = segmented[Segment.OUTPUT_TAG_MESSAGES]
            segments = segmented[Segment.OUTPUT_TAG_SEGMENTS]
            messages | "WriteMessages" >> beam.io.WriteToText(
                messages_file, coder=JSONDictCoder())
            segments | "WriteSegments" >> beam.io.WriteToText(
                segments_file, coder=JSONDictCoder())

            p.run()

            with open_shards('%s*' % messages_file) as output:
                messages = sorted(nlj.load(output),
                                  key=lambda m: (m['ssvid'], m['timestamp']))
            with open_shards('%s*' % segments_file) as output:
                segments = list(nlj.load(output))

            assert list_contains(messages, messages_in)

            return messages, segments