Example #1
    def test_Pipeline_parts(self, test_data_dir, temp_dir):
        source = pp.join(test_data_dir, 'input.json')
        messages_sink = pp.join(temp_dir, 'messages')
        segments_sink = pp.join(temp_dir, 'segments')
        expected_messages = pp.join(test_data_dir, 'expected_messages.json')
        expected_segments = pp.join(test_data_dir, 'expected_segments.json')

        with _TestPipeline() as p:
            messages = (
                p
                | beam.io.ReadFromText(file_pattern=source,
                                       coder=JSONDictCoder())
                | "MessagesAddKey" >> beam.Map(SegmentPipeline.groupby_fn)
                | "MessagesGroupByKey" >> beam.GroupByKey())
            segments = p | beam.Create([])
            segmented = messages | Segment(segments)

            messages = segmented[Segment.OUTPUT_TAG_MESSAGES]
            (messages
             | "WriteToMessagesSink" >> beam.io.WriteToText(
                 file_path_prefix=messages_sink,
                 num_shards=1,
                 coder=JSONDictCoder()))

            segments = segmented[Segment.OUTPUT_TAG_SEGMENTS]
            (segments
             | "WriteToSegmentsSink" >> beam.io.WriteToText(
                 file_path_prefix=segments_sink,
                 num_shards=1,
                 coder=JSONDictCoder()))

            p.run()
            with nlj.open(expected_messages) as expected:
                with open_shards('%s*' % messages_sink) as output:
                    assert sorted(expected) == sorted(nlj.load(output))

            with nlj.open(expected_segments) as expected_output:
                with open_shards('%s*' % segments_sink) as actual_output:
                    for expected, actual in zip(
                            sorted(expected_output, key=lambda x: x['seg_id']),
                            sorted(nlj.load(actual_output),
                                   key=lambda x: x['seg_id'])):
                        assert set(expected.items()).issubset(
                            set(actual.items()))
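Note: JSONDictCoder comes from the pipeline's own support code and is not
shown on this page. As a rough sketch (an assumption, not the project's
actual implementation), a Beam coder that moves dicts in and out of JSON
text could look like this:

import json

from apache_beam.coders import Coder


class JSONDictCoder(Coder):
    """Sketch: (de)serialize dicts as UTF-8 JSON strings."""

    def encode(self, value):
        return json.dumps(value).encode('utf-8')

    def decode(self, encoded):
        return json.loads(encoded.decode('utf-8'))

    def is_deterministic(self):
        # Equal dicts can serialize with different key order, so don't
        # advertise this coder as deterministic.
        return False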
Example #2
    def test_gcp_sink(self, temp_dir):
        messages = list(MessageGenerator().messages())
        dest = pp.join(temp_dir, 'messages.json')

        # Leaving the `with` block runs the pipeline, so an explicit
        # p.run() afterwards is redundant.
        with _TestPipeline() as p:
            (p | beam.Create(messages) | GCPSink(dest))

        with open_shards('%s*' % dest) as output:
            assert sorted(messages) == sorted(nlj.load(output))
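Note: every example on this page reads the pipeline's sharded text output
back through open_shards, a helper from the project's test utilities. A
minimal stand-in with the same contract (a context manager yielding one
stream that concatenates all files matching a glob pattern) might be:

import contextlib
import glob
import io


@contextlib.contextmanager
def open_shards(pattern):
    """Yield a single readable stream over all shard files matching `pattern`."""
    buf = io.StringIO()
    for path in sorted(glob.glob(pattern)):
        with open(path) as f:
            buf.write(f.read())
    buf.seek(0)
    yield buf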
Example #3
    def _run_pipeline(self, tag_field, tag_value, dest, expected, args=None):
        # A mutable default (args=[]) would be shared across calls and then
        # grown by +=; build a fresh list instead.
        args = list(args or []) + [
            '--tag_field=%s' % tag_field,
            '--tag_value=%s' % tag_value,
            '--dest=%s' % dest,
            '--wait',
        ]

        pipe_template.__main__.run(args)

        with open_shards('%s*' % dest) as output:
            assert sorted(expected, key=lambda x: x['idx']) == sorted(
                nlj.load(output), key=lambda x: x['idx'])
Example #4
    def test_gcp_sink(self, temp_dir):
        messages = list(MessageGenerator().messages())
        dest = pp.join(temp_dir, 'messages.json')

        # As in Example #2, exiting the `with` block runs the pipeline.
        with _TestPipeline() as p:
            (p | beam.Create(messages) | GCPSink(dest))

        with open_shards('%s*' % dest) as output:
            assert (sorted(messages, key=lambda x: x[b'timestamp']) == sorted(
                [fix_keys(d) for d in nlj.load(output)],
                key=lambda x: x[b'timestamp']))
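Note: fix_keys is not defined in this snippet. Judging from the byte-string
lookups (x[b'timestamp']), it presumably re-encodes the text keys that JSON
decoding produces so they match the generated messages; a hypothetical
version:

def fix_keys(d):
    # Assumed behavior: encode each text key to bytes so loaded records can
    # be compared against messages that use byte-string keys.
    return {k.encode('utf-8'): v for k, v in d.items()}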
Example #5
    def _run_segment(self, messages_in, segments_in, temp_dir):
        messages_file = pp.join(temp_dir, '_run_segment', 'messages')
        segments_file = pp.join(temp_dir, '_run_segment', 'segments')

        with _TestPipeline() as p:
            messages = (
                p | 'CreateMessages' >> beam.Create(messages_in)
                | 'AddKeyMessages' >> beam.Map(self.groupby_fn)
                | "MessagesGroupByKey" >> beam.GroupByKey()
            )
            segments = (
                p | 'CreateSegments' >> beam.Create(segments_in)
                | 'AddKeySegments' >> beam.Map(self.groupby_fn)
                | "SegmentsGroupByKey" >> beam.GroupByKey()
            )
            segmented = (
                messages
                | "Segment" >> Segment(segments)
            )
            messages = segmented[Segment.OUTPUT_TAG_MESSAGES]
            segments = segmented[Segment.OUTPUT_TAG_SEGMENTS]
            messages | "WriteMessages" >> beam.io.WriteToText(
                messages_file, coder=JSONDictCoder())
            segments | "WriteSegments" >> beam.io.WriteToText(
                segments_file, coder=JSONDictCoder())

            p.run()

            with open_shards('%s*' % messages_file) as output:
                messages = sorted(nlj.load(output),
                                  key=lambda m: (m['ssvid'], m['timestamp']))
            with open_shards('%s*' % segments_file) as output:
                segments = list(nlj.load(output))

            assert list_contains(messages, messages_in)

            return messages, segments
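Note: list_contains is another external helper. Given the subset assertion
in Example #1, a plausible reading is that every input message must appear,
as a key/value subset, in some output message; a sketch under that
assumption:

def list_contains(actual, expected):
    # Assumed semantics: each expected dict is a key/value subset of at
    # least one dict in the actual output (extra output fields are fine).
    return all(
        any(set(e.items()).issubset(set(a.items())) for a in actual)
        for e in expected
    )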
Example #6
    def _run_pipeline(self,
                      source,
                      messages_sink,
                      segments_sink,
                      expected,
                      args=None):
        # Same fix as in Example #3: don't mutate a shared default argument.
        args = list(args or []) + [
            '--source=%s' % source,
            '--source_schema={"fields": []}',
            '--dest=%s' % messages_sink,
            '--segments=%s' % segments_sink,
            '--wait',
        ]

        pipe_segment_run(args)

        with nlj.open(expected) as expected_output:
            with open_shards('%s*' % messages_sink) as output:
                assert sorted(expected_output) == sorted(nlj.load(output))
Example #7
def test_load(dicts_path, compare_iter):
    with nlj.open(dicts_path) as e:
        with open(dicts_path) as f:
            with nlj.load(f) as a:
                compare_iter(e, a)
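Note: this example contrasts the two newlinejson entry points used
throughout the page: nlj.open() takes a path, while nlj.load() wraps an
already-open file object. A small round trip (assuming a writable
sample.json path):

import newlinejson as nlj

# Write two records as newline-delimited JSON, then read them back.
with nlj.open('sample.json', 'w') as dst:
    dst.write({'idx': 0})
    dst.write({'idx': 1})

with nlj.open('sample.json') as src:
    assert list(src) == [{'idx': 0}, {'idx': 1}]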