def test_Pipeline_parts(self, test_data_dir, temp_dir):
    source = pp.join(test_data_dir, 'input.json')
    messages_sink = pp.join(temp_dir, 'messages')
    segments_sink = pp.join(temp_dir, 'segments')
    expected_messages = pp.join(test_data_dir, 'expected_messages.json')
    expected_segments = pp.join(test_data_dir, 'expected_segments.json')

    with _TestPipeline() as p:
        messages = (
            p
            | beam.io.ReadFromText(file_pattern=source, coder=JSONDictCoder())
            | "MessagesAddKey" >> beam.Map(SegmentPipeline.groupby_fn)
            | "MessagesGroupByKey" >> beam.GroupByKey())
        segments = p | beam.Create([])
        segmented = messages | Segment(segments)

        messages = segmented[Segment.OUTPUT_TAG_MESSAGES]
        (messages
         | "WriteToMessagesSink" >> beam.io.WriteToText(
             file_path_prefix=messages_sink,
             num_shards=1,
             coder=JSONDictCoder()))

        segments = segmented[Segment.OUTPUT_TAG_SEGMENTS]
        (segments
         | "WriteToSegmentsSink" >> beam.io.WriteToText(
             file_path_prefix=segments_sink,
             num_shards=1,
             coder=JSONDictCoder()))

        p.run()

    with nlj.open(expected_messages) as expected:
        with open_shards('%s*' % messages_sink) as output:
            assert sorted(expected) == sorted(nlj.load(output))

    with nlj.open(expected_segments) as expected_output:
        with open_shards('%s*' % segments_sink) as actual_output:
            for expected, actual in zip(
                    sorted(expected_output, key=lambda x: x['seg_id']),
                    sorted(nlj.load(actual_output), key=lambda x: x['seg_id'])):
                assert set(expected.items()).issubset(set(actual.items()))
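# JSONDictCoder (used above and below) round-trips dicts through Beam's text
# IO. A minimal sketch of such a coder, assuming one JSON object per line, is
# below; this is an illustration only, not the project's actual implementation.
import json

from apache_beam.coders import Coder


class JSONDictCoderSketch(Coder):
    """Hypothetical coder: encodes each dict as one JSON text line."""

    def encode(self, value):
        return json.dumps(value).encode('utf-8')  # WriteToText expects bytes

    def decode(self, encoded):
        return json.loads(encoded.decode('utf-8'))

    def is_deterministic(self):
        return False  # JSON key order is not guaranteed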
def test_gcp_sink(self, temp_dir):
    messages = list(MessageGenerator().messages())
    dest = pp.join(temp_dir, 'messages.json')

    with _TestPipeline() as p:
        (p
         | beam.Create(messages)
         | GCPSink(dest))
        p.run()

    with open_shards('%s*' % dest) as output:
        assert sorted(messages) == sorted(nlj.load(output))
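# open_shards is the test helper these assertions lean on to read back the
# sharded output of WriteToText as a single stream. A minimal sketch, assuming
# it simply concatenates every file matching the glob (hypothetical; the real
# helper lives in the project's test utilities):
import glob
import io


def open_shards_sketch(pattern):
    """Concatenate all shard files matching `pattern` into one readable stream."""
    buf = io.StringIO()
    for path in sorted(glob.glob(pattern)):
        with open(path) as f:
            buf.write(f.read())
    buf.seek(0)
    return buf  # StringIO supports the context-manager protocol used above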
def _run_pipeline(self, tag_field, tag_value, dest, expected, args=None):
    # Avoid a mutable default argument; build the full arg list per call.
    args = list(args or [])
    args += [
        '--tag_field=%s' % tag_field,
        '--tag_value=%s' % tag_value,
        '--dest=%s' % dest,
        '--wait',
    ]

    pipe_template.__main__.run(args)

    with open_shards('%s*' % dest) as output:
        assert sorted(expected, key=lambda x: x['idx']) == sorted(
            nlj.load(output), key=lambda x: x['idx'])
def test_gcp_sink(self, temp_dir):
    messages = list(MessageGenerator().messages())
    dest = pp.join(temp_dir, 'messages.json')

    with _TestPipeline() as p:
        (p
         | beam.Create(messages)
         | GCPSink(dest))
        p.run()

    with open_shards('%s*' % dest) as output:
        assert (sorted(messages, key=lambda x: x[b'timestamp'])
                == sorted([fix_keys(d) for d in nlj.load(output)],
                          key=lambda x: x[b'timestamp']))
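# fix_keys is applied above so both sides of the comparison index by the same
# byte-string key (b'timestamp'); JSON decoding yields text keys. A plausible
# sketch, assuming it just re-encodes str keys as bytes (hypothetical; the
# real helper is defined in the test module):
def fix_keys_sketch(d):
    """Return a copy of d with every str key encoded to bytes."""
    return {k.encode('utf-8') if isinstance(k, str) else k: v
            for k, v in d.items()}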
def _run_segment(self, messages_in, segments_in, temp_dir):
    messages_file = pp.join(temp_dir, '_run_segment', 'messages')
    segments_file = pp.join(temp_dir, '_run_segment', 'segments')

    with _TestPipeline() as p:
        messages = (
            p
            | 'CreateMessages' >> beam.Create(messages_in)
            | 'AddKeyMessages' >> beam.Map(self.groupby_fn)
            | "MessagesGroupByKey" >> beam.GroupByKey()
        )
        segments = (
            p
            | 'CreateSegments' >> beam.Create(segments_in)
            | 'AddKeySegments' >> beam.Map(self.groupby_fn)
            | "SegmentsGroupByKey" >> beam.GroupByKey()
        )
        segmented = (
            messages
            | "Segment" >> Segment(segments)
        )
        # Index both outputs by their tag constants for consistency.
        messages = segmented[Segment.OUTPUT_TAG_MESSAGES]
        segments = segmented[Segment.OUTPUT_TAG_SEGMENTS]
        messages | "WriteMessages" >> beam.io.WriteToText(
            messages_file, coder=JSONDictCoder())
        segments | "WriteSegments" >> beam.io.WriteToText(
            segments_file, coder=JSONDictCoder())
        p.run()

    with open_shards('%s*' % messages_file) as output:
        messages = sorted(list(nlj.load(output)),
                          key=lambda m: (m['ssvid'], m['timestamp']))
    with open_shards('%s*' % segments_file) as output:
        segments = list(nlj.load(output))

    assert list_contains(messages, messages_in)
    return messages, segments
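# list_contains asserts that every input message survives segmenting, even
# though the segmenter may add fields (e.g. seg_id). A minimal sketch with
# subset semantics per message (hypothetical; the real helper is a test util):
def list_contains_sketch(actual, expected):
    """True if every dict in `expected` is a sub-dict of some dict in `actual`."""
    def is_subdict(small, big):
        return set(small.items()).issubset(set(big.items()))

    return all(any(is_subdict(e, a) for a in actual) for e in expected)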
def _run_pipeline(self, source, messages_sink, segments_sink, expected, args=None):
    # Avoid a mutable default argument; build the full arg list per call.
    args = list(args or [])
    args += [
        '--source=%s' % source,
        '--source_schema={"fields": []}',
        '--dest=%s' % messages_sink,
        '--segments=%s' % segments_sink,
        '--wait',
    ]

    pipe_segment_run(args)

    with nlj.open(expected) as expected:
        with open_shards('%s*' % messages_sink) as output:
            assert sorted(expected) == sorted(nlj.load(output))
def test_load(dicts_path, compare_iter):
    with nlj.open(dicts_path) as e:
        with open(dicts_path) as f:
            with nlj.load(f) as a:
                compare_iter(e, a)
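# compare_iter above is a pytest fixture that checks two record streams are
# equal. A minimal sketch, assuming element-wise equality with matching
# lengths (hypothetical; the real fixture ships with the test suite):
import itertools

import pytest


@pytest.fixture(scope='function')
def compare_iter_sketch():
    def _compare(expected, actual):
        sentinel = object()
        for e, a in itertools.zip_longest(expected, actual, fillvalue=sentinel):
            assert e is not sentinel and a is not sentinel, 'length mismatch'
            assert e == a
    return _compare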