def test_pipeline_read_all_multiple_files_large(self):
  pipeline = TestPipeline()
  pcoll = (pipeline
           | 'Create' >> beam.Create(
               [testdata_util.get_full_file_path('valid-4.0.vcf'),
                testdata_util.get_full_file_path('valid-4.1-large.vcf'),
                testdata_util.get_full_file_path('valid-4.2.vcf')])
           | 'Read' >> ReadAllFromVcf())
  assert_that(pcoll, asserts.count_equals_to(9900))
  pipeline.run()
def test_pipeline_read_all_multiple_files(self):
  with TempDir() as tempdir:
    file_name_1 = self._create_temp_vcf_file(
        _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tempdir)
    file_name_2 = self._create_temp_vcf_file(
        _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tempdir)
    pipeline = TestPipeline()
    pcoll = (pipeline
             | 'Create' >> beam.Create([file_name_1, file_name_2])
             | 'Read' >> ReadAllFromVcf())
    assert_that(pcoll, asserts.count_equals_to(2 * len(_SAMPLE_TEXT_LINES)))
    pipeline.run()
def _assert_pipeline_read_files_record_count_equal(
    self, input_pattern, expected_count, use_read_all=False):
  """Helper method for verifying the total number of records read.

  Args:
    input_pattern (str): Input file pattern to read.
    expected_count (int): Expected number of records that were read.
    use_read_all (bool): Whether to use the scalable ReadAllFromVcf transform
      instead of ReadFromVcf.
  """
  pipeline = TestPipeline()
  if use_read_all:
    pcoll = (pipeline
             | 'Create' >> beam.Create([input_pattern])
             | 'Read' >> ReadAllFromVcf())
  else:
    pcoll = pipeline | 'Read' >> ReadFromVcf(input_pattern)
  assert_that(pcoll, asserts.count_equals_to(expected_count))
  pipeline.run()
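# A minimal sketch (not part of the original suite) showing how the helper
# above could be exercised for both code paths. It reuses only names already
# present in this file (_SAMPLE_HEADER_LINES, _SAMPLE_TEXT_LINES, TempDir,
# _create_temp_vcf_file); no new fixtures or record counts are assumed.
def test_read_single_file_record_count_both_paths(self):
  with TempDir() as tempdir:
    file_name = self._create_temp_vcf_file(
        _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tempdir)
    # ReadFromVcf path.
    self._assert_pipeline_read_files_record_count_equal(
        file_name, len(_SAMPLE_TEXT_LINES))
    # Scalable ReadAllFromVcf path.
    self._assert_pipeline_read_files_record_count_equal(
        file_name, len(_SAMPLE_TEXT_LINES), use_read_all=True)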