def test_validation_failure_for_empty_input_file(self):
  with temp_dir.TempDir() as tempdir:
    filename = tempdir.create_temp_file(lines=[])
    with self.assertRaisesRegexp(ValueError, 'Input file .* is empty.'):
      pipeline_common._get_all_patterns(input_pattern=None,
                                        input_file=filename)

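# Note on the assertion API: `assertRaisesRegexp` (used here and in a few
# tests below) is the Python 2 spelling; Python 3 keeps it only as a
# deprecated alias of `assertRaisesRegex`, which the newer tests in this
# section already use. A minimal sketch of the modern spelling for the test
# above (same behavior, stdlib unittest only):
#
#   with self.assertRaisesRegex(ValueError, 'Input file .* is empty.'):
#     pipeline_common._get_all_patterns(input_pattern=None,
#                                       input_file=filename)
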
def test_write_headers(self):
  header = _get_vcf_header_from_lines(self.lines)
  with temp_dir.TempDir() as tempdir:
    tempfile = tempdir.create_temp_file(suffix='.vcf')
    header_fn = WriteVcfHeaderFn(tempfile)
    header_fn.process(header)
    self._assert_file_contents_equal(tempfile, self.lines)

def test_config_failed_missing_partition_name(self):
  tempdir = temp_dir.TempDir()
  missing_par_name = [
      '- partition:',
      ' regions:',
      ' - "chr1:0-1,000,000"',
  ]
  with self.assertRaisesRegexp(
      ValueError, 'Each partition must have partition_name field.'):
    _ = variant_partition.VariantPartition(
        tempdir.create_temp_file(suffix='.yaml',
                                 lines='\n'.join(missing_par_name)))

  empty_par_name = [
      '- partition:',
      ' partition_name: " "',
      ' regions:',
      ' - "chr1:0-1,000,000"',
  ]
  with self.assertRaisesRegexp(
      ValueError, 'Partition name can not be empty string.'):
    _ = variant_partition.VariantPartition(
        tempdir.create_temp_file(suffix='.yaml',
                                 lines='\n'.join(empty_par_name)))

def test_pipeline_read_all_file_pattern(self):
  with temp_dir.TempDir() as tempdir:
    headers_1 = [self.lines[1], self.lines[-1]]
    headers_2 = [self.lines[2], self.lines[3], self.lines[-1]]
    headers_3 = [self.lines[4], self.lines[-1]]

    file_name_1 = tempdir.create_temp_file(suffix='.vcf', lines=headers_1)
    file_name_2 = tempdir.create_temp_file(suffix='.vcf', lines=headers_2)
    file_name_3 = tempdir.create_temp_file(suffix='.vcf', lines=headers_3)

    pipeline = TestPipeline()
    pcoll = (pipeline
             | 'Create' >> beam.Create(
                 [os.path.join(tempdir.get_path(), '*.vcf')])
             | 'ReadHeaders' >> ReadAllVcfHeaders())

    expected = [
        _get_vcf_header_from_lines(h, file_name=file_name)
        for h, file_name in [(headers_1, file_name_1),
                             (headers_2, file_name_2),
                             (headers_3, file_name_3)]
    ]
    assert_that(pcoll, asserts.header_vars_equal(expected))
    pipeline.run()

def test_pipeline_read_all_file_pattern(self):
  with temp_dir.TempDir() as tempdir:
    lines_1 = self.headers[1:2] + self.headers[-1:] + self.records[:2]
    lines_2 = self.headers[2:4] + self.headers[-1:] + self.records[2:4]
    lines_3 = self.headers[4:5] + self.headers[-1:] + self.records[4:]

    file_name_1 = tempdir.create_temp_file(suffix='.vcf', lines=lines_1)
    file_name_2 = tempdir.create_temp_file(suffix='.vcf', lines=lines_2)
    file_name_3 = tempdir.create_temp_file(suffix='.vcf', lines=lines_3)

    pipeline = TestPipeline()
    pcoll = (pipeline
             | 'Create' >> beam.Create(
                 [os.path.join(tempdir.get_path(), '*.vcf')])
             | 'GetAllEstimates' >> GetAllEstimates())

    expected = [
        _get_estimate_from_lines(lines, file_name=file_name)
        for lines, file_name in [(lines_1, file_name_1),
                                 (lines_2, file_name_2),
                                 (lines_3, file_name_3)]
    ]
    assert_that(pcoll, asserts.header_vars_equal(expected))
    pipeline.run()

def test_config_failed_duplicate_residual_shard(self):
  tempdir = temp_dir.TempDir()
  duplicate_residual = [
      '- output_table:',
      ' table_name_suffix: "all_remaining"',
      ' regions:',
      ' - "residual"',
      ' partition_range_end: 999999999',
      '- output_table:',
      ' table_name_suffix: "chr01"',
      ' regions:',
      ' - "chr1"',
      ' partition_range_end: 999999999',
      '- output_table:',
      ' table_name_suffix: "all_remaining_2"',
      ' regions:',
      ' - "residual"',
      ' partition_range_end: 999999999',
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, there can be only one residual output*'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(suffix='.yaml',
                                 lines='\n'.join(duplicate_residual)))

def test_config_failed_duplicate_table_name(self):
  tempdir = temp_dir.TempDir()
  dup_table_name = [
      '- output_table:',
      ' table_name_suffix: "duplicate_name"',
      ' regions:',
      ' - "chr1:0-1,000,000"',
      ' partition_range_end: 999999999',
      '- output_table:',
      ' table_name_suffix: "all_remaining"',
      ' regions:',
      ' - "residual"',
      ' partition_range_end: 999999999',
      '- output_table:',
      ' table_name_suffix: "duplicate_name"',
      ' regions:',
      ' - "chr1:1,000,000-2,000,000"',
      ' partition_range_end: 999999999',
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, table name suffixes must be unique*'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(suffix='.yaml',
                                 lines='\n'.join(dup_table_name)))

def test_config_failed_missing_shard_name(self):
  tempdir = temp_dir.TempDir()
  missing_par_name = [
      '- output_table:',
      ' regions:',
      ' - "chr1:0-1,000,000"',
      ' partition_range_end: 999999999',
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, table_name_suffix field missing.'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(suffix='.yaml',
                                 lines='\n'.join(missing_par_name)))

  empty_par_name = [
      '- output_table:',
      ' table_name_suffix: " "',
      ' regions:',
      ' - "chr1:0-1,000,000"',
      ' partition_range_end: 999999999',
  ]
  with self.assertRaisesRegex(
      ValueError,
      'Wrong sharding config file, table_name_suffix can not be empty.'):
    _ = variant_sharding.VariantSharding(
        tempdir.create_temp_file(suffix='.yaml',
                                 lines='\n'.join(empty_par_name)))

def _create_file_and_read_headers(self):
  with temp_dir.TempDir() as tempdir:
    filename = tempdir.create_temp_file(suffix='.vcf', lines=self.lines)
    headers = source_test_utils.read_from_source(VcfHeaderSource(filename))
    return headers[0]

def test_write_vcf_data_header(self):
  lines = [
      '##fileformat=VCFv4.2\n',
      '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
      '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n',
      '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
      '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="GQ">\n',
      '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT \n'
  ]
  with temp_dir.TempDir() as tempdir:
    representative_header = tempdir.create_temp_file(lines=lines)
    file_path = filesystems.FileSystems.join(tempdir.get_path(),
                                             'data_header')
    bq_to_vcf._write_vcf_header_with_call_names(
        ['Sample 1', 'Sample 2'],
        ['#CHROM', 'POS', 'ID', 'REF', 'ALT'],
        representative_header,
        file_path)

    expected_content = [
        '##fileformat=VCFv4.2\n',
        '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
        '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
        '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="GQ">\n',
        '#CHROM\tPOS\tID\tREF\tALT\tSample 1\tSample 2\n'
    ]
    with filesystems.FileSystems.open(file_path) as f:
      content = f.readlines()
      self.assertEqual(content, expected_content)

def test_write_to_shards(self):
  with temp_dir.TempDir() as tempdir:
    shards_writer = write_variants_to_shards._WriteVariantsToVCFShards(
        tempdir.get_path(), 3)
    variants = self._get_variants()
    variant_lines = [
        shards_writer._coder.encode(v).strip('\n') for v in variants
    ]
    shards_writer._write_variant_lines_to_vcf_shard(variant_lines)

    expected_content = [
        '\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER',
                   'INFO', 'FORMAT\n']),
        '\t'.join(['19', '12', 'rs1', 'C', 'A,TT', '2', 'PASS',
                   'A1=some data;A2=data1,data2', '.\n']),
        '\t'.join(['19', '12', 'rs1', 'C', 'A,TT', '20', 'q10',
                   'A1=some data2;A3=data3,data4', '.'])
    ]
    file_paths = []
    for dirpath, _, filenames in os.walk(tempdir.get_path()):
      for f in filenames:
        file_paths.append(os.path.abspath(os.path.join(dirpath, f)))
    self.assertEqual(1, len(file_paths))
    with filesystems.FileSystems.open(file_paths[0]) as f:
      content = f.readlines()
      self.assertEqual(content, expected_content)

def test_write_to_shards_pipeline(self):
  # Smoke test: the pipeline only needs to run to completion; no output
  # assertions are made.
  with temp_dir.TempDir() as tempdir:
    pipeline = TestPipeline()
    _ = (pipeline
         | Create(self._get_variants())
         | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
             tempdir.get_path(), ['Sample 1', 'Sample 2']))
    pipeline.run()

def test_print_estimates_to_file(self):
  with temp_dir.TempDir() as tempdir:
    file_path = os.path.join(tempdir.get_path(), 'test_file_name')
    extract_input_size.print_estimates_to_file(1, 2, 3, 4, 5, file_path)
    with FileSystems.open(file_path) as f:
      lines = f.readlines()
    self.assertEqual([int(line.strip()) for line in lines], [1, 2, 3, 4, 5])

def test_write_dataflow(self):
  header = _get_vcf_header_from_lines(self.lines)
  with temp_dir.TempDir() as tempdir:
    tempfile = tempdir.create_temp_file(suffix='.vcf')
    pipeline = TestPipeline()
    pcoll = pipeline | beam.Create([header])
    _ = pcoll | 'Write' >> WriteVcfHeaders(tempfile)
    pipeline.run()
    self._assert_file_contents_equal(tempfile, self.lines)

def test_get_mode_optimize_set(self):
  with temp_dir.TempDir() as tempdir:
    filename = tempdir.create_temp_file(lines=self.SAMPLE_LINES)
    args = self._create_mock_args(input_pattern=None,
                                  input_file=filename,
                                  optimize_for_large_inputs=True)
    self.assertEqual(self._get_pipeline_mode(args), PipelineModes.LARGE)

def test_empty_file(self):
  lines = []
  with temp_dir.TempDir() as tempdir:
    file_path = self._create_temp_vcf_file(lines, tempdir)
    try:
      vcf_header_parser.get_vcf_headers(file_path)
      self.fail('Empty VCF file must throw an exception.')
    except ValueError:
      pass

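# The try/fail/except pattern above (also in test_invalid_file below)
# predates `assertRaises` as a context manager; a sketch of the more
# idiomatic modern form, equivalent in behavior minus the custom failure
# message:
#
#   with self.assertRaises(ValueError):
#     vcf_header_parser.get_vcf_headers(file_path)
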
def test_preprocess_no_conflicts(self):
  with temp_dir.TempDir() as tempdir:
    report_path = filesystems.FileSystems.join(tempdir.get_path(),
                                               PreprocessTest._REPORT_NAME)
    argv = [
        '--input_pattern',
        'gs://gcp-variant-transforms-testfiles/small_tests/valid-4.0.vcf',
        '--report_path',
        report_path,
        '--report_all_conflicts'
    ]
    vcf_to_bq_preprocess.run(argv)
    assert filesystems.FileSystems.exists(report_path)

def test_write_vcf_data_header(self):
  with temp_dir.TempDir() as tempdir:
    file_path = filesystems.FileSystems.join(tempdir.get_path(),
                                             'data_header')
    bq_to_vcf._write_vcf_data_header(
        ['Sample 1', 'Sample 2'],
        ['#CHROM', 'POS', 'ID', 'REF', 'ALT'],
        file_path)
    expected_content = '#CHROM\tPOS\tID\tREF\tALT\tSample 1\tSample 2\n'
    with filesystems.FileSystems.open(file_path) as f:
      content = f.readlines()
      self.assertEqual(content, [expected_content])

def test_pipeline_read_file_headers(self):
  headers = self.lines
  self.lines = testdata_util.get_sample_vcf_file_lines()
  with temp_dir.TempDir() as tempdir:
    filename = tempdir.create_temp_file(suffix='.vcf', lines=self.lines)
    pipeline = TestPipeline()
    pcoll = pipeline | 'ReadHeaders' >> ReadVcfHeaders(filename)
    assert_that(pcoll, equal_to([_get_vcf_header_from_lines(headers)]))
    pipeline.run()

def test_get_mode_small_still_large(self):
  with temp_dir.TempDir() as tempdir:
    filename = tempdir.create_temp_file(lines=self.SAMPLE_LINES)
    args = self._create_mock_args(input_pattern=None, input_file=filename)
    match_result = collections.namedtuple('MatchResult', ['metadata_list'])
    match = match_result([None for _ in range(100)])
    # Pretend each pattern matches 100 files so the accumulated file count
    # pushes the pipeline mode to LARGE despite the small input file.
    with mock.patch.object(FileSystems, 'match', return_value=[match]):
      self.assertEqual(self._get_pipeline_mode(args), PipelineModes.LARGE)

def test_failure_for_conflicting_flags_no_errors_with_file_input(self):
  lines = [
      './gcp_variant_transforms/testing/data/vcf/valid-4.0.vcf\n',
      './gcp_variant_transforms/testing/data/vcf/valid-4.0.vcf\n',
      './gcp_variant_transforms/testing/data/vcf/valid-4.0.vcf\n'
  ]
  with temp_dir.TempDir() as tempdir:
    filename = tempdir.create_temp_file(lines=lines)
    args = self._make_args([
        '--input_file', filename,
        '--representative_header_file', 'gs://some_file'
    ])
    self._options.validate(args)

def test_get_metadata_header_lines(self):
  lines = [
      '##fileformat=VCFv4.2\n',
      '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
      '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n',
      '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
      '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="GQ">\n',
      '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1 Sample2\n',
      '19 1234567 mi1 G T 50 PASS NS=3 GT:GQ:DP 0/1:35:4 0/2:17:2',
  ]
  with temp_dir.TempDir() as tempdir:
    file_path = self._create_temp_vcf_file(lines, tempdir)
    header_lines = vcf_header_parser.get_metadata_header_lines(file_path)
    self.assertEqual(header_lines, lines[:-2])

def test_one_file(self):
  lines = [
      '##fileformat=VCFv4.2\n',
      '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
      '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n',
      '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
      '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="GQ">\n',
      '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1 Sample2\n'
  ]
  with temp_dir.TempDir() as tempdir:
    file_path = self._create_temp_vcf_file(lines, tempdir)
    header_fields = vcf_header_parser.get_vcf_headers(file_path)
    self.assertItemsEqual(['NS', 'AF'], header_fields.infos.keys())
    self.assertItemsEqual(['GT', 'GQ'], header_fields.formats.keys())

def test_validation_failure_for_wrong_pattern_in_input_file(self):
  lines = [
      './gcp_variant_transforms/testing/data/vcf/valid-4.0.vcf\n',
      'non_existent.vcf\n',
      './gcp_variant_transforms/testing/data/vcf/valid-4.0.vcf\n'
  ]
  with temp_dir.TempDir() as tempdir:
    filename = tempdir.create_temp_file(lines=lines)
    with self.assertRaisesRegex(
        ValueError, 'Input pattern .* from .* did not match any files.'):
      pipeline_common._get_all_patterns(input_pattern=None,
                                        input_file=filename)

def test_pipeline_read_file_headers(self):
  with temp_dir.TempDir() as tempdir:
    filename = tempdir.create_temp_file(suffix='.vcf', lines=self.lines)
    pipeline = TestPipeline()
    pcoll = pipeline | 'GetEstimates' >> GetEstimates(filename)
    assert_that(pcoll,
                equal_to([_get_estimate_from_lines(self.lines, filename)]))
    pipeline.run()

def test_invalid_file(self):
  lines = [
      '##fileformat=VCFv4.2\n',
      '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
      '##INFO=<ID=AF,Number=A,Type=Float,Desc\n',
      '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
      '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1 Sample2\n'
  ]
  with temp_dir.TempDir() as tempdir:
    file_path = self._create_temp_vcf_file(lines, tempdir)
    try:
      vcf_header_parser.get_vcf_headers(file_path)
      self.fail('Invalid VCF file must throw an exception.')
    except ValueError:
      pass

def test_preprocess_run_locally(self):
  with temp_dir.TempDir() as tempdir:
    report_path = filesystems.FileSystems.join(tempdir.get_path(),
                                               PreprocessTest._REPORT_NAME)
    resolved_headers_path = filesystems.FileSystems.join(
        tempdir.get_path(), PreprocessTest._RESOLVED_HEADERS_FILE_NAME)
    argv = [
        '--input_pattern',
        'gs://gcp-variant-transforms-testfiles/small_tests/infer-undefined'
        '-header-fields.vcf',
        '--report_all_conflicts',
        '--report_path',
        report_path,
        '--resolved_headers_path',
        resolved_headers_path
    ]
    vcf_to_bq_preprocess.run(argv)
    assert filesystems.FileSystems.exists(report_path)
    assert filesystems.FileSystems.exists(resolved_headers_path)

def test_read_file_pattern(self):
  with temp_dir.TempDir() as tempdir:
    headers_1 = [self.lines[1], self.lines[-1]]
    headers_2 = [self.lines[2], self.lines[3], self.lines[-1]]
    headers_3 = [self.lines[4], self.lines[-1]]

    file_name_1 = tempdir.create_temp_file(suffix='.vcf', lines=headers_1)
    file_name_2 = tempdir.create_temp_file(suffix='.vcf', lines=headers_2)
    file_name_3 = tempdir.create_temp_file(suffix='.vcf', lines=headers_3)

    actual = source_test_utils.read_from_source(
        VcfHeaderSource(os.path.join(tempdir.get_path(), '*.vcf')))
    expected = [_get_vcf_header_from_lines(h, file_name=file_name)
                for h, file_name in [(headers_1, file_name_1),
                                     (headers_2, file_name_2),
                                     (headers_3, file_name_3)]]
    asserts.header_vars_equal(expected)(actual)

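# Unlike the pipeline tests above, which pass `asserts.header_vars_equal`
# to Beam's `assert_that`, this test materializes the source output with
# `source_test_utils.read_from_source` and applies the returned matcher
# callable directly to the resulting list.
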
def _generate_report_and_assert_contents_equal(
    self,
    expected_content,  # type: List[str]
    header_definitions,  # type: merge_header_definitions.VcfHeaderDefinitions
    resolved_headers=None,  # type: VcfHeader
    inferred_headers=None,  # type: VcfHeader
    malformed_records=None  # type: List[vcfio.MalformedVcfRecord]
    ):
  # type: (...) -> None
  with temp_dir.TempDir() as tempdir:
    file_path = FileSystems.join(tempdir.get_path(),
                                 PreprocessReporterTest._REPORT_NAME)
    preprocess_reporter.generate_report(header_definitions,
                                        file_path,
                                        resolved_headers,
                                        inferred_headers,
                                        malformed_records)
    with FileSystems.open(file_path) as f:
      reader = f.readlines()
    self.assertItemsEqual(reader, expected_content)

def test_config_failed_missing_region(self):
  tempdir = temp_dir.TempDir()
  missing_region = [
      '- partition:',
      ' partition_name: "chr01_part1"',
      ' regions:',
      ' - "chr1:0-1,000,000"',
      '- partition:',
      ' partition_name: "all_remaining"',
      ' regions:',
      ' - "residual"',
      '- partition:',
      ' partition_name: "missing_region"',
      ' regions:',
  ]
  with self.assertRaisesRegexp(
      ValueError, 'Each partition must have at least one region.'):
    _ = variant_partition.VariantPartition(
        tempdir.create_temp_file(suffix='.yaml',
                                 lines='\n'.join(missing_region)))