def test_merge_header_definitions_no_conflicting_headers(self):
    """Merges two headers that declare different FORMAT fields.

    'file1' declares FORMAT field NS and 'file2' declares FORMAT field DP,
    so the expected merged definitions hold one entry per field, each
    mapped to the single file that declared it.
    """
    first_file_lines = [
        '##FORMAT=<ID=NS,Number=1,Type=Float,Description="Number samples">\n',
        '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1 Sample2\n',
    ]
    second_file_lines = [
        '##FORMAT=<ID=DP,Number=2,Type=Float,Description="Total Depth">\n',
        '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample3\n',
    ]
    first_reader = vcf.Reader(fsock=iter(first_file_lines))
    second_reader = vcf.Reader(fsock=iter(second_file_lines))
    header_inputs = [
        self._get_vcf_header_from_reader(first_reader, 'file1'),
        self._get_vcf_header_from_reader(second_reader, 'file2'),
    ]

    # Expected result: every FORMAT id keeps its own (Number, Type)
    # definition together with the name of the file it came from.
    expected = VcfHeaderDefinitions()
    expected._formats = {
        'NS': {Definition(1, 'Float'): ['file1']},
        'DP': {Definition(2, 'Float'): ['file2']},
    }

    pipeline = TestPipeline()
    header_pcoll = pipeline | Create(header_inputs)
    merged_definitions = (
        header_pcoll
        | 'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
    assert_that(merged_definitions, equal_to([expected]))
    pipeline.run()
def test_merge_header_definitions_save_five_copies(self):
    """Checks that only five file names are expected per definition.

    Six headers ('file1' .. 'file6') carry the same Float definition of
    INFO field NS, yet the expected result lists only 'file1' through
    'file5' for it. A seventh header ('file7') declares NS as Integer and
    is expected under its own, separate definition.
    """
    float_ns_lines = [
        '##INFO=<ID=NS,Number=1,Type=Float,Description="Number samples">\n',
        '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1 Sample2\n',
    ]
    integer_ns_lines = [
        '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
        '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample3\n',
    ]
    float_reader = vcf.Reader(fsock=iter(float_ns_lines))
    integer_reader = vcf.Reader(fsock=iter(integer_ns_lines))

    # The same reader is reused so that six differently named files share
    # one identical definition.
    headers = [
        self._get_vcf_header_from_reader(float_reader, name)
        for name in ('file1', 'file2', 'file3', 'file4', 'file5', 'file6')
    ]
    headers.append(self._get_vcf_header_from_reader(integer_reader, 'file7'))

    float_definition_files = ['file1', 'file2', 'file3', 'file4', 'file5']
    expected = VcfHeaderDefinitions()
    expected._infos = {
        'NS': {
            Definition(1, 'Float'): float_definition_files,
            Definition(1, 'Integer'): ['file7'],
        }
    }

    pipeline = TestPipeline()
    header_pcoll = pipeline | Create(headers)
    merged_definitions = (
        header_pcoll
        | 'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
    assert_that(merged_definitions, equal_to([expected]))
    pipeline.run()
def run(argv=None):
    # type: (List[str]) -> None
    """Runs preprocess pipeline.

    Logs the command line, parses the arguments, and builds a Beam pipeline
    that reads the headers of all input patterns, merges them, and applies
    `preprocess_reporter.generate_report` with `known_args.report_path`.
    When `known_args.report_all_conflicts` is set, the variants are also
    read (with `allow_malformed_records=True`) so that inferred headers and
    malformed records can be passed to the report as extra side inputs.
    When `known_args.resolved_headers_path` is set, the merged headers are
    written to that path.

    Args:
      argv: Command-line arguments. When falsy, `sys.argv` is used for the
        logged command line only; `argv` itself is handed unchanged to
        `pipeline_common.parse_args`.

    Returns:
      None. The pipeline runs when the `with beam.Pipeline(...)` block exits.
    """
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = pipeline_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    options = pipeline_options.PipelineOptions(pipeline_args)
    all_patterns = known_args.all_patterns
    pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)

    with beam.Pipeline(options=options) as p:
        headers = pipeline_common.read_headers(p, pipeline_mode, all_patterns)
        merged_headers = pipeline_common.get_merged_headers(headers)
        merged_definitions = (
            headers
            | 'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())

        if known_args.report_all_conflicts:
            # A single pattern is read directly; several patterns are first
            # turned into a PCollection and read with ReadAllFromVcf.
            if len(all_patterns) == 1:
                variants = p | 'ReadFromVcf' >> vcfio.ReadFromVcf(
                    all_patterns[0], allow_malformed_records=True)
            else:
                variants = (
                    p
                    | 'InputFilePattern' >> beam.Create(all_patterns)
                    | 'ReadAllFromVcf' >> vcfio.ReadAllFromVcf(
                        allow_malformed_records=True))
            malformed_records = (
                variants | filter_variants.ExtractMalformedVariants())
            # merged_headers is rebound here, so both the report and the
            # optional header write below use the value returned by
            # _get_inferred_headers.
            inferred_headers, merged_headers = _get_inferred_headers(
                variants, merged_headers)
            report_side_inputs = [
                beam.pvalue.AsSingleton(merged_headers),
                beam.pvalue.AsSingleton(inferred_headers),
                beam.pvalue.AsIter(malformed_records),
            ]
        else:
            report_side_inputs = [beam.pvalue.AsSingleton(merged_headers)]

        # One application of the report step; only the side inputs differ
        # between the two modes.
        _ = (merged_definitions
             | 'GenerateConflictsReport' >> beam.ParDo(
                 preprocess_reporter.generate_report,
                 known_args.report_path,
                 *report_side_inputs))

        if known_args.resolved_headers_path:
            pipeline_common.write_headers(merged_headers,
                                          known_args.resolved_headers_path)
def test_merge_header_definitions_one_header(self):
    """Merges a single header declaring one INFO field.

    The only input ('file1') declares INFO field NS as Number=1,
    Type=Integer, and the expected merged definitions contain exactly that
    definition mapped to 'file1'.
    """
    header_lines = [
        '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
        '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1 Sample2\n',
    ]
    single_header = self._get_vcf_header_from_reader(
        vcf.Reader(fsock=iter(header_lines)), 'file1')

    expected = VcfHeaderDefinitions()
    expected._infos = {'NS': {Definition(1, 'Integer'): ['file1']}}

    pipeline = TestPipeline()
    header_pcoll = pipeline | Create([single_header])
    merged_definitions = (
        header_pcoll
        | 'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
    assert_that(merged_definitions, equal_to([expected]))
    pipeline.run()