def test_merge_header_definitions_no_conflicting_headers(self):
    # Two inputs declare different FORMAT ids (NS in 'file1', DP in 'file2').
    # The merged result is expected to hold one definition per id, each
    # attributed to the single file that declared it.
    ns_header_lines = [
        '##FORMAT=<ID=NS,Number=1,Type=Float,Description="Number samples">\n',
        '#CHROM  POS ID  REF ALT QUAL  FILTER  INFO  FORMAT  Sample1 Sample2\n'
    ]
    dp_header_lines = [
        '##FORMAT=<ID=DP,Number=2,Type=Float,Description="Total Depth">\n',
        '#CHROM  POS ID  REF ALT QUAL  FILTER  INFO  FORMAT  Sample3\n'
    ]

    ns_reader = vcf.Reader(fsock=iter(ns_header_lines))
    dp_reader = vcf.Reader(fsock=iter(dp_header_lines))
    all_headers = [
        self._get_vcf_header_from_reader(ns_reader, 'file1'),
        self._get_vcf_header_from_reader(dp_reader, 'file2'),
    ]

    # Expected value is built through the private `_formats` attribute.
    expected_definitions = VcfHeaderDefinitions()
    expected_definitions._formats = {
        'NS': {Definition(1, 'Float'): ['file1']},
        'DP': {Definition(2, 'Float'): ['file2']},
    }

    test_pipeline = TestPipeline()
    merge_step = (
        'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
    actual_definitions = test_pipeline | Create(all_headers) | merge_step

    # The assertion is attached to the graph and evaluated by run().
    assert_that(actual_definitions, equal_to([expected_definitions]))
    test_pipeline.run()
  def test_merge_header_definitions_save_five_copies(self):
    # Six inputs ('file1'..'file6') carry the same Float definition of INFO
    # id NS, and 'file7' carries an Integer definition of it. The expected
    # result lists only the first five file names for the Float definition.
    float_ns_lines = [
        '##INFO=<ID=NS,Number=1,Type=Float,Description="Number samples">\n',
        '#CHROM  POS ID  REF ALT QUAL  FILTER  INFO  FORMAT  Sample1 Sample2\n'
    ]
    integer_ns_lines = [
        '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
        '#CHROM  POS ID  REF ALT QUAL  FILTER  INFO  FORMAT  Sample3\n'
    ]

    float_reader = vcf.Reader(fsock=iter(float_ns_lines))
    integer_reader = vcf.Reader(fsock=iter(integer_ns_lines))

    # The same reader is reused for every Float header; only the file name
    # passed to the helper changes.
    float_file_names = ['file1', 'file2', 'file3', 'file4', 'file5', 'file6']
    all_headers = [
        self._get_vcf_header_from_reader(float_reader, name)
        for name in float_file_names
    ]
    all_headers.append(
        self._get_vcf_header_from_reader(integer_reader, 'file7'))

    expected_definitions = VcfHeaderDefinitions()
    expected_definitions._infos = {
        'NS': {
            Definition(1, 'Float'):
                ['file1', 'file2', 'file3', 'file4', 'file5'],
            Definition(1, 'Integer'): ['file7'],
        }
    }

    test_pipeline = TestPipeline()
    merge_step = (
        'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
    actual_definitions = test_pipeline | Create(all_headers) | merge_step

    # The assertion is attached to the graph and evaluated by run().
    assert_that(actual_definitions, equal_to([expected_definitions]))
    test_pipeline.run()
def run(argv=None):
    # type: (List[str]) -> None
    """Runs preprocess pipeline.

    Reads the headers matched by ``known_args.all_patterns``, merges them,
    and applies ``preprocess_reporter.generate_report`` to the merged header
    definitions with ``known_args.report_path``. When
    ``known_args.report_all_conflicts`` is set, the variants are also read
    (malformed records allowed) so that inferred headers and malformed
    records can be passed to the report as additional side inputs. When
    ``known_args.resolved_headers_path`` is set, the merged headers are
    written to that path.

    Args:
      argv: Command line arguments handed to ``pipeline_common.parse_args``.
        When it is None or empty, ``sys.argv`` is used for the logged
        command line only.
    """
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = pipeline_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    options = pipeline_options.PipelineOptions(pipeline_args)
    all_patterns = known_args.all_patterns
    pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)

    with beam.Pipeline(options=options) as p:
        headers = pipeline_common.read_headers(p, pipeline_mode, all_patterns)
        merged_headers = pipeline_common.get_merged_headers(headers)
        merged_definitions = (
            headers
            |
            'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())

        # Side inputs that only exist when every conflict is reported.
        extra_side_inputs = []
        if known_args.report_all_conflicts:
            if len(all_patterns) == 1:
                variants = p | 'ReadFromVcf' >> vcfio.ReadFromVcf(
                    all_patterns[0], allow_malformed_records=True)
            else:
                variants = (p
                            | 'InputFilePattern' >> beam.Create(all_patterns)
                            | 'ReadAllFromVcf' >>
                            vcfio.ReadAllFromVcf(allow_malformed_records=True))

            malformed_records = (
                variants | filter_variants.ExtractMalformedVariants())
            # `merged_headers` is rebound here, so the report and the
            # optional header output below both use the value returned by
            # `_get_inferred_headers`.
            inferred_headers, merged_headers = _get_inferred_headers(
                variants, merged_headers)
            extra_side_inputs = [
                beam.pvalue.AsSingleton(inferred_headers),
                beam.pvalue.AsIter(malformed_records),
            ]

        # One report stage serves both modes; positional order is report
        # path, merged headers, then the optional extra side inputs.
        _ = (merged_definitions
             | 'GenerateConflictsReport' >> beam.ParDo(
                 preprocess_reporter.generate_report,
                 known_args.report_path,
                 beam.pvalue.AsSingleton(merged_headers),
                 *extra_side_inputs))

        if known_args.resolved_headers_path:
            pipeline_common.write_headers(merged_headers,
                                          known_args.resolved_headers_path)
  def test_merge_header_definitions_one_header(self):
    # A single input declaring INFO id NS is expected to produce exactly one
    # definition for NS, attributed to 'file1'.
    header_lines = [
        '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number samples">\n',
        '#CHROM  POS ID  REF ALT QUAL  FILTER  INFO  FORMAT  Sample1 Sample2\n'
    ]

    reader = vcf.Reader(fsock=iter(header_lines))
    single_header = self._get_vcf_header_from_reader(reader, 'file1')

    expected_definitions = VcfHeaderDefinitions()
    expected_definitions._infos = {
        'NS': {Definition(1, 'Integer'): ['file1']},
    }

    test_pipeline = TestPipeline()
    merge_step = (
        'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
    actual_definitions = test_pipeline | Create([single_header]) | merge_step

    # The assertion is attached to the graph and evaluated by run().
    assert_that(actual_definitions, equal_to([expected_definitions]))
    test_pipeline.run()