def test_header_fields_inferred_one_variant(self):
    """Checks header inference for a single variant with no defined headers.

    With `defined_headers=None`, the inferred `VcfHeader` is expected to hold
    the INFO and FORMAT definitions listed below for the first sample variant.
    """
    info_by_key = {
        'IS': Info('IS', 1, 'String', '', '', ''),
        'ISI': Info('ISI', 1, 'Integer', '', '', ''),
        'ISF': Info('ISF', 1, 'Float', '', '', ''),
        'IF': Info('IF', 1, 'Float', '', '', ''),
        'IB': Info('IB', 0, 'Flag', '', '', ''),
        'IA': Info('IA', None, 'Integer', '', '', ''),
    }
    format_by_key = {
        'FI': Format('FI', 1, 'Integer', ''),
        'FU': Format('FU', None, 'Float', ''),
    }
    expected_header = vcf_header_io.VcfHeader(infos=info_by_key,
                                              formats=format_by_key)

    with TestPipeline() as pipeline:
        sample = self._get_sample_variant_1()
        variants = pipeline | Create([sample])
        inferred_headers = (
            variants
            | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
                defined_headers=None, infer_headers=True))
        assert_that(inferred_headers, equal_to([expected_header]))
        pipeline.run()
def test_bigquery_field_name_sanitize(self):
    """Checks that header keys are turned into sanitized BigQuery field names.

    The expectations encode the mapping under test: keys starting with `_` or
    a digit are expected with a `field_` prefix (`_` -> `field__`,
    `0a` -> `field_0a`), and the characters `-`, `*` and `^` are expected to
    become `_` (`A-B*C` -> `A_B_C`, `a^b` -> `a_b`). Keys that are already
    valid (`OK_info_09`, `OK_format_09`) must pass through unchanged.
    """
    infos = OrderedDict([
        ('_', Info('_', 1, 'String', 'desc', 'src', 'v')),
        ('_A', Info('_A', 1, 'String', 'desc', 'src', 'v')),
        ('0a', Info('0a', 1, 'String', 'desc', 'src', 'v')),
        ('A-B*C', Info('A-B*C', 1, 'String', 'desc', 'src', 'v')),
        # Declared with the per-alternate count; expected under `alt_fields`.
        ('I-A', Info('I-A', field_counts['A'], 'Float', 'desc', 'src', 'v')),
        # INFO entries are `Info` records (with source and version), like
        # every other entry of this mapping; `Format` is for FORMAT fields.
        ('OK_info_09', Info('OK_info_09', 1, 'String', 'desc', 'src', 'v')),
    ])
    formats = OrderedDict([
        ('a^b', Format('a^b', 1, 'String', 'desc')),
        ('OK_format_09', Format('OK_format_09', 1, 'String', 'desc')),
    ])
    header_fields = vcf_header_parser.HeaderFields(infos, formats)

    expected_fields = self._generate_expected_fields(
        alt_fields=['I_A'],
        call_fields=['a_b', 'OK_format_09'],
        info_fields=['field__', 'field__A', 'field_0a', 'A_B_C',
                     'OK_info_09'])
    actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
        header_fields,
        processed_variant.ProcessedVariantFactory(header_fields))
    self._assert_fields_equal(expected_fields, actual_schema)
def _get_sample_header_fields(self, with_annotation=False):
    """Builds a small `VcfHeader` fixture with INFO and FORMAT fields.

    Args:
      with_annotation: Either a bool or a list of (key, `Info`) tuples. Any
        truthy value adds a `CSQ` annotation INFO field; when a list is
        given, its entries are added as further INFO fields after `CSQ`.

    Returns:
      A `vcf_header_io.VcfHeader` holding the sample definitions.
    """
    # NOTE(review): the 'IS' and 'IB' entries carry the id 'I1', 'ISI' uses
    # the type 'Int' and the 'IB' flag has num 1 -- kept exactly as found;
    # confirm whether these are intentional.
    info_entries = [
        ('IS', Info('I1', 1, 'String', 'desc', 'src', 'v')),
        ('ISI', Info('ISI', 1, 'Int', 'desc', 'src', 'v')),
        ('ISF', Info('ISF', 1, 'Float', 'desc', 'src', 'v')),
        ('IF', Info('IF', 1, 'Float', 'desc', 'src', 'v')),
        ('IB', Info('I1', 1, 'Flag', 'desc', 'src', 'v')),
        ('IA', Info('IA', field_counts['A'], 'Integer', 'desc', 'src', 'v')),
    ]
    format_entries = [
        ('FS', Format('FS', 1, 'String', 'desc')),
        ('FI', Format('FI', 2, 'Integer', 'desc')),
        ('FU', Format('FU', field_counts['.'], 'Float', 'desc')),
        ('GT', Format('GT', 2, 'Integer', 'Special GT key')),
        ('PS', Format('PS', 1, 'Integer', 'Special PS key')),
    ]

    infos = OrderedDict(info_entries)
    if with_annotation:
        infos['CSQ'] = Info(
            'CSQ',
            field_counts['.'],
            'String',
            'Annotations from VEP. Format: Allele|Gene|Position|Score',
            'src',
            'v')
        if isinstance(with_annotation, list):
            # Extra annotation fields keep the order in which they were given.
            infos.update(with_annotation)

    return vcf_header_io.VcfHeader(infos=infos,
                                   formats=OrderedDict(format_entries))
def test_header_fields_inferred_from_two_variants(self):
    """Checks header inference across two variants with no defined headers.

    The expected header is compared with
    `asserts.header_fields_equal_ignore_order`, so field order is irrelevant.
    """
    info_by_key = {
        'IS': Info('IS', 1, 'String', '', '', ''),
        'ISI': Info('ISI', 1, 'Integer', '', '', ''),
        'ISF': Info('ISF', 1, 'Float', '', '', ''),
        'IF': Info('IF', 1, 'Float', '', '', ''),
        'IB': Info('IB', 0, 'Flag', '', '', ''),
        'IA': Info('IA', None, 'Float', '', '', ''),
        'IS_2': Info('IS_2', 1, 'String', '', '', ''),
    }
    format_by_key = {
        'FI': Format('FI', 1, 'Integer', ''),
        'FU': Format('FU', None, 'Float', ''),
        'FI_2': Format('FI_2', 1, 'Integer', ''),
    }
    expected_header = vcf_header_io.VcfHeader(infos=info_by_key,
                                              formats=format_by_key)

    with TestPipeline() as pipeline:
        first = self._get_sample_variant_1()
        second = self._get_sample_variant_2()
        variants = pipeline | Create([first, second])
        inferred_headers = (
            variants
            | 'InferUndefinedHeaderFields' >>
            infer_undefined_headers.InferUndefinedHeaderFields(
                defined_headers=None))
        assert_that(
            inferred_headers,
            asserts.header_fields_equal_ignore_order([expected_header]))
        pipeline.run()
def _get_sample_header_fields(self):
    """Builds a small `VcfHeader` fixture with three INFO and five FORMAT fields.

    Returns:
      A `vcf_header_io.VcfHeader` holding the sample definitions.
    """
    # NOTE(review): the 'IS' and 'IF' entries both carry the id 'I1' -- kept
    # exactly as found; confirm whether this is intentional.
    info_entries = [
        ('IS', Info('I1', 1, 'String', 'desc', 'src', 'v')),
        ('IF', Info('I1', 1, 'Flag', 'desc', 'src', 'v')),
        ('IA', Info('IA', field_counts['A'], 'Integer', 'desc', 'src', 'v')),
    ]
    format_entries = [
        ('FS', Format('FS', 1, 'String', 'desc')),
        ('FI', Format('FI', 2, 'Integer', 'desc')),
        ('FU', Format('FU', field_counts['.'], 'Float', 'desc')),
        ('GT', Format('GT', 2, 'Integer', 'Special GT key')),
        ('PS', Format('PS', 1, 'Integer', 'Special PS key')),
    ]
    return vcf_header_io.VcfHeader(infos=OrderedDict(info_entries),
                                   formats=OrderedDict(format_entries))
def test_infer_format_fields_no_conflicts(self):
    """Checks that nothing is inferred when the header raises no conflicts.

    The FORMAT definitions below are passed as the defined header for the
    first sample variant, and the inferred result is expected to be empty.
    """
    defined_formats = OrderedDict([
        ('FS', Format('FS', 1, 'String', 'desc')),
        ('FI', Format('FI', 2, 'Integer', 'desc')),
        ('FU', Format('FU', field_counts['.'], 'Float', 'desc')),
        ('GT', Format('GT', 2, 'Integer', 'Special GT key')),
        ('PS', Format('PS', 1, 'Integer', 'Special PS key')),
    ])
    sample = self._get_sample_variant_1()
    defined_header = vcf_header_io.VcfHeader(formats=defined_formats)

    inferrer = infer_headers._InferHeaderFields()
    inferred = inferrer._infer_format_fields(sample, defined_header)

    self.assertEqual({}, inferred)
def test_report_conflicted_and_inferred_headers(self):
    """Checks a report containing both a header conflict and an inferred field.

    INFO `NS` is defined as Float in `file1` and Integer in `file2` and is
    resolved to Float; FORMAT `DP` is supplied as an inferred header. The
    report is expected to contain a conflicts section followed by an
    undefined-headers section.
    """
    delimiter = preprocess_reporter._DELIMITER

    header_definitions = merge_header_definitions.VcfHeaderDefinitions()
    header_definitions._infos = {
        'NS': {
            Definition(1, 'Float'): ['file1'],
            Definition(1, 'Integer'): ['file2'],
        }
    }

    formats = OrderedDict([('DP', Format('DP', 2, 'Float', 'Total Depth'))])
    infos = OrderedDict(
        [('NS', Info('NS', 1, 'Float', 'Number samples', None, None))])
    resolved_headers = VcfHeader(infos=infos, formats=formats)
    inferred_headers = VcfHeader(formats=formats)

    first_conflict_row = delimiter.join(
        ['NS', 'INFO', 'num=1 type=Float', 'file1', 'num=1 type=Float\n'])
    second_conflict_row = delimiter.join(
        [' ', ' ', 'num=1 type=Integer', 'file2', ' \n'])
    undefined_row = delimiter.join(['DP', 'FORMAT', 'num=2 type=Float\n'])

    expected = [
        preprocess_reporter._InconsistencyType.HEADER_CONFLICTS + '\n',
        preprocess_reporter._HeaderLine.CONFLICTS_HEADER + '\n',
        first_conflict_row,
        second_conflict_row,
        '\n',
        preprocess_reporter._InconsistencyType.UNDEFINED_HEADERS + '\n',
        preprocess_reporter._HeaderLine.UNDEFINED_FIELD_HEADER + '\n',
        undefined_row,
        '\n',
    ]
    self._generate_report_and_assert_contents_equal(
        expected, header_definitions, resolved_headers, inferred_headers)
def _infer_mismatched_format_field(self,
                                   field_key,  # type: str
                                   field_value,  # type: Any
                                   defined_header  # type: Dict
                                  ):
    # type: (...) -> Optional[Format]
    """Returns a corrected FORMAT definition when the value mismatches it.

    The type correction is delegated to `self._get_corrected_type`. The
    mismatch this method is documented to handle is a field defined as
    `Integer` whose provided value is a float, in which case the type is
    corrected to `Float`.

    Args:
      field_key: the format field key.
      field_value: the value of the field key given in the variant.
      defined_header: The definition of `field_key` in the header.

    Returns:
      A `Format` that keeps the defined number and description but carries
      the corrected type, or `None` when the corrected type equals the
      defined one.
    """
    defined_type = defined_header.get(_HeaderKeyConstants.TYPE)
    corrected_type = self._get_corrected_type(defined_type, field_value)
    if corrected_type == defined_type:
        # The header definition already fits the value.
        return None
    return Format(field_key,
                  defined_header.get(_HeaderKeyConstants.NUM),
                  corrected_type,
                  defined_header.get(_HeaderKeyConstants.DESC))
def _infer_undefined_format_fields(self, variant, defined_headers):
    """Returns format fields not defined in the headers.

    Only the first call of `variant` is inspected; the loop stops after it
    because all other calls are taken to have the same FORMAT.

    Args:
      variant (:class:`vcfio.Variant`): variant obj.
      defined_headers (:class:`vcf_header_io.VcfHeader`): header fields
        defined in header section of VCF files. A falsy value means every
        format key is treated as undefined.

    Returns:
      A dict of (format_key(str), :class:`Format`) for any format key in
      `variant` that is not defined in the header.

    Raises:
      ValueError: if an undefined format key is encountered a second time.
    """
    formats = {}
    for call in variant.calls:
        # `items()` exists on both Python 2 and Python 3 mappings, whereas
        # `iteritems()` raises AttributeError on Python 3.
        for format_key, format_value in call.info.items():
            if not defined_headers or format_key not in defined_headers.formats:
                if format_key in formats:
                    raise ValueError(
                        'Invalid VCF file. Duplicate FORMAT field in variant {}'
                        .format(variant))
                formats[format_key] = Format(
                    format_key,
                    self._get_field_count(format_value),
                    self._get_field_type(format_value),
                    '')  # NO_DESCRIPTION
        # No point in proceeding. All other calls have the same FORMAT.
        break
    return formats
def test_pipeline(self):
    """Runs header inference in a pipeline against a defined header.

    Two variants are fed in: by their fixture names, one has an `IA` INFO
    cardinality mismatch and the other a float value for FORMAT `FI`. The
    inferred header is expected to contain corrected `IA` and `FI`
    definitions plus the fields `IF` and `FU`, which are absent from the
    defined header.
    """
    infos = {
        'IS': Info('IS', 1, 'String', '', '', ''),
        'ISI': Info('ISI', 1, 'Integer', '', '', ''),
        'ISF': Info('ISF', 1, 'Float', '', '', ''),
        'IB': Info('IB', 0, 'Flag', '', '', ''),
        'IA': Info('IA', -1, 'Integer', '', '', ''),
    }
    formats = OrderedDict([
        ('FS', Format('FS', 1, 'String', 'desc')),
        ('FI', Format('FI', 2, 'Integer', 'desc')),
        ('GT', Format('GT', 2, 'Integer', 'Special GT key')),
        ('PS', Format('PS', 1, 'Integer', 'Special PS key')),
    ])

    expected_header = vcf_header_io.VcfHeader(
        infos={
            'IA': Info('IA', None, 'Float', '', '', ''),
            'IF': Info('IF', 1, 'Float', '', '', ''),
        },
        formats={
            'FI': Format('FI', 2, 'Float', 'desc'),
            'FU': Format('FU', None, 'Float', ''),
        })

    with TestPipeline() as pipeline:
        info_mismatch = self._get_sample_variant_info_ia_cardinality_mismatch()
        format_mismatch = self._get_sample_variant_format_fi_float_value()
        variants = pipeline | Create([info_mismatch, format_mismatch])
        inferred_headers = (
            variants
            | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
                defined_headers=vcf_header_io.VcfHeader(infos=infos,
                                                        formats=formats),
                allow_incompatible_records=True))
        assert_that(
            inferred_headers,
            asserts.header_fields_equal_ignore_order([expected_header]))
        pipeline.run()
def test_info_and_format_header_fields(self):
    """Checks the schema generated from plain INFO and FORMAT header fields.

    `IA` is declared with the per-alternate count and is expected among the
    alternate fields; `I1` is expected as an info field and `F1`, `F2` and
    `FU` as call fields.
    """
    info_entries = [
        ('I1', Info('I1', 1, 'String', 'desc', 'src', 'v')),
        ('IA', Info('IA', field_counts['A'], 'Integer', 'desc', 'src', 'v')),
    ]
    # GT and PS should not be set as they're already included in special
    # 'genotype' and 'phaseset' fields.
    format_entries = [
        ('F1', Format('F1', 1, 'String', 'desc')),
        ('F2', Format('F2', 2, 'Integer', 'desc')),
        ('FU', Format('FU', field_counts['.'], 'Float', 'desc')),
        ('GT', Format('GT', 2, 'Integer', 'Special GT key')),
        ('PS', Format('PS', 1, 'Integer', 'Special PS key')),
    ]
    header_fields = vcf_header_parser.HeaderFields(
        OrderedDict(info_entries), OrderedDict(format_entries))

    expected_fields = self._generate_expected_fields(
        alt_fields=['IA'],
        call_fields=['F1', 'F2', 'FU'],
        info_fields=['I1'])
    actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
        header_fields,
        processed_variant.ProcessedVariantFactory(header_fields))
    self._assert_fields_equal(expected_fields, actual_schema)
def test_report_no_conflicts(self):
    """Checks the report when each header field has a single definition.

    `NS` has exactly one INFO definition and one FORMAT definition, so the
    report is expected to consist of the no-conflicts message only.
    """
    header_definitions = merge_header_definitions.VcfHeaderDefinitions()
    header_definitions._infos = {'NS': {Definition(1, 'Float'): ['file1']}}
    header_definitions._formats = {'NS': {Definition(1, 'Float'): ['file2']}}

    resolved_headers = VcfHeader(
        infos=OrderedDict(
            [('NS', Info('NS', 1, 'Integer', 'Number samples', None, None))]),
        formats=OrderedDict(
            [('NS', Format('NS', 1, 'Float', 'Number samples'))]))

    expected = ['No Header Conflicts Found.\n', '\n']
    self._generate_report_and_assert_contents_equal(
        expected, header_definitions, resolved_headers)
def infer_format_fields(variant, defined_headers):
    # type: (vcfio.Variant, vcf_header_io.VcfHeader) -> Dict[str, Format]
    """Returns inferred format fields.

    Two types of format fields are inferred:
    - The format fields are undefined in the headers.
    - The format definition provided by the headers does not match the field
      values.

    Undefined fields are collected from the first call only, because all
    other calls are taken to have the same FORMAT. Mismatches against the
    defined headers are checked for every call. A warning is logged for each
    inferred or corrected field.

    Args:
      variant: variant object
      defined_headers: header fields defined in header section of VCF files.
        A falsy value means every format key is treated as undefined.

    Returns:
      A dict of (format_key, `Format`) for any format key in `variant` that is
      not defined in the header or the definition mismatches the field values.

    Raises:
      ValueError: if an undefined format key is encountered a second time.
    """
    formats = {}
    for call in variant.calls:
        # `items()` exists on both Python 2 and Python 3 mappings, whereas
        # `iteritems()` raises AttributeError on Python 3.
        for format_key, format_value in call.info.items():
            if not defined_headers or format_key not in defined_headers.formats:
                if format_key in formats:
                    raise ValueError(
                        'Duplicate FORMAT field "{}" in variant "{}"'.format(
                            format_key, variant))
                logging.warning('Undefined FORMAT field "%s" in variant "%s"',
                                format_key, str(variant))
                formats[format_key] = Format(format_key,
                                             _get_field_count(format_value),
                                             _get_field_type(format_value),
                                             '')  # NO_DESCRIPTION
        # No point in proceeding. All other calls have the same FORMAT.
        break

    for call in variant.calls:
        for format_key, format_value in call.info.items():
            if defined_headers and format_key in defined_headers.formats:
                defined_header = defined_headers.formats.get(format_key)
                corrected_format = _infer_mismatched_format_field(
                    format_key, format_value, defined_header)
                if corrected_format:
                    logging.warning(
                        'Incorrect FORMAT field "%s". Defined as "type=%s,num=%s", '
                        'got "%s" in variant "%s"',
                        format_key,
                        defined_header.get(_HeaderKeyConstants.TYPE),
                        str(defined_header.get(_HeaderKeyConstants.NUM)),
                        str(format_value),
                        str(variant))
                    formats[format_key] = corrected_format
    return formats
def test_report_inferred_headers_only(self):
    """Checks the report when only inferred headers are supplied.

    No definitions are registered, so the report is expected to start with
    the no-conflicts message and then list the inferred FORMAT field `DP` in
    the undefined-headers section.
    """
    delimiter = preprocess_reporter._DELIMITER
    header_definitions = merge_header_definitions.VcfHeaderDefinitions()
    inferred_headers = VcfHeader(
        formats=OrderedDict([('DP', Format('DP', 2, 'Float', 'Total Depth'))]))

    undefined_row = delimiter.join(['DP', 'FORMAT', 'num=2 type=Float\n'])
    expected = [
        'No Header Conflicts Found.\n',
        '\n',
        preprocess_reporter._InconsistencyType.UNDEFINED_HEADERS + '\n',
        preprocess_reporter._HeaderLine.UNDEFINED_FIELD_HEADER + '\n',
        undefined_row,
        '\n',
    ]
    self._generate_report_and_assert_contents_equal(
        expected, header_definitions, inferred_headers=inferred_headers)
def test_variant_merger_modify_schema(self):
    """Checks that a variant merge strategy can add to the generated schema.

    The schema is generated with `_DummyVariantMergeStrategy` as the
    `variant_merger`, and the info field `ADDED_BY_MERGER` is expected next
    to the fields coming from the header.
    """
    info_entries = [
        ('I1', Info('I1', 1, 'String', 'desc', 'src', 'v')),
        ('IA', Info('IA', field_counts['A'], 'Integer', 'desc', 'src', 'v')),
    ]
    format_entries = [('F1', Format('F1', 1, 'String', 'desc'))]
    header_fields = vcf_header_parser.HeaderFields(
        OrderedDict(info_entries), OrderedDict(format_entries))

    expected_fields = self._generate_expected_fields(
        alt_fields=['IA'],
        call_fields=['F1'],
        info_fields=['I1', 'ADDED_BY_MERGER'])
    actual_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
        header_fields,
        processed_variant.ProcessedVariantFactory(header_fields),
        variant_merger=_DummyVariantMergeStrategy())
    self._assert_fields_equal(expected_fields, actual_schema)
def test_infer_format_fields_combined_conflicts(self):
    """Checks inference when a mismatch and an undefined field occur together.

    `FI` is defined as Integer and is expected back as Float, keeping its
    number and description; `FU` is missing from the defined header and is
    expected to be inferred with an empty description.
    """
    defined_formats = OrderedDict([
        ('FS', Format('FS', 1, 'String', 'desc')),
        ('FI', Format('FI', 2, 'Integer', 'desc')),
        ('GT', Format('GT', 2, 'Integer', 'Special GT key')),
        ('PS', Format('PS', 1, 'Integer', 'Special PS key')),
    ])
    expected_formats = {
        'FI': Format('FI', 2, 'Float', 'desc'),
        'FU': Format('FU', field_counts['.'], 'Float', ''),
    }

    sample = self._get_sample_variant_format_fi_float_value()
    inferrer = infer_headers._InferHeaderFields()
    inferred_formats = inferrer._infer_format_fields(
        sample, vcf_header_io.VcfHeader(formats=defined_formats))

    self.assertEqual(expected_formats, inferred_formats)
def test_infer_mismatched_format_field(self):
    """Checks the correction of a single mismatched FORMAT definition.

    The `FI` value of the variant's first call is checked against the
    Integer definition of `FI`; the corrected definition is expected to be
    Float with the same number and description.
    """
    defined_formats = OrderedDict([
        ('FS', Format('FS', 1, 'String', 'desc')),
        ('FI', Format('FI', 2, 'Integer', 'desc')),
        ('FU', Format('FU', field_counts['.'], 'Float', 'desc')),
        ('GT', Format('GT', 2, 'Integer', 'Special GT key')),
        ('PS', Format('PS', 1, 'Integer', 'Special PS key')),
    ])
    sample = self._get_sample_variant_format_fi_float_value()

    inferrer = infer_headers._InferHeaderFields()
    corrected_format = inferrer._infer_mismatched_format_field(
        'FI',
        sample.calls[0].info.get('FI'),
        vcf_header_io.VcfHeader(formats=defined_formats).formats.get('FI'))

    self.assertEqual(Format('FI', 2, 'Float', 'desc'), corrected_format)
def test_defined_fields_filtered_two_variants(self):
    """Checks that fields already defined in the header are not inferred.

    The sample header is handed to the transform as a singleton side input,
    and only `IS_2` and `FI_2` are expected in the inferred header.
    """
    # Only INFO and FORMAT in the first variants are already defined in the
    # header section of the VCF files.
    expected_header = vcf_header_io.VcfHeader(
        infos={'IS_2': Info('IS_2', 1, 'String', '', '', '')},
        formats={'FI_2': Format('FI_2', 1, 'String', '')})

    with TestPipeline() as pipeline:
        defined_header = self._get_sample_header_fields()
        header_side_input = pipeline | 'vcf_header' >> Create([defined_header])
        first = self._get_sample_variant_1()
        second = self._get_sample_variant_2()
        variants = pipeline | Create([first, second])
        inferred_headers = (
            variants
            | 'InferUndefinedHeaderFields' >>
            infer_undefined_headers.InferUndefinedHeaderFields(
                pvalue.AsSingleton(header_side_input)))
        assert_that(inferred_headers, equal_to([expected_header]))
        pipeline.run()