def test_vcf_header_to_schema_to_vcf_header(self):
    """Converts a header to a schema and back, expecting an equal header."""
    info_fields = OrderedDict()
    info_fields['I1'] = createInfo('I1', '.', 'String', 'desc', None, None)
    info_fields['IA'] = createInfo('IA', '.', 'Integer', 'desc', None, None)

    format_fields = OrderedDict()
    format_fields['F1'] = createFormat('F1', '.', 'String', 'desc')
    format_fields['F2'] = createFormat('F2', '.', 'Integer', 'desc')
    format_fields['FU'] = createFormat('FU', '.', 'Float', 'desc')

    original_header = vcf_header_io.VcfHeader(
        infos=info_fields, formats=format_fields)

    # Header -> schema.
    variant_factory = processed_variant.ProcessedVariantFactory(
        original_header)
    schema = schema_converter.generate_schema_from_header_fields(
        original_header, variant_factory)

    # Schema -> header; the round trip must reproduce the original.
    reconstructed_header = (
        schema_converter.generate_header_fields_from_schema(schema))
    self.assertEqual(original_header, reconstructed_header)
def test_generate_header_fields_from_schema_schema_compatibility(self):
    """Checks the `allow_incompatible_schema` switch of the converter.

    The single-field schema below makes the converter raise `ValueError` by
    default; with `allow_incompatible_schema=True` a header is produced
    instead.
    """
    schema_conflict = bigquery.TableSchema()
    aa_field = bigquery.TableFieldSchema(
        name='AA',
        type=bigquery_util.TableFieldConstants.TYPE_INTEGER,
        mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
        description='desc')
    schema_conflict.fields.append(aa_field)

    # Default behavior: the conversion is rejected.
    self.assertRaises(
        ValueError,
        schema_converter.generate_header_fields_from_schema,
        schema_conflict)

    # Opt-in behavior: the field is converted as declared in the schema.
    header = schema_converter.generate_header_fields_from_schema(
        schema_conflict, allow_incompatible_schema=True)

    expected_infos = OrderedDict()
    expected_infos['AA'] = createInfo('AA', 1, 'Integer', 'desc', None, None)
    expected_header = vcf_header_io.VcfHeader(
        infos=expected_infos, formats=OrderedDict())
    self.assertEqual(header, expected_header)
def test_generate_header_fields_from_schema_with_annotation(self):
    """Checks the header generated from the annotated sample table schema."""
    sample_schema = bigquery_schema_util.get_sample_table_schema(
        with_annotation_fields=True)
    header = schema_converter.generate_header_fields_from_schema(
        sample_schema)

    expected_infos = OrderedDict()
    expected_infos['AF'] = createInfo('AF', 'A', 'Float', 'desc', None, None)
    expected_infos['CSQ'] = createInfo(
        'CSQ', '.', 'String', 'desc Format: Consequence|IMPACT', None, None)
    expected_infos['AA'] = createInfo('AA', 1, 'String', 'desc', None, None)
    expected_infos['IFR'] = createInfo(
        'IFR', '.', 'Float', 'desc', None, None)
    expected_infos['IS'] = createInfo('IS', 1, 'String', 'desc', None, None)

    expected_formats = OrderedDict()
    expected_formats['FB'] = createFormat('FB', 1, 'String', 'desc')
    expected_formats['GQ'] = createFormat('GQ', 1, 'Integer', 'desc')

    expected_header = vcf_header_io.VcfHeader(
        infos=expected_infos, formats=expected_formats)
    self.assertEqual(header, expected_header)
def test_combine_pipeline(self):
    """Checks that `MergeHeaders` in a pipeline matches a manual merge.

    The expected header is built by merging both sample headers into an
    empty `VcfHeader` with a `HeaderMerger`; the same two headers are then
    fed through the `MergeHeaders` transform and the outputs are compared.
    """
    headers_1 = self._get_header_from_lines(FILE_1_LINES)
    headers_2 = self._get_header_from_lines(FILE_2_LINES)

    # TODO(nmousavi): Either use TestPipeline or combiner_fn.* everywhere.
    # After moving out _HeaderMerger to its file, it makes sense to use
    # TestPipeline everywhere.
    header_merger = HeaderMerger(
        vcf_field_conflict_resolver.FieldConflictResolver(
            split_alternate_allele_info_fields=True))
    expected = vcf_header_io.VcfHeader()
    header_merger.merge(expected, headers_1)
    header_merger.merge(expected, headers_2)

    pipeline = TestPipeline()
    merged_headers = (
        pipeline
        | Create([headers_1, headers_2])
        | 'MergeHeaders' >> merge_headers.MergeHeaders())
    assert_that(merged_headers, equal_to([expected]))
    # `assert_that` only attaches a verification step to the pipeline graph;
    # nothing is checked until the pipeline actually runs. Without this call
    # the test would pass regardless of what `MergeHeaders` produces.
    pipeline.run()
def test_infer_annotation_types_with_missing(self):
    """Checks annotation type inference when some CSQ values are empty.

    Also checks that an empty CSQ list yields an empty `VcfHeader`.
    """
    header = self._get_sample_header_fields(with_annotation=True)
    variant = self._get_sample_variant_1()
    variant.info['CSQ'] = ['A||100|', 'A||101|1.3', 'A|||1.4', 'TT|||']
    infer_header_fields = infer_headers._InferHeaderFields(False, ['CSQ'])

    inferred_headers = next(infer_header_fields.process(variant, header))
    expected_types = {
        'CSQ_Gene_TYPE': '.',
        'CSQ_Position_TYPE': 'Integer',
        'CSQ_Score_TYPE': 'Float',
    }
    inferred_infos = inferred_headers.infos
    for info_key, info_value in inferred_infos.items():
        self.assertEqual(info_value['type'], expected_types[info_key])
    self.assertEqual(len(expected_types), len(inferred_infos))

    # With no annotation values at all there is nothing to infer.
    variant.info['CSQ'] = []
    inferred_headers = next(infer_header_fields.process(variant, header))
    self.assertEqual(vcf_header_io.VcfHeader(), inferred_headers)
def test_defined_fields_filtered_two_variants(self):
    """Checks that only fields missing from the header are inferred."""
    # Only INFO and FORMAT in the first variants are already defined in the
    # header section of the VCF files.
    with TestPipeline() as p:
        vcf_headers = self._get_sample_header_fields()
        vcf_headers_side_input = p | 'vcf_header' >> Create([vcf_headers])
        variants = [
            self._get_sample_variant_1(),
            self._get_sample_variant_2(),
        ]
        variants_pcoll = p | Create(variants)
        inferred_headers = (
            variants_pcoll
            | 'InferUndefinedHeaderFields' >>
            infer_undefined_headers.InferUndefinedHeaderFields(
                pvalue.AsSingleton(vcf_headers_side_input)))

        expected = vcf_header_io.VcfHeader(
            infos={'IS_2': Info('IS_2', 1, 'String', '', '', '')},
            formats={'FI_2': Format('FI_2', 1, 'String', '')})
        assert_that(inferred_headers, equal_to([expected]))
        p.run()
def test_bigquery_field_name_sanitize(self):
    """Checks the schema field names generated for unusual header IDs.

    Header IDs such as '_', '0a' or 'A-B*C' are expected to appear in the
    schema under the rewritten names listed in the expectations below.
    """
    info_fields = OrderedDict()
    info_fields['_'] = createInfo('_', 1, 'String', 'desc', 'src', 'v')
    info_fields['_A'] = createInfo('_A', 1, 'String', 'desc', 'src', 'v')
    info_fields['0a'] = createInfo('0a', 1, 'String', 'desc', 'src', 'v')
    info_fields['A-B*C'] = createInfo(
        'A-B*C', 1, 'String', 'desc', 'src', 'v')
    info_fields['I-A'] = createInfo('I-A', 'A', 'Float', 'desc', 'src', 'v')
    info_fields['OK_info_09'] = createInfo(
        'OK_info_09', 1, 'String', 'desc')

    format_fields = OrderedDict()
    format_fields['a^b'] = createFormat('a^b', 1, 'String', 'desc')
    format_fields['OK_format_09'] = createFormat(
        'OK_format_09', 1, 'String', 'desc')

    header_fields = vcf_header_io.VcfHeader(
        infos=info_fields, formats=format_fields)

    expected_fields = self._generate_expected_fields(
        alt_fields=['I_A'],
        call_fields=['a_b', 'OK_format_09'],
        info_fields=[
            'field__', 'field__A', 'field_0a', 'A_B_C', 'OK_info_09'])
    variant_factory = processed_variant.ProcessedVariantFactory(
        header_fields)
    actual_schema = schema_converter.generate_schema_from_header_fields(
        header_fields, variant_factory)
    self._validate_schema(expected_fields, actual_schema)
def test_info_and_format_header_fields(self):
    """Checks the schema generated from both INFO and FORMAT header fields."""
    info_fields = OrderedDict()
    info_fields['I1'] = createInfo('I1', 1, 'String', 'desc', 'src', 'v')
    info_fields['IA'] = createInfo('IA', 'A', 'Integer', 'desc', 'src', 'v')

    # GT and PS should not be set as they're already included in special
    # 'genotype' and 'phaseset' fields.
    format_fields = OrderedDict()
    format_fields['F1'] = createFormat('F1', 1, 'String', 'desc')
    format_fields['F2'] = createFormat('F2', 2, 'Integer', 'desc')
    format_fields['FU'] = createFormat('FU', '.', 'Float', 'desc')
    format_fields['GT'] = createFormat('GT', 2, 'Integer', 'Special GT key')
    format_fields['PS'] = createFormat('PS', 1, 'Integer', 'Special PS key')

    header_fields = vcf_header_io.VcfHeader(
        infos=info_fields, formats=format_fields)

    expected_fields = self._generate_expected_fields(
        alt_fields=['IA'],
        call_fields=['F1', 'F2', 'FU'],
        info_fields=['I1'])
    variant_factory = processed_variant.ProcessedVariantFactory(
        header_fields)
    actual_schema = schema_converter.generate_schema_from_header_fields(
        header_fields, variant_factory)
    self._validate_schema(expected_fields, actual_schema)
def test_header_fields_inferred_one_variant(self):
    """Checks the header inferred from one variant with no defined headers."""
    with TestPipeline() as p:
        variants_pcoll = p | Create([self._get_sample_variant_1()])
        inferred_headers = (
            variants_pcoll
            | 'InferHeaderFields' >>
            infer_headers.InferHeaderFields(defined_headers=None))

        expected_infos = {}
        expected_infos['IS'] = Info('IS', 1, 'String', '', '', '')
        expected_infos['ISI'] = Info('ISI', 1, 'Integer', '', '', '')
        expected_infos['ISF'] = Info('ISF', 1, 'Float', '', '', '')
        expected_infos['IF'] = Info('IF', 1, 'Float', '', '', '')
        expected_infos['IB'] = Info('IB', 0, 'Flag', '', '', '')
        expected_infos['IA'] = Info('IA', None, 'Integer', '', '', '')

        expected_formats = {}
        expected_formats['FI'] = Format('FI', 1, 'Integer', '')
        expected_formats['FU'] = Format('FU', None, 'Float', '')

        expected = vcf_header_io.VcfHeader(
            infos=expected_infos, formats=expected_formats)
        assert_that(inferred_headers, equal_to([expected]))
        p.run()
def test_header_fields_inferred_from_two_variants(self):
    """Checks the header inferred from two variants, no defined headers."""
    with TestPipeline() as p:
        variants = [
            self._get_sample_variant_1(),
            self._get_sample_variant_2(),
        ]
        variants_pcoll = p | Create(variants)
        inferred_headers = (
            variants_pcoll
            | 'InferUndefinedHeaderFields' >>
            infer_undefined_headers.InferUndefinedHeaderFields(
                defined_headers=None))

        expected_infos = {}
        expected_infos['IS'] = Info('IS', 1, 'String', '', '', '')
        expected_infos['IF'] = Info('IF', 0, 'Flag', '', '', '')
        expected_infos['IA'] = Info('IA', None, 'String', '', '', '')
        expected_infos['IS_2'] = Info('IS_2', 1, 'String', '', '', '')

        expected_formats = {}
        expected_formats['FI'] = Format('FI', 1, 'String', '')
        expected_formats['FU'] = Format('FU', None, 'String', '')
        expected_formats['FI_2'] = Format('FI_2', 1, 'String', '')

        expected = vcf_header_io.VcfHeader(
            infos=expected_infos, formats=expected_formats)
        assert_that(inferred_headers, equal_to([expected]))
        p.run()
def _get_sample_variant_and_header_with_csq(self, additional_infos=None):
    """Provides a simple `Variant` and `VcfHeader` with info fields.

    The variant comes from `self._get_sample_variant()` with three 'CSQ'
    annotation strings attached. The header defines the 'A1', 'A2' and 'CSQ'
    infos, followed by any entries from `additional_infos`.

    Args:
      additional_infos: A list of tuples of the format (key, `Info`) to be
        added to the `VcfHeader`. An entry whose key already exists replaces
        the existing info. `None` adds nothing.

    Returns:
      A `(variant, header_fields)` tuple.
    """
    sample_variant = self._get_sample_variant()
    sample_variant.info['CSQ'] = [
        'A|C1|I1|S1|G1',
        'TT|C2|I2|S2|G2',
        'A|C3|I3|S3|G3',
    ]

    header_infos = OrderedDict()
    header_infos['A1'] = Info('A1', 1, None, '', None, None)
    header_infos['A2'] = Info(
        'A2', parser.field_counts['A'], None, '', None, None)
    header_infos['CSQ'] = Info(
        'CSQ', parser.field_counts['.'], None,
        'some desc Allele|Consequence|IMPACT|SYMBOL|Gene', None, None)

    if additional_infos is not None:
        for info_key, info_value in additional_infos:
            header_infos[info_key] = info_value

    return sample_variant, vcf_header_io.VcfHeader(infos=header_infos)
def make_header(header_num_dict):
    # type: (Dict[str, str]) -> VcfHeader
    """Builds a VcfHeader based on the header_num_dict.

    Each entry becomes a `parser._Info` whose 'id' is the dictionary key and
    whose 'num' is derived from the dictionary value; the remaining fields are
    fixed placeholders (type=None, desc='', source=None, version=None).

    Args:
      header_num_dict: a dictionary mapping info keys to string num values.
        A value that is a key of `parser.field_counts` is mapped through that
        table; any other value is converted with `int()`.

    Returns:
      A `vcf_header_io.VcfHeader` whose infos hold one entry per key.

    Raises:
      ValueError: if a value is neither a key of `parser.field_counts` nor
        parseable by `int()`.
    """
    infos = {}
    # `items()` rather than the Python 2-only `iteritems()` so that this
    # helper works on both Python 2 and Python 3.
    for info_key, num_str in header_num_dict.items():
        if num_str in parser.field_counts:
            pyvcf_num_field_value = parser.field_counts[num_str]
        else:
            pyvcf_num_field_value = int(num_str)
        infos[info_key] = parser._Info(
            id=info_key,
            num=pyvcf_num_field_value,
            type=None,
            desc='',
            source=None,
            version=None)
    return vcf_header_io.VcfHeader(infos=infos)
def test_create_processed_variant_annotation_alt_allele_num(self):
    """Checks annotation-to-ALT matching when `use_allele_num` is enabled.

    Six 'CSQ' annotations share the annotation ALT 'T'. Only the ones with
    ALLELE_NUM '1' and '2' are expected to be attached, to the first and
    second alternate bases respectively; the other four ('0', '3', 'TEST'
    and empty) are expected to be counted as incorrect ALLELE_NUMs.
    """
    csq_info = createInfo(
        None, '.', '.',
        'some desc Allele|Consequence|IMPACT|ALLELE_NUM',
        source=None,
        version=None)
    header_fields = vcf_header_io.VcfHeader(infos={'CSQ': csq_info})
    variant = vcfio.Variant(
        reference_name='19', start=11, end=12, reference_bases='C',
        # The following represent a SNV and an insertion, resp.
        alternate_bases=['T', 'CT'],
        names=['rs1'], quality=2,
        filters=['PASS'],
        # Note that in the minimal mode of VEP, 'T' is an ambiguous annotation
        # ALT because it can map to either the 'T' SNV or the 'CT' insertion.
        # But because there is ALLELE_NUM there should be no ambiguity.
        # The last four annotations have incorrect ALLELE_NUMs.
        info={
            'CSQ': [
                'T|C1|I1|1', 'T|C2|I2|2', 'T|C3|I3|0', 'T|C4|I4|3',
                'T|C5|I5|TEST', 'T|C6|I6|'
            ]
        })
    counter_factory = _CounterSpyFactory()
    factory = processed_variant.ProcessedVariantFactory(
        header_fields,
        split_alternate_allele_info_fields=True,
        annotation_fields=['CSQ'],
        use_allele_num=True,
        minimal_match=True,  # This should be ignored by the factory method.
        counter_factory=counter_factory)
    proc_var = factory.create_processed_variant(variant)

    alt1 = processed_variant.AlternateBaseData('T')
    alt1._info = {
        'CSQ': [{
            annotation_parser.ANNOTATION_ALT: 'T',
            'Consequence': 'C1',
            'IMPACT': 'I1',
            'ALLELE_NUM': '1'
        }]
    }
    alt2 = processed_variant.AlternateBaseData('CT')
    alt2._info = {
        'CSQ': [{
            annotation_parser.ANNOTATION_ALT: 'T',
            'Consequence': 'C2',
            'IMPACT': 'I2',
            'ALLELE_NUM': '2'
        }]
    }
    self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
    # `in`-based assertion instead of the Python 2-only `dict.has_key()`;
    # it also reports the container contents on failure.
    self.assertNotIn('CSQ', proc_var.non_alt_info)

    counter_map = counter_factory.counter_map
    self.assertEqual(counter_map[CEnum.VARIANT.value].get_value(), 1)
    self.assertEqual(
        counter_map[CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 2)
    self.assertEqual(
        counter_map[CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
    self.assertEqual(
        counter_map[
            CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value].get_value(), 0)
    self.assertEqual(
        counter_map[CEnum.ALLELE_NUM_INCORRECT.value].get_value(), 4)
def create_accumulator(self):
    # type: () -> vcf_header_io.VcfHeader
    """Returns a new default-constructed `VcfHeader` as the accumulator."""
    accumulator = vcf_header_io.VcfHeader()
    return accumulator
def create_accumulator(self):
    """Returns a new default-constructed `VcfHeader` as the accumulator."""
    accumulator = vcf_header_io.VcfHeader()
    return accumulator