def test_create_processed_variant_no_change(self):
    """Check the factory output when alternate INFO fields are not split.

    Builds a factory with `split_alternate_allele_info_fields=False`,
    compares its result against a hand-assembled `ProcessedVariant`, and
    verifies the counter values recorded through the spy counter factory.
    """
    sample = self._get_sample_variant()
    header = vcf_header_util.make_header({'A1': '1', 'A2': 'A'})
    spy_counters = _CounterSpyFactory()
    variant_factory = processed_variant.ProcessedVariantFactory(
        header,
        split_alternate_allele_info_fields=False,
        counter_factory=spy_counters)
    actual = variant_factory.create_processed_variant(sample)

    # In this mode, the only difference between the original variant and the
    # processed one should be that INFO fields are copied to the
    # `_non_alt_info` map and `_alternate_datas` are filled with alternate
    # bases information only.
    expected = processed_variant.ProcessedVariant(sample)
    expected._non_alt_info = {
        'A1': 'some data',
        'A2': ['data1', 'data2'],
    }
    expected._alternate_datas = [
        processed_variant.AlternateBaseData('A'),
        processed_variant.AlternateBaseData('TT'),
    ]
    self.assertEqual([expected], [actual])

    # Expected counter values: one variant, no annotation match/mismatch.
    expected_counts = (
        (CEnum.VARIANT, 1),
        (CEnum.ANNOTATION_ALT_MATCH, 0),
        (CEnum.ANNOTATION_ALT_MISMATCH, 0),
    )
    for counter_name, expected_value in expected_counts:
        counter = spy_counters.counter_map[counter_name.value]
        self.assertEqual(counter.get_value(), expected_value)
def test_create_processed_variant_move_alt_info_extra_values(self):
    """Check handling of an `A2` INFO field with more values than alternates.

    With default settings the factory is expected to raise `ValueError`.
    With `allow_alternate_allele_info_mismatch=True` it is expected to
    produce per-alternate `A2` values and leave no `A2` key in
    `non_alt_info`.
    """
    header_fields = vcf_header_util.make_header({'A1': '1', 'A2': 'A'})
    variant = self._get_sample_variant()
    # Add a value to `A2` (it only has two alternate bases, so this is
    # invalid).
    variant.info['A2'] = ['data1', 'data2', 'data3']

    # Ensure error is raised by default.
    factory = processed_variant.ProcessedVariantFactory(
        header_fields,
        split_alternate_allele_info_fields=True)
    with self.assertRaises(ValueError):
        factory.create_processed_variant(variant)

    # Try again with allow_alternate_allele_info_mismatch=True.
    factory = processed_variant.ProcessedVariantFactory(
        header_fields,
        split_alternate_allele_info_fields=True,
        allow_alternate_allele_info_mismatch=True)
    proc_var = factory.create_processed_variant(variant)

    alt1 = processed_variant.AlternateBaseData('A')
    alt1._info = {'A2': 'data1'}
    alt2 = processed_variant.AlternateBaseData('TT')
    alt2._info = {'A2': 'data2'}
    self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
    # `dict.has_key` was removed in Python 3; a membership assertion works on
    # both Python 2.7 and 3 and reports the offending key on failure.
    self.assertNotIn('A2', proc_var.non_alt_info)
def _get_processed_variant(variant, header_num_dict=None):
    """Return the processed form of `variant`.

    Args:
      variant: The variant passed to `create_processed_variant`.
      header_num_dict: Optional mapping handed to
        `vcf_header_util.make_header`; an empty dict is used when this is
        `None` (or otherwise falsy).

    Returns:
      The result of `ProcessedVariantFactory.create_processed_variant`.
    """
    # TODO(bashir2): To make this more of a "unit" test, we should create
    # ProcessedVariant instances directly (instead of Variant) and avoid
    # calling create_processed_variant here. Then we should also add cases
    # that have annotation fields.
    num_dict = header_num_dict or {}
    header = vcf_header_util.make_header(num_dict)
    factory = processed_variant.ProcessedVariantFactory(header)
    return factory.create_processed_variant(variant)
def test_create_processed_variant_move_alt_info(self):
    """Check that `A2` values are moved onto the alternate base data.

    Uses a factory with `split_alternate_allele_info_fields=True` and
    expects one `A2` value per alternate, with no `A2` key left in
    `non_alt_info`.
    """
    variant = self._get_sample_variant()
    header_fields = vcf_header_util.make_header({'A1': '1', 'A2': 'A'})
    factory = processed_variant.ProcessedVariantFactory(
        header_fields,
        split_alternate_allele_info_fields=True)
    proc_var = factory.create_processed_variant(variant)

    alt1 = processed_variant.AlternateBaseData('A')
    alt1._info = {'A2': 'data1'}
    alt2 = processed_variant.AlternateBaseData('TT')
    alt2._info = {'A2': 'data2'}
    self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
    # `dict.has_key` was removed in Python 3; a membership assertion works on
    # both Python 2.7 and 3 and reports the offending key on failure.
    self.assertNotIn('A2', proc_var.non_alt_info)
def test_convert_variant_to_bigquery_row_omit_empty_calls(self):
    """Run the row conversion with `omit_empty_sample_calls=True`.

    The sample variant, its expected row, and the header dict all come from
    `_get_sample_variant_with_empty_calls`.
    """
    sample = self._get_sample_variant_with_empty_calls()
    variant, expected_row, header_num_dict = sample
    header = vcf_header_util.make_header(header_num_dict)
    factory = processed_variant.ProcessedVariantFactory(header)
    proc_var = factory.create_processed_variant(variant)

    pipeline = TestPipeline(blocking=True)
    variants = pipeline | Create([proc_var])
    to_row = ConvertVariantToRow(
        self._row_generator, omit_empty_sample_calls=True)
    bigquery_rows = variants | 'ConvertToRow' >> beam.ParDo(to_row)
    assert_that(bigquery_rows, equal_to([expected_row]))
    pipeline.run()
def _get_row_list_from_variant(self,
                               variant,
                               header_num_dict=None,
                               allow_incompatible_records=False,
                               omit_empty_sample_calls=False,
                               **kwargs):
    """Return the rows generated for `variant` as a list.

    Args:
      variant: The variant to process and convert.
      header_num_dict: Optional mapping handed to
        `vcf_header_util.make_header`; an empty dict is used when falsy.
      allow_incompatible_records: Forwarded positionally to `get_rows`.
      omit_empty_sample_calls: Forwarded positionally to `get_rows`.
      **kwargs: Extra keyword arguments forwarded to `get_rows`.

    Returns:
      A list built from `self._row_generator.get_rows(...)`.
    """
    # TODO(bashir2): To make this more of a "unit" test, we should create
    # ProcessedVariant instances directly (instead of Variant) and avoid
    # calling create_processed_variant here. Then we should also add cases
    # that have annotation fields.
    header = vcf_header_util.make_header(header_num_dict or {})
    factory = processed_variant.ProcessedVariantFactory(header)
    proc_var = factory.create_processed_variant(variant)
    generated = self._row_generator.get_rows(
        proc_var,
        allow_incompatible_records,
        omit_empty_sample_calls,
        **kwargs)
    return list(generated)
def _get_sample_variant_and_header_with_csq(self):
    """Return a sample variant carrying a `CSQ` INFO field and its header.

    Returns:
      A `(variant, header_fields)` tuple. The variant is the one from
      `_get_sample_variant` with three `CSQ` entries added; the header has
      the `CSQ` description set to a pipe-separated field list.
    """
    csq_entries = [
        'A|C1|I1|S1|G1',
        'TT|C2|I2|S2|G2',
        'A|C3|I3|S3|G3',
    ]
    variant = self._get_sample_variant()
    variant.info['CSQ'] = csq_entries

    header_fields = vcf_header_util.make_header(
        {'CSQ': '.', 'A1': '1', 'A2': 'A'})
    desc_key = vcf_header_io.VcfParserHeaderKeyConstants.DESC
    csq_header = header_fields.infos['CSQ']
    csq_header[desc_key] = 'some desc Allele|Consequence|IMPACT|SYMBOL|Gene'
    return variant, header_fields
def test_convert_variant_to_bigquery_row_allow_incompatible_recoreds(self):
    """Run the row conversion with `allow_incompatible_records=True`.

    The sample variant, its expected row, and the header dict all come from
    `_get_sample_variant_with_incompatible_records`.
    """
    sample = self._get_sample_variant_with_incompatible_records()
    variant, expected_row, header_num_dict = sample
    header = vcf_header_util.make_header(header_num_dict)
    factory = processed_variant.ProcessedVariantFactory(header)
    proc_var = factory.create_processed_variant(variant)

    pipeline = TestPipeline(blocking=True)
    variants = pipeline | Create([proc_var])
    to_row = ConvertToBigQueryTableRow(
        self._row_generator, allow_incompatible_records=True)
    bigquery_rows = variants | 'ConvertToRow' >> ParDo(to_row)
    assert_that(bigquery_rows, equal_to([expected_row]))
    pipeline.run()
def test_convert_variant_with_sample_name_to_bigquery_row(self):
    """Run the row conversion with a generator that includes call names.

    Replaces `self._row_generator` with a `VariantCallRowGenerator` built
    with `include_call_name=True`, then checks the single produced row.
    """
    self._row_generator = bigquery_row_generator.VariantCallRowGenerator(
        self._schema_descriptor,
        self._conflict_resolver,
        include_call_name=True)

    sample = self._get_sample_variant_with_sample_name()
    variant, expected_row, header_num_dict = sample
    header = vcf_header_util.make_header(header_num_dict)
    factory = processed_variant.ProcessedVariantFactory(header)
    proc_var = factory.create_processed_variant(variant)

    pipeline = TestPipeline(blocking=True)
    variants = pipeline | Create([proc_var])
    to_row = ConvertVariantToRow(
        self._row_generator, omit_empty_sample_calls=True)
    bigquery_rows = variants | 'ConvertToRow' >> beam.ParDo(to_row)
    assert_that(bigquery_rows, equal_to([expected_row]))
    pipeline.run()
def test_convert_variant_to_bigquery_row(self):
    """Convert three sample variants to rows and compare with expectations.

    The header is built from the union of the three samples' header dicts,
    with later samples overriding earlier ones on key collisions.
    """
    variant_1, row_1, header_num_dict_1 = self._get_sample_variant_1()
    variant_2, row_2, header_num_dict_2 = self._get_sample_variant_2()
    variant_3, row_3, header_num_dict_3 = self._get_sample_variant_3()

    merged_num_dict = header_num_dict_1.copy()
    for extra in (header_num_dict_2, header_num_dict_3):
        merged_num_dict.update(extra)
    header_fields = vcf_header_util.make_header(merged_num_dict)

    # A fresh factory is created for every variant.
    proc_vars = [
        processed_variant.ProcessedVariantFactory(
            header_fields).create_processed_variant(variant)
        for variant in (variant_1, variant_2, variant_3)
    ]

    pipeline = TestPipeline(blocking=True)
    variants = pipeline | Create(proc_vars)
    to_row = ConvertVariantToRow(self._row_generator)
    bigquery_rows = variants | 'ConvertToRow' >> beam.ParDo(to_row)
    assert_that(bigquery_rows, equal_to([row_1, row_2, row_3]))
    pipeline.run()
def _get_processed_variant(variant, header_num_dict=None):
    """Return the processed form of `variant`.

    Args:
      variant: The variant passed to `create_processed_variant`.
      header_num_dict: Optional mapping handed to
        `vcf_header_util.make_header`; an empty dict is used when this is
        `None` (or otherwise falsy).

    Returns:
      The result of `ProcessedVariantFactory.create_processed_variant`.
    """
    num_dict = header_num_dict or {}
    header = vcf_header_util.make_header(num_dict)
    factory = processed_variant.ProcessedVariantFactory(header)
    return factory.create_processed_variant(variant)