Example #1
 def test_create_processed_variant_annotation_alt_allele_num(self):
     csq_info = parser._Info(
         id=None,
         num='.',
         type=None,
         desc='some desc Allele|Consequence|IMPACT|ALLELE_NUM',
         source=None,
         version=None)
     header_fields = vcf_header_io.VcfHeader(infos={'CSQ': csq_info})
     variant = vcfio.Variant(
         reference_name='19',
         start=11,
         end=12,
         reference_bases='C',
         # The following represent a SNV and an insertion, resp.
         alternate_bases=['T', 'CT'],
         names=['rs1'],
         quality=2,
         filters=['PASS'],
         # Note that in the minimal mode of VEP, 'T' is an ambiguous annotation
         # ALT because it can map to either the 'T' SNV or the 'CT' insertion.
         # But because there is ALLELE_NUM there should be no ambiguity.
         # The last four annotations have incorrect ALLELE_NUMs.
         info={
             'CSQ':
             vcfio.VariantInfo(data=[
                 'T|C1|I1|1', 'T|C2|I2|2', 'T|C3|I3|0', 'T|C4|I4|3',
                 'T|C5|I5|TEST', 'T|C6|I6|'
             ],
                               field_count='.')
         })
     counter_factory = _CounterSpyFactory()
     factory = processed_variant.ProcessedVariantFactory(
         header_fields,
         split_alternate_allele_info_fields=True,
         annotation_fields=['CSQ'],
         use_allele_num=True,
         minimal_match=True,  # This should be ignored by the factory method.
         counter_factory=counter_factory)
     proc_var = factory.create_processed_variant(variant)
     alt1 = processed_variant.AlternateBaseData('T')
     alt1._info = {
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: 'T',
             'Consequence': 'C1',
             'IMPACT': 'I1',
             'ALLELE_NUM': '1'
         }]
     }
     alt2 = processed_variant.AlternateBaseData('CT')
     alt2._info = {
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: 'T',
             'Consequence': 'C2',
             'IMPACT': 'I2',
             'ALLELE_NUM': '2'
         }]
     }
     self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
     self.assertNotIn('CSQ', proc_var.non_alt_info)
     self.assertEqual(
         counter_factory.counter_map[CEnum.VARIANT.value].get_value(), 1)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 2)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MINIMAL_MATCH.value].get_value(), 0)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value].get_value(), 0)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ALLELE_NUM_INCORRECT.value].get_value(), 4)
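A note on _CounterSpyFactory, which the test above relies on but this listing does not define: below is a minimal sketch of such a spy, assuming the code under test requests counters through a create_counter(name) method as metrics_util.CounterFactory does (the class bodies are our assumption, not the project's code):

class _FakeCounter(object):
    """In-memory counter test double."""

    def __init__(self):
        self._value = 0

    def inc(self, n=1):
        self._value += n

    def get_value(self):
        return self._value


class _CounterSpyFactory(object):
    """Hands out fake counters and records them for later assertions."""

    def __init__(self):
        self.counter_map = {}

    def create_counter(self, counter_name):
        counter = _FakeCounter()
        self.counter_map[counter_name] = counter
        return counter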
Example #2
 def test_create_processed_variant_annotation_alt_long_prefix(self):
     # The returned variant is ignored as we create a custom one next.
     _, header_fields = self._get_sample_variant_and_header_with_csq()
     variant = vcfio.Variant(reference_name='19',
                             start=11,
                             end=12,
                             reference_bases='CC',
                             alternate_bases=['CCT', 'CCC', 'CCCC'],
                             names=['rs1'],
                             quality=2,
                             filters=['PASS'],
                             info={
                                 'CSQ':
                                 vcfio.VariantInfo(data=[
                                     'CT|C1|I1|S1|G1', 'CC|C2|I2|S2|G2',
                                     'CCC|C3|I3|S3|G3'
                                 ],
                                                   field_count='.')
                             })
     counter_factory = _CounterSpyFactory()
     factory = processed_variant.ProcessedVariantFactory(
         header_fields,
         split_alternate_allele_info_fields=True,
         annotation_fields=['CSQ'],
         counter_factory=counter_factory)
     proc_var = factory.create_processed_variant(variant)
     alt1 = processed_variant.AlternateBaseData('CCT')
     alt1._info = {
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: 'CT',
             'Consequence': 'C1',
             'IMPACT': 'I1',
             'SYMBOL': 'S1',
             'Gene': 'G1'
         }]
     }
     alt2 = processed_variant.AlternateBaseData('CCC')
     alt2._info = {
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: 'CC',
             'Consequence': 'C2',
             'IMPACT': 'I2',
             'SYMBOL': 'S2',
             'Gene': 'G2'
         }]
     }
     alt3 = processed_variant.AlternateBaseData('CCCC')
     alt3._info = {
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: 'CCC',
             'Consequence': 'C3',
             'IMPACT': 'I3',
             'SYMBOL': 'S3',
             'Gene': 'G3'
         }]
     }
     self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3])
     self.assertNotIn('CSQ', proc_var.non_alt_info)
     self.assertEqual(
         counter_factory.counter_map[CEnum.VARIANT.value].get_value(), 1)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 3)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
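The convention this test pins down: for an indel, VEP drops the shared leading (padding) base from the ALT it reports, so annotation ALT 'CT' corresponds to variant ALT 'CCT' against REF 'CC'. A rough sketch of that correspondence, assuming only the single shared first base is dropped (the helper name is ours):

def _vep_annotation_alt(ref, alt):
    """Sketch of VEP's default ALT form: drop the leading padding base
    when REF and ALT both start with it."""
    if ref and alt and ref[0] == alt[0]:
        return alt[1:]
    return alt

# For example, _vep_annotation_alt('CC', 'CCT') == 'CT', which is why the
# first CSQ entry above attaches to the 'CCT' alternate.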
Example #3
 def test_create_processed_variant_annotation_alt_minimal(self):
     # The returned variant is ignored as we create a custom one next.
     _, header_fields = self._get_sample_variant_and_header_with_csq()
     variant = vcfio.Variant(
         reference_name='19',
         start=11,
         end=12,
         reference_bases='CC',
         # The following represent a SNV, an insertion, and a deletion, resp.
         alternate_bases=['CT', 'CCT', 'C'],
         names=['rs1'],
         quality=2,
         filters=['PASS'],
         # Note that in the minimal mode, 'T' is an ambiguous annotation ALT
         # because it can map to either the 'CT' SNV or the 'CCT' insertion.
         # It is not ambiguous in the non-minimal mode (it only maps to `CT`).
         info={
             'CSQ':
             vcfio.VariantInfo(data=['T|C1|I1|S1|G1', '-|C2|I2|S2|G2'],
                               field_count='.')
         })
     counter_factory = _CounterSpyFactory()
     factory = processed_variant.ProcessedVariantFactory(
         header_fields,
         split_alternate_allele_info_fields=True,
         annotation_fields=['CSQ'],
         minimal_match=True,
         counter_factory=counter_factory)
     proc_var = factory.create_processed_variant(variant)
     alt1 = processed_variant.AlternateBaseData('CT')
     alt1._info = {}
     alt2 = processed_variant.AlternateBaseData('CCT')
     alt2._info = {
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: 'T',
             processed_variant._ANNOTATION_ALT_AMBIGUOUS: True,
             'Consequence': 'C1',
             'IMPACT': 'I1',
             'SYMBOL': 'S1',
             'Gene': 'G1'
         }]
     }
     alt3 = processed_variant.AlternateBaseData('C')
     alt3._info = {
         'CSQ': [{
             processed_variant._ANNOTATION_ALT: '-',
             processed_variant._ANNOTATION_ALT_AMBIGUOUS: False,
             'Consequence': 'C2',
             'IMPACT': 'I2',
             'SYMBOL': 'S2',
             'Gene': 'G2'
         }]
     }
     self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3])
     self.assertNotIn('CSQ', proc_var.non_alt_info)
     self.assertEqual(
         counter_factory.counter_map[CEnum.VARIANT.value].get_value(), 1)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 0)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MINIMAL_MATCH.value].get_value(), 2)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value].get_value(), 1)
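The ambiguity exercised here comes from VEP's minimal mode, which trims each ALT down to the bases that differ from REF. A sketch of that trimming, assuming only the shared leading bases are removed and an empty remainder (a pure deletion) becomes '-' (the helper is ours, not the library's):

def _minimal_allele(ref, alt):
    """Sketch of minimal-mode trimming of an ALT against REF."""
    i = 0
    while i < min(len(ref), len(alt)) and ref[i] == alt[i]:
        i += 1
    return alt[i:] or '-'

# Against REF 'CC', both 'CT' and 'CCT' trim to 'T', so the annotation
# 'T|C1|...' above is ambiguous; 'C' trims to '-' and is unambiguous.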
Example #4
def run(argv=None):
    # type: (List[str]) -> None
    """Runs VCF to BigQuery pipeline."""
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = pipeline_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)

    if known_args.auto_flags_experiment:
        _get_input_dimensions(known_args, pipeline_args)

    annotated_vcf_pattern = _run_annotation_pipeline(known_args, pipeline_args)

    all_patterns = ([annotated_vcf_pattern]
                    if annotated_vcf_pattern else known_args.all_patterns)

    variant_merger = _get_variant_merge_strategy(known_args)

    pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)

    beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
    avro_root_path = _get_avro_root_path(beam_pipeline_options)
    # Starts a Beam pipeline to merge VCF headers if the total number of files
    # matching the input pattern exceeds _SMALL_DATA_THRESHOLD.
    _merge_headers(known_args, pipeline_args, pipeline_mode, avro_root_path,
                   annotated_vcf_pattern)

    # Retrieve merged headers prior to launching the pipeline. This is needed
    # since the BigQuery schema cannot yet be dynamically created based on input.
    # See https://issues.apache.org/jira/browse/BEAM-2801.
    header_fields = vcf_header_parser.get_vcf_headers(
        known_args.representative_header_file)
    counter_factory = metrics_util.CounterFactory()
    processed_variant_factory = processed_variant.ProcessedVariantFactory(
        header_fields, known_args.split_alternate_allele_info_fields,
        known_args.allow_malformed_records, known_args.annotation_fields,
        known_args.use_allele_num, known_args.minimal_vep_alt_matching,
        known_args.infer_annotation_types, counter_factory)

    schema = schema_converter.generate_schema_from_header_fields(
        header_fields, processed_variant_factory, variant_merger,
        known_args.use_1_based_coordinate, known_args.include_call_name)

    sharding = variant_sharding.VariantSharding(
        known_args.sharding_config_path)
    if sharding.should_keep_shard(sharding.get_residual_index()):
        num_shards = sharding.get_num_shards()
    else:
        num_shards = sharding.get_num_shards() - 1

    if known_args.update_schema_on_append:
        for i in range(num_shards):
            table_suffix = sharding.get_output_table_suffix(i)
            table_name = bigquery_util.compose_table_name(
                known_args.output_table, table_suffix)
            bigquery_util.update_bigquery_schema_on_append(
                schema.fields, table_name)

    pipeline = beam.Pipeline(options=beam_pipeline_options)
    variants = _read_variants(
        all_patterns,
        pipeline,
        known_args,
        pipeline_mode,
        use_1_based_coordinate=known_args.use_1_based_coordinate)
    if known_args.allow_malformed_records:
        variants |= 'DropMalformedRecords' >> filter_variants.FilterVariants()
    sharded_variants = variants | 'ShardVariants' >> beam.Partition(
        shard_variants.ShardVariants(sharding), sharding.get_num_shards())
    variants = []
    for i in range(num_shards):
        suffix = sharding.get_output_table_suffix(i)
        # Copy each partition out of the returned tuple into a list so the
        # entries can be reassigned in the loop below.
        variants.append(sharded_variants[i])
        if variant_merger:
            variants[i] |= ('MergeVariants' + suffix >>
                            merge_variants.MergeVariants(variant_merger))
        variants[i] |= (
            'ProcessVariants' + suffix >>
            beam.Map(processed_variant_factory.create_processed_variant)
            .with_output_types(processed_variant.ProcessedVariant))
        _ = (variants[i]
             | 'VariantToAvro' + suffix >> variant_to_avro.VariantToAvroFiles(
                 avro_root_path + suffix,
                 schema,
                 allow_incompatible_records=known_args.
                 allow_incompatible_records,
                 omit_empty_sample_calls=known_args.omit_empty_sample_calls,
                 null_numeric_value_replacement=(
                     known_args.null_numeric_value_replacement),
                 include_call_name=known_args.include_call_name))
    result = pipeline.run()
    try:
        state = result.wait_until_finish()
        if state != beam.runners.runner.PipelineState.DONE:
            logging.error(
                'Dataflow pipeline terminated in an unexpected state: %s',
                state)
            raise AssertionError(
                'Dataflow pipeline terminated in {} state'.format(state))
    except Exception as e:
        logging.error('Dataflow pipeline failed.')
        raise e
    else:
        logging.info('Dataflow pipeline finished successfully.')
        metrics_util.log_all_counters(result)

    # After pipeline is done, create output tables and load AVRO files into them.
    schema_file = _write_schema_to_temp_file(schema, avro_root_path)
    suffixes = []
    try:
        for i in range(num_shards):
            suffixes.append(sharding.get_output_table_suffix(i))
            partition_range_end = sharding.get_output_table_partition_range_end(
                i)
            if not known_args.append:
                table_name = bigquery_util.compose_table_name(
                    known_args.output_table, suffixes[i])
                partitioning.create_bq_table(
                    table_name, schema_file,
                    bigquery_util.ColumnKeyConstants.START_POSITION,
                    partition_range_end)
                _record_newly_created_table(table_name)
                logging.info('Integer range partitioned table %s was created.',
                             table_name)
        if not known_args.append:
            _record_newly_created_table(
                sample_info_table_schema_generator.create_sample_info_table(
                    known_args.output_table))

        suffixes.append(
            sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
        load_avro = avro_util.LoadAvro(avro_root_path, known_args.output_table,
                                       suffixes, False)
        not_empty_variant_suffixes = load_avro.start_loading()
        logging.info(
            'The following tables were loaded with at least one row:')
        for suffix in not_empty_variant_suffixes:
            logging.info(
                bigquery_util.compose_table_name(known_args.output_table,
                                                 suffix))
        # Remove the sample_info table from both lists to avoid duplicating it
        # when the --sample_lookup_optimized_output_table flag is set.
        suffixes.remove(
            sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
        if (sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX
                in not_empty_variant_suffixes):
            not_empty_variant_suffixes.remove(
                sample_info_table_schema_generator.SAMPLE_INFO_TABLE_SUFFIX)
    except Exception as e:
        logging.error(
            'Something unexpected happened during the loading of AVRO '
            'files to BigQuery: %s', str(e))
        logging.info(
            'Since the write to BigQuery stage failed, we did not delete '
            'the AVRO files in your GCS bucket. You can manually import '
            'them to BigQuery. To avoid extra storage charges, delete them '
            'if you do not need them. The AVRO files are located at: %s',
            avro_root_path)
        raise e
    else:
        logging.warning('All AVRO files were successfully loaded to BigQuery.')
        if known_args.keep_intermediate_avro_files:
            logging.info(
                'Since "--keep_intermediate_avro_files" flag is set, the '
                'AVRO files are kept and stored at: %s', avro_root_path)
        else:
            if bigquery_util.delete_gcs_files(avro_root_path) != 0:
                logging.error(
                    'Deletion of intermediate AVRO files located at "%s" has '
                    'failed.', avro_root_path)

    if known_args.sample_lookup_optimized_output_table:
        flatten_call_column = partitioning.FlattenCallColumn(
            known_args.output_table, not_empty_variant_suffixes,
            known_args.append)
        try:
            flatten_schema_file = tempfile.mkstemp(
                suffix=_BQ_SCHEMA_FILE_SUFFIX)[1]
            if not flatten_call_column.get_flatten_table_schema(
                    flatten_schema_file):
                raise ValueError('Failed to extract schema of flatten table')
            # Create the output flattened tables if needed.
            if not known_args.append:
                # Create all sample optimized tables including those that will be empty.
                for suffix in suffixes:
                    output_table_id = bigquery_util.compose_table_name(
                        known_args.sample_lookup_optimized_output_table,
                        suffix)
                    partitioning.create_bq_table(
                        output_table_id, flatten_schema_file,
                        bigquery_util.ColumnKeyConstants.CALLS_SAMPLE_ID,
                        partitioning.MAX_RANGE_END)
                    _record_newly_created_table(output_table_id)
                    logging.info(
                        'Sample lookup optimized table %s was created.',
                        output_table_id)
            # Copy from the variant lookup tables into the flattened sample
            # lookup tables.
            # Note: uses WRITE_TRUNCATE to overwrite the existing tables (issue #607).
            flatten_call_column.copy_to_flatten_table(
                known_args.sample_lookup_optimized_output_table)
            logging.info(
                'All sample lookup optimized tables are fully loaded.')
        except Exception as e:
            logging.error(
                'Something unexpected happened while loading rows into the '
                'sample optimized tables: %s', str(e))
            raise e
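Like the other pipeline entry points in this listing, this run() is normally invoked from a module-level main guard; a minimal sketch of that boilerplate (the logging level is our choice):

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()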
Example #5
def run(argv=None):
    # type: (List[str]) -> None
    """Runs VCF to BigQuery pipeline."""
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = pipeline_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)

    annotated_vcf_pattern = _run_annotation_pipeline(known_args, pipeline_args)

    input_pattern = annotated_vcf_pattern or known_args.input_pattern
    variant_merger = _get_variant_merge_strategy(known_args)
    pipeline_mode = pipeline_common.get_pipeline_mode(
        input_pattern, known_args.optimize_for_large_inputs)
    # Starts a Beam pipeline to merge VCF headers if the total number of files
    # matching the input pattern exceeds _SMALL_DATA_THRESHOLD.
    _merge_headers(known_args.input_pattern, known_args, pipeline_args,
                   pipeline_mode, annotated_vcf_pattern)

    # Retrieve merged headers prior to launching the pipeline. This is needed
    # since the BigQuery schema cannot yet be dynamically created based on input.
    # See https://issues.apache.org/jira/browse/BEAM-2801.
    header_fields = vcf_header_parser.get_vcf_headers(
        known_args.representative_header_file)
    counter_factory = metrics_util.CounterFactory()
    processed_variant_factory = processed_variant.ProcessedVariantFactory(
        header_fields, known_args.split_alternate_allele_info_fields,
        known_args.allow_malformed_records, known_args.annotation_fields,
        known_args.use_allele_num, known_args.minimal_vep_alt_matching,
        known_args.infer_annotation_types, counter_factory)

    partitioner = None
    if ((known_args.optimize_for_large_inputs and variant_merger)
            or known_args.partition_config_path):
        partitioner = variant_partition.VariantPartition(
            known_args.partition_config_path)

    beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
    pipeline = beam.Pipeline(options=beam_pipeline_options)
    variants = _read_variants(input_pattern, pipeline, known_args,
                              pipeline_mode)
    variants |= 'FilterVariants' >> filter_variants.FilterVariants(
        reference_names=known_args.reference_names)
    if partitioner:
        num_partitions = partitioner.get_num_partitions()
        partitioned_variants = variants | 'PartitionVariants' >> beam.Partition(
            partition_variants.PartitionVariants(partitioner), num_partitions)
        variants = []
        for i in range(num_partitions):
            if partitioner.should_keep_partition(i):
                variants.append(partitioned_variants[i])
            else:
                num_partitions -= 1
    else:
        # By default we don't partition the data, so we have only 1 partition.
        num_partitions = 1
        variants = [variants]

    for i in range(num_partitions):
        if variant_merger:
            variants[i] |= ('MergeVariants' + str(i) >>
                            merge_variants.MergeVariants(variant_merger))
        variants[i] |= (
            'ProcessVariants' + str(i) >>
            beam.Map(processed_variant_factory.create_processed_variant)
            .with_output_types(processed_variant.ProcessedVariant))
    if partitioner and partitioner.should_flatten():
        variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
        num_partitions = 1

    if known_args.output_table:
        for i in range(num_partitions):
            table_suffix = ''
            if partitioner and partitioner.get_partition_name(i):
                table_suffix = '_' + partitioner.get_partition_name(i)
            table_name = known_args.output_table + table_suffix
            _ = (
                variants[i] | 'VariantToBigQuery' + table_suffix >>
                variant_to_bigquery.VariantToBigQuery(
                    table_name,
                    header_fields,
                    variant_merger,
                    processed_variant_factory,
                    append=known_args.append,
                    update_schema_on_append=known_args.update_schema_on_append,
                    allow_incompatible_records=known_args.
                    allow_incompatible_records,
                    omit_empty_sample_calls=known_args.omit_empty_sample_calls,
                    num_bigquery_write_shards=known_args.
                    num_bigquery_write_shards,
                    null_numeric_value_replacement=(
                        known_args.null_numeric_value_replacement)))

    if known_args.output_avro_path:
        # TODO(bashir2): Add an integration test that outputs to Avro files and
        # also imports to BigQuery. Then import those Avro outputs using the bq
        # tool and verify that the two tables are identical.
        _ = (variants | 'FlattenToOnePCollection' >> beam.Flatten()
             | 'VariantToAvro' >> variant_to_avro.VariantToAvroFiles(
                 known_args.output_avro_path,
                 header_fields,
                 processed_variant_factory,
                 variant_merger=variant_merger,
                 allow_incompatible_records=known_args.
                 allow_incompatible_records,
                 omit_empty_sample_calls=known_args.omit_empty_sample_calls,
                 null_numeric_value_replacement=(
                     known_args.null_numeric_value_replacement)))

    result = pipeline.run()
    result.wait_until_finish()

    metrics_util.log_all_counters(result)
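For reference, the beam.Partition transform used by both run() variants above fans one PCollection out into a fixed number of PCollections chosen by an index function. A self-contained toy sketch (the data and the partition function are ours):

import apache_beam as beam

with beam.Pipeline() as p:
    numbers = p | beam.Create([1, 2, 3, 4, 5])
    # The callable receives (element, num_partitions) and returns the
    # index of the partition that should receive the element.
    evens, odds = numbers | beam.Partition(
        lambda n, num_partitions: n % 2, 2)
    _ = odds | 'PrintOdds' >> beam.Map(print)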
Example #6
def _get_processed_variant(variant, header_num_dict=None):
    header_fields = vcf_header_util.make_header(header_num_dict or {})
    return processed_variant.ProcessedVariantFactory(
        header_fields).create_processed_variant(variant)
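A usage sketch for this helper; the variant and the {'AF': 'A'} header entry are made-up illustrations, assuming vcf_header_util.make_header accepts a mapping from INFO field id to its Number value:

variant = vcfio.Variant(
    reference_name='19', start=11, end=12,
    reference_bases='C', alternate_bases=['T', 'CT'])
proc_var = _get_processed_variant(variant, header_num_dict={'AF': 'A'})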
Example #7
def run(argv=None):
  # type: (List[str]) -> None
  """Runs VCF to BigQuery pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = vcf_to_bq_common.parse_args(argv,
                                                          _COMMAND_LINE_OPTIONS)
  # Note VepRunner creates new input files, so it should be run before any
  # other access to known_args.input_pattern.
  if known_args.run_annotation_pipeline:
    runner = vep_runner.create_runner_and_update_args(known_args, pipeline_args)
    runner.run_on_all_files()
    runner.wait_until_done()
    logging.info('Using VEP processed files: %s', known_args.input_pattern)

  variant_merger = _get_variant_merge_strategy(known_args)
  pipeline_mode = vcf_to_bq_common.get_pipeline_mode(
      known_args.input_pattern, known_args.optimize_for_large_inputs)

  # Starts a Beam pipeline to merge VCF headers if the total number of files
  # matching the input pattern exceeds _SMALL_DATA_THRESHOLD.
  _merge_headers(known_args, pipeline_args, pipeline_mode)

  # Retrieve merged headers prior to launching the pipeline. This is needed
  # since the BigQuery schema cannot yet be dynamically created based on input.
  # See https://issues.apache.org/jira/browse/BEAM-2801.
  header_fields = vcf_header_parser.get_vcf_headers(
      known_args.representative_header_file)
  counter_factory = metrics_util.CounterFactory()
  processed_variant_factory = processed_variant.ProcessedVariantFactory(
      header_fields,
      known_args.split_alternate_allele_info_fields,
      known_args.annotation_fields,
      known_args.use_allele_num,
      known_args.minimal_vep_alt_matching,
      counter_factory)

  partitioner = None
  if known_args.optimize_for_large_inputs or known_args.partition_config_path:
    partitioner = variant_partition.VariantPartition(
        known_args.partition_config_path)

  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  pipeline = beam.Pipeline(options=beam_pipeline_options)
  variants = _read_variants(pipeline, known_args)
  variants |= 'FilterVariants' >> filter_variants.FilterVariants(
      reference_names=known_args.reference_names)
  if partitioner:
    num_partitions = partitioner.get_num_partitions()
    partitioned_variants = variants | 'PartitionVariants' >> beam.Partition(
        partition_variants.PartitionVariants(partitioner), num_partitions)
    variants = []
    for i in range(num_partitions):
      if partitioner.should_keep_partition(i):
        variants.append(partitioned_variants[i])
      else:
        num_partitions -= 1
  else:
    # By default we don't partition the data, so we have only 1 partition.
    num_partitions = 1
    variants = [variants]

  for i in range(num_partitions):
    if variant_merger:
      variants[i] |= ('MergeVariants' + str(i) >>
                      merge_variants.MergeVariants(variant_merger))
    variants[i] |= (
        'ProcessVariants' + str(i) >>
        beam.Map(processed_variant_factory.create_processed_variant)
        .with_output_types(processed_variant.ProcessedVariant))
  if partitioner and partitioner.should_flatten():
    variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
    num_partitions = 1

  for i in range(num_partitions):
    table_suffix = ''
    if partitioner and partitioner.get_partition_name(i):
      table_suffix = '_' + partitioner.get_partition_name(i)
    table_name = known_args.output_table + table_suffix
    _ = (variants[i] | 'VariantToBigQuery' + table_suffix >>
         variant_to_bigquery.VariantToBigQuery(
             table_name,
             header_fields,
             variant_merger,
             processed_variant_factory,
             append=known_args.append,
             update_schema_on_append=known_args.update_schema_on_append,
             allow_incompatible_records=known_args.allow_incompatible_records,
             omit_empty_sample_calls=known_args.omit_empty_sample_calls,
             num_bigquery_write_shards=known_args.num_bigquery_write_shards))

  result = pipeline.run()
  result.wait_until_finish()

  metrics_util.log_all_counters(result)
Example #8
 def test_create_processed_variant_symbolic_and_breakend_annotation_alt(
         self):
     # The returned variant is ignored as we create a custom one next.
     _, header_fields = self._get_sample_variant_and_header_with_csq()
     variant = vcfio.Variant(
         reference_name='19',
         start=11,
         end=12,
         reference_bases='C',
         alternate_bases=['<SYMBOLIC>', '[13:123457[.', 'C[10:10357[.'],
         names=['rs1'],
         quality=2,
         filters=['PASS'],
         info={
             'CSQ': [
                 'SYMBOLIC|C1|I1|S1|G1', '[13|C2|I2|S2|G2',
                 'C[10|C3|I3|S3|G3', 'C[1|C3|I3|S3|G3'
             ]
         })  # The last one does not match any alts.
     counter_factory = _CounterSpyFactory()
     factory = processed_variant.ProcessedVariantFactory(
         header_fields,
         split_alternate_allele_info_fields=True,
         annotation_fields=['CSQ'],
         counter_factory=counter_factory)
     proc_var = factory.create_processed_variant(variant)
     alt1 = processed_variant.AlternateBaseData('<SYMBOLIC>')
     alt1._info = {
         'CSQ': [{
             annotation_parser.ANNOTATION_ALT: 'SYMBOLIC',
             'Consequence': 'C1',
             'IMPACT': 'I1',
             'SYMBOL': 'S1',
             'Gene': 'G1'
         }]
     }
     alt2 = processed_variant.AlternateBaseData('[13:123457[.')
     alt2._info = {
         'CSQ': [{
             annotation_parser.ANNOTATION_ALT: '[13',
             'Consequence': 'C2',
             'IMPACT': 'I2',
             'SYMBOL': 'S2',
             'Gene': 'G2'
         }]
     }
     alt3 = processed_variant.AlternateBaseData('C[10:10357[.')
     alt3._info = {
         'CSQ': [{
             annotation_parser.ANNOTATION_ALT: 'C[10',
             'Consequence': 'C3',
             'IMPACT': 'I3',
             'SYMBOL': 'S3',
             'Gene': 'G3'
         }]
     }
     self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3])
     self.assertNotIn('CSQ', proc_var.non_alt_info)
     self.assertEqual(
         counter_factory.counter_map[CEnum.VARIANT.value].get_value(), 1)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 3)
     self.assertEqual(
         counter_factory.counter_map[
             CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 1)
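The correspondence this last test checks: in the CSQ field a symbolic ALT appears without its angle brackets, and a breakend ALT is cut at the first ':'. A compact sketch of that mapping (the function is ours, not the library's):

def _annotation_alt_form(alt):
    """Sketch: '<SYMBOLIC>' -> 'SYMBOLIC', '[13:123457[.' -> '[13',
    'C[10:10357[.' -> 'C[10'."""
    if alt.startswith('<') and alt.endswith('>'):
        return alt[1:-1]
    return alt.split(':', 1)[0]

# The fourth CSQ entry, 'C[1|...', matches none of the three ALTs, which
# is why the mismatch counter ends at 1.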