def test_convert_bq_row_to_variant(self):
     """Converts the fixture BigQuery row and compares it to a full Variant."""
     bq_row = self._get_big_query_row()

     # Expected variant-level INFO fields.
     variant_info = {
         'IFR': [0.2],
         'IFR2': [0.2, 0.3],
         'IS': 'some data',
         'ISR': ['data1', 'data2'],
     }
     # Expected per-sample calls; only the first one carries a phaseset.
     first_call = vcfio.VariantCall(
         name='Sample1',
         genotype=[0, 1],
         phaseset='*',
         info={'GQ': 20, 'FIR': [10, 20]})
     second_call = vcfio.VariantCall(
         name='Sample2',
         genotype=[1, 0],
         info={'GQ': 10, 'FB': True})
     expected = vcfio.Variant(
         reference_name='chr19',
         start=11,
         end=12,
         reference_bases='C',
         alternate_bases=['A', 'TT'],
         names=['rs1', 'rs2'],
         quality=2,
         filters=['PASS'],
         info=variant_info,
         calls=[first_call, second_call])

     converter = bigquery_to_variant.BigQueryToVariant()
     actual = converter._convert_bq_row_to_variant(bq_row)
     self.assertEqual(expected, actual)
    def test_pipeline(self):
        """Runs BigQueryToVariant in a test pipeline over one fixture row."""
        bq_row, variant = self._get_bigquery_row_and_variant()
        runner = test_pipeline.TestPipeline()

        # Same chain as a single piped expression, split into named steps.
        rows = runner | transforms.Create([bq_row])
        converted = rows | bigquery_to_variant.BigQueryToVariant()

        assert_that(converted, equal_to([variant]))
        runner.run()
    def test_alternate_bases(self):
        """Checks alternate bases extracted from the fixture row's records."""
        bq_row = self._get_big_query_row()
        records = bq_row[ColumnKeyConstants.ALTERNATE_BASES]

        converter = bigquery_to_variant.BigQueryToVariant()
        actual = converter._get_alternate_bases(records)

        self.assertEqual(['A', 'TT'], actual)
Пример #4
0
def _bigquery_to_vcf_shards(
        known_args,  # type: argparse.Namespace
        beam_pipeline_options,  # type: pipeline_options.PipelineOptions
        vcf_data_temp_folder,  # type: str
        header_file_path,  # type: str
):
    # type: (...) -> None
    """Runs the BigQuery to VCF shards pipeline.

    Variants are read from the BigQuery table, grouped into collections
    covering a contiguous region of the genome (sized by the flag
    `--number_of_bases_per_shard`), sorted, and each collection is written
    to one VCF file. All VCF data files are saved in `vcf_data_temp_folder`.

    The meta info and the data header with the call names are written to
    `header_file_path`.

    Call names come from `known_args.call_names` when it is set; otherwise
    they are combined from the variants themselves.
    """
    table_schema = _get_schema(known_args.input_table)
    # TODO(allieychen): Modify the SQL query with the specified call_names.
    sql = _get_bigquery_query(known_args, table_schema)
    logging.info('Processing BigQuery query %s:', sql)
    source = bigquery.BigQuerySource(
        query=sql, validate=True, use_standard_sql=True)
    annotations = _extract_annotation_names(table_schema)

    with beam.Pipeline(options=beam_pipeline_options) as root:
        bq_rows = root | 'ReadFromBigQuery ' >> beam.io.Read(source)
        variants = bq_rows | bigquery_to_variant.BigQueryToVariant(
            annotations)

        if not known_args.call_names:
            # No explicit list given: derive the call names from the data.
            combiner = combine_call_names.CallNamesCombiner(
                known_args.preserve_call_names_order)
            call_names = variants | 'CombineCallNames' >> combiner
        else:
            requested = root | transforms.Create(known_args.call_names)
            call_names = requested | beam.combiners.ToList()

        write_header = beam.ParDo(
            _write_vcf_header_with_call_names,
            _VCF_FIXED_COLUMNS,
            known_args.representative_header_file,
            header_file_path)
        _ = call_names | 'GenerateVcfDataHeader' >> write_header

        # Densify against the full call-name list, then shard by key.
        dense = variants | densify_variants.DensifyVariants(
            beam.pvalue.AsSingleton(call_names))
        keyed = dense | 'PairVariantWithKey' >> beam.Map(
            _pair_variant_with_key, known_args.number_of_bases_per_shard)
        grouped = keyed | 'GroupVariantsByKey' >> beam.GroupByKey()
        shards = grouped | beam.ParDo(
            _get_file_path_and_sorted_variants, vcf_data_temp_folder)
        _ = shards | vcfio.WriteVcfDataLines()
 def test_get_variant_info(self):
     """Checks the variant-level info dict extracted from the fixture row."""
     bq_row = self._get_big_query_row()
     expected = {
         'IFR': [0.2],
         'IFR2': [0.2, 0.3],
         'IS': 'some data',
         'ISR': ['data1', 'data2'],
     }

     converter = bigquery_to_variant.BigQueryToVariant()
     actual = converter._get_variant_info(bq_row)

     self.assertEqual(expected, actual)
Пример #6
0
def _bigquery_to_vcf_shards(
        known_args,  # type: argparse.Namespace
        beam_pipeline_options,  # type: pipeline_options.PipelineOptions
        vcf_data_temp_folder,  # type: str
        vcf_data_header_file_path,  # type: str
):
    # type: (...) -> None
    """Runs the BigQuery to VCF shards pipeline.

    Variants are read from the BigQuery table, grouped into collections
    covering a contiguous region of the genome (sized by the flag
    `--number_of_bases_per_shard`), sorted, and each collection is written
    to one VCF file. All VCF data files are saved in `vcf_data_temp_folder`.

    The data header is written to `vcf_data_header_file_path`.
    TODO(allieychen): Eventually, it also generates the meta information file.
    """
    table_path = '.'.join(
        bigquery_util.parse_table_reference(known_args.input_table))
    source = bigquery.BigQuerySource(
        query=_BASE_QUERY_TEMPLATE.format(INPUT_TABLE=table_path),
        validate=True,
        use_standard_sql=True)

    with beam.Pipeline(options=beam_pipeline_options) as root:
        bq_rows = root | 'ReadFromBigQuery ' >> beam.io.Read(source)
        variants = bq_rows | bigquery_to_variant.BigQueryToVariant()

        # Call names are always derived from the variants in this version.
        call_names = (
            variants
            | 'CombineCallNames' >> combine_call_names.CallNamesCombiner())

        write_header = beam.ParDo(
            _write_vcf_data_header,
            _VCF_FIXED_COLUMNS,
            vcf_data_header_file_path)
        _ = call_names | 'GenerateVcfDataHeader' >> write_header

        # Densify against the full call-name list, then shard by key.
        dense = variants | densify_variants.DensifyVariants(
            beam.pvalue.AsSingleton(call_names))
        keyed = dense | 'PairVariantWithKey' >> beam.Map(
            _pair_variant_with_key, known_args.number_of_bases_per_shard)
        grouped = keyed | 'GroupVariantsByKey' >> beam.GroupByKey()
        shards = grouped | beam.ParDo(
            _get_file_path_and_sorted_variants, vcf_data_temp_folder)
        _ = shards | vcfio.WriteVcfDataLines()
Пример #7
0
def run(argv=None):
    # type: (List[str]) -> None
    """Runs the BigQuery to VCF pipeline.

    Parses `argv` (the logged command falls back to `sys.argv` when `argv`
    is empty or None), queries the table named by `known_args.input_table`,
    and writes the densified variants to `known_args.output_file`.
    """
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = vcf_to_bq_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)

    table_path = '.'.join(
        bigquery_util.parse_table_reference(known_args.input_table))
    source = bigquery.BigQuerySource(
        query=_BASE_QUERY_TEMPLATE.format(INPUT_TABLE=table_path),
        validate=True,
        use_standard_sql=True)

    options = pipeline_options.PipelineOptions(pipeline_args)
    with beam.Pipeline(options=options) as root:
        bq_rows = root | 'ReadFromBigQuery ' >> beam.io.Read(source)
        variants = bq_rows | bigquery_to_variant.BigQueryToVariant()
        dense = variants | densify_variants.DensifyVariants()
        _ = dense | vcfio.WriteToVcf(known_args.output_file)
    def test_get_variant_calls(self):
        """Checks the VariantCall list built from the fixture call records."""
        bq_row = self._get_big_query_row()
        call_records = bq_row[ColumnKeyConstants.CALLS]

        # Only the first expected call carries a phaseset.
        first_call = vcfio.VariantCall(
            name='Sample1',
            genotype=[0, 1],
            phaseset='*',
            info={'GQ': 20, 'FIR': [10, 20]})
        second_call = vcfio.VariantCall(
            name='Sample2',
            genotype=[1, 0],
            info={'GQ': 10, 'FB': True})
        expected = [first_call, second_call]

        converter = bigquery_to_variant.BigQueryToVariant()
        actual = converter._get_variant_calls(call_records)

        self.assertEqual(expected, actual)
def _bigquery_to_vcf_shards(
        known_args,  # type: argparse.Namespace
        beam_pipeline_options,  # type: pipeline_options.PipelineOptions
        vcf_data_temp_folder,  # type: str
        header_file_path,  # type: str
):
    # type: (...) -> None
    """Runs the BigQuery to VCF shards pipeline.

    Variants are read from the BigQuery table, grouped into collections
    covering a contiguous region of the genome (sized by the flag
    `--number_of_bases_per_shard`), sorted, and each collection is written
    to one VCF file. All VCF data files are saved in `vcf_data_temp_folder`.

    The meta info and the data header with the sample names are written to
    `header_file_path`.

    Sample names come from `known_args.sample_names` when it is set;
    otherwise the sample IDs combined from the variants are mapped to names
    through the sample info table and sorted.
    """
    table_schema = _get_schema(known_args.input_table)
    variant_sql = _get_variant_query(known_args, table_schema)
    logging.info('Processing BigQuery query %s:', variant_sql)
    project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
        known_args.input_table)
    variant_source = bigquery.BigQuerySource(
        query=variant_sql, validate=True, use_standard_sql=True)
    annotations = _extract_annotation_names(table_schema)

    # The sample info table name is the base table name plus a suffix.
    sample_table_name = bigquery_util.compose_table_name(
        bigquery_util.get_table_base_name(table_id),
        SAMPLE_INFO_TABLE_SUFFIX)
    sample_sql = _SAMPLE_INFO_QUERY_TEMPLATE.format(
        PROJECT_ID=project_id,
        DATASET_ID=dataset_id,
        TABLE_NAME=sample_table_name)
    sample_source = bigquery.BigQuerySource(
        query=sample_sql, validate=True, use_standard_sql=True)

    with beam.Pipeline(options=beam_pipeline_options) as root:
        bq_rows = root | 'ReadFromBigQuery ' >> beam.io.Read(variant_source)
        variants = bq_rows | bigquery_to_variant.BigQueryToVariant(
            annotations)
        sample_rows = (
            root | 'ReadFromSampleTable' >> beam.io.Read(sample_source))

        if known_args.sample_names:
            names = root | transforms.Create(
                known_args.sample_names, reshuffle=False)
        else:
            # Get sample names from sample IDs in the variants and sort.
            id_to_name = (
                sample_rows
                | 'SampleIdToNameDict' >>
                sample_mapping_table.SampleIdToNameDict())
            ids_in_variants = (
                variants
                | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner(
                    known_args.preserve_sample_order))
            unsorted_names = (
                ids_in_variants
                | 'GetSampleNames' >> sample_mapping_table.GetSampleNames(
                    beam.pvalue.AsSingleton(id_to_name)))
            names_as_list = (
                unsorted_names | 'CombineToList' >> beam.combiners.ToList())
            names = names_as_list | 'SortSampleNames' >> beam.ParDo(sorted)

        # Map the chosen names back to IDs for densifying the variants.
        name_to_id = (
            sample_rows
            | 'SampleNameToIdDict' >>
            sample_mapping_table.SampleNameToIdDict())
        ids_for_names = (
            names
            | 'GetSampleIds' >> sample_mapping_table.GetSampleIds(
                beam.pvalue.AsSingleton(name_to_id)))
        sample_ids = (
            ids_for_names
            | 'CombineSortedSampleIds' >> beam.combiners.ToList())
        sample_names = names | beam.combiners.ToList()

        write_header = beam.ParDo(
            _write_vcf_header_with_sample_names,
            _VCF_FIXED_COLUMNS,
            known_args.representative_header_file,
            header_file_path)
        _ = sample_names | 'GenerateVcfDataHeader' >> write_header

        # Densify against the sample ID list, then shard by key.
        dense = variants | densify_variants.DensifyVariants(
            beam.pvalue.AsSingleton(sample_ids))
        keyed = dense | 'PairVariantWithKey' >> beam.Map(
            _pair_variant_with_key, known_args.number_of_bases_per_shard)
        grouped = keyed | 'GroupVariantsByKey' >> beam.GroupByKey()
        shards = grouped | beam.ParDo(
            _get_file_path_and_sorted_variants, vcf_data_temp_folder)
        _ = shards | vcfio.WriteVcfDataLines(
            known_args.bq_uses_1_based_coordinate)