def test_convert_bq_row_to_variant(self):
  """Checks that the fixture BigQuery row converts to the expected Variant."""
  bq_row = self._get_big_query_row()

  # Expected per-sample calls, built separately to keep the Variant readable.
  sample1_call = vcfio.VariantCall(
      name='Sample1',
      genotype=[0, 1],
      phaseset='*',
      info={'GQ': 20, 'FIR': [10, 20]})
  sample2_call = vcfio.VariantCall(
      name='Sample2',
      genotype=[1, 0],
      info={'GQ': 10, 'FB': True})
  variant_info = {
      'IFR': [0.2],
      'IFR2': [0.2, 0.3],
      'IS': 'some data',
      'ISR': ['data1', 'data2'],
  }
  expected = vcfio.Variant(
      reference_name='chr19',
      start=11,
      end=12,
      reference_bases='C',
      alternate_bases=['A', 'TT'],
      names=['rs1', 'rs2'],
      quality=2,
      filters=['PASS'],
      info=variant_info,
      calls=[sample1_call, sample2_call])

  converter = bigquery_to_variant.BigQueryToVariant()
  actual = converter._convert_bq_row_to_variant(bq_row)
  self.assertEqual(expected, actual)
def test_pipeline(self):
  """Runs BigQueryToVariant inside a TestPipeline and checks its output."""
  bq_row, variant = self._get_bigquery_row_and_variant()
  p = test_pipeline.TestPipeline()
  # Feed the single fixture row through the transform under test.
  rows = p | transforms.Create([bq_row])
  converted = rows | bigquery_to_variant.BigQueryToVariant()
  assert_that(converted, equal_to([variant]))
  p.run()
def test_alternate_bases(self):
  """Checks the alternate bases extracted from the fixture row's records."""
  bq_row = self._get_big_query_row()
  alt_records = bq_row[ColumnKeyConstants.ALTERNATE_BASES]
  converter = bigquery_to_variant.BigQueryToVariant()
  actual_bases = converter._get_alternate_bases(alt_records)
  self.assertEqual(['A', 'TT'], actual_bases)
def _bigquery_to_vcf_shards(
    known_args,  # type: argparse.Namespace
    beam_pipeline_options,  # type: pipeline_options.PipelineOptions
    vcf_data_temp_folder,  # type: str
    header_file_path,  # type: str
    ):
  # type: (...) -> None
  """Runs the BigQuery to VCF shards pipeline.

  Variants are read from the BigQuery table and grouped so that each group
  holds the variants of one contiguous region of the genome (the region size
  is controlled by the `--number_of_bases_per_shard` flag). Each group is
  sorted and written to its own VCF data file under `vcf_data_temp_folder`.
  The meta information and the data header line carrying the call names are
  written to `header_file_path`.

  Args:
    known_args: Parsed command-line flags. This function reads `input_table`,
      `call_names`, `preserve_call_names_order`, `representative_header_file`
      and `number_of_bases_per_shard` from it.
    beam_pipeline_options: Options used to construct the Beam pipeline.
    vcf_data_temp_folder: Folder that receives the VCF data shards.
    header_file_path: Destination of the VCF header.
  """
  table_schema = _get_schema(known_args.input_table)
  # TODO(allieychen): Modify the SQL query with the specified call_names.
  bq_query = _get_bigquery_query(known_args, table_schema)
  logging.info('Processing BigQuery query %s:', bq_query)
  source = bigquery.BigQuerySource(
      query=bq_query, validate=True, use_standard_sql=True)
  annotation_names = _extract_annotation_names(table_schema)

  with beam.Pipeline(options=beam_pipeline_options) as pipeline:
    bq_rows = pipeline | 'ReadFromBigQuery ' >> beam.io.Read(source)
    variants = bq_rows | bigquery_to_variant.BigQueryToVariant(
        annotation_names)

    # Call names come from the flag when given, otherwise from the variants.
    if known_args.call_names:
      flag_call_names = pipeline | transforms.Create(known_args.call_names)
      call_names = flag_call_names | beam.combiners.ToList()
    else:
      combiner = combine_call_names.CallNamesCombiner(
          known_args.preserve_call_names_order)
      call_names = variants | 'CombineCallNames' >> combiner

    write_header = beam.ParDo(
        _write_vcf_header_with_call_names,
        _VCF_FIXED_COLUMNS,
        known_args.representative_header_file,
        header_file_path)
    _ = call_names | 'GenerateVcfDataHeader' >> write_header

    # Densify against the call names, key each variant, group by key, and
    # write every sorted group as VCF data lines.
    densified = variants | densify_variants.DensifyVariants(
        beam.pvalue.AsSingleton(call_names))
    keyed = densified | 'PairVariantWithKey' >> beam.Map(
        _pair_variant_with_key, known_args.number_of_bases_per_shard)
    grouped = keyed | 'GroupVariantsByKey' >> beam.GroupByKey()
    shards = grouped | beam.ParDo(
        _get_file_path_and_sorted_variants, vcf_data_temp_folder)
    _ = shards | vcfio.WriteVcfDataLines()
def test_get_variant_info(self):
  """Checks the info dict that `_get_variant_info` builds for the fixture."""
  bq_row = self._get_big_query_row()
  expected_info = {
      'IFR': [0.2],
      'IFR2': [0.2, 0.3],
      'IS': 'some data',
      'ISR': ['data1', 'data2'],
  }
  converter = bigquery_to_variant.BigQueryToVariant()
  actual_info = converter._get_variant_info(bq_row)
  self.assertEqual(expected_info, actual_info)
def _bigquery_to_vcf_shards(
    known_args,  # type: argparse.Namespace
    beam_pipeline_options,  # type: pipeline_options.PipelineOptions
    vcf_data_temp_folder,  # type: str
    vcf_data_header_file_path,  # type: str
    ):
  # type: (...) -> None
  """Runs the BigQuery to VCF shards pipeline.

  Variants are read from the BigQuery table and grouped so that each group
  holds the variants of one contiguous region of the genome (the region size
  is controlled by the `--number_of_bases_per_shard` flag). Each group is
  sorted and written to its own VCF data file under `vcf_data_temp_folder`.
  The data header is written to `vcf_data_header_file_path`.

  TODO(allieychen): Eventually, it also generates the meta information file.

  Args:
    known_args: Parsed command-line flags. This function reads `input_table`
      and `number_of_bases_per_shard` from it.
    beam_pipeline_options: Options used to construct the Beam pipeline.
    vcf_data_temp_folder: Folder that receives the VCF data shards.
    vcf_data_header_file_path: Destination of the VCF data header.
  """
  table_reference = bigquery_util.parse_table_reference(
      known_args.input_table)
  bq_query = _BASE_QUERY_TEMPLATE.format(
      INPUT_TABLE='.'.join(table_reference))
  source = bigquery.BigQuerySource(
      query=bq_query, validate=True, use_standard_sql=True)

  with beam.Pipeline(options=beam_pipeline_options) as pipeline:
    bq_rows = pipeline | 'ReadFromBigQuery ' >> beam.io.Read(source)
    variants = bq_rows | bigquery_to_variant.BigQueryToVariant()

    call_names = (
        variants
        | 'CombineCallNames' >> combine_call_names.CallNamesCombiner())

    write_header = beam.ParDo(
        _write_vcf_data_header, _VCF_FIXED_COLUMNS, vcf_data_header_file_path)
    _ = call_names | 'GenerateVcfDataHeader' >> write_header

    # Densify against the call names, key each variant, group by key, and
    # write every sorted group as VCF data lines.
    densified = variants | densify_variants.DensifyVariants(
        beam.pvalue.AsSingleton(call_names))
    keyed = densified | 'PairVariantWithKey' >> beam.Map(
        _pair_variant_with_key, known_args.number_of_bases_per_shard)
    grouped = keyed | 'GroupVariantsByKey' >> beam.GroupByKey()
    shards = grouped | beam.ParDo(
        _get_file_path_and_sorted_variants, vcf_data_temp_folder)
    _ = shards | vcfio.WriteVcfDataLines()
def run(argv=None):
  # type: (List[str]) -> None
  """Runs the BigQuery to VCF pipeline.

  Args:
    argv: Command-line arguments handed to the argument parser. When empty or
      None, `sys.argv` is used for the logged command line only; the parser
      still receives `argv` unchanged.
  """
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = vcf_to_bq_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)

  table_reference = bigquery_util.parse_table_reference(
      known_args.input_table)
  bq_query = _BASE_QUERY_TEMPLATE.format(
      INPUT_TABLE='.'.join(table_reference))
  source = bigquery.BigQuerySource(
      query=bq_query, validate=True, use_standard_sql=True)

  options = pipeline_options.PipelineOptions(pipeline_args)
  with beam.Pipeline(options=options) as pipeline:
    bq_rows = pipeline | 'ReadFromBigQuery ' >> beam.io.Read(source)
    variants = bq_rows | bigquery_to_variant.BigQueryToVariant()
    densified = variants | densify_variants.DensifyVariants()
    _ = densified | vcfio.WriteToVcf(known_args.output_file)
def test_get_variant_calls(self):
  """Checks the VariantCalls built from the fixture row's call records."""
  bq_row = self._get_big_query_row()
  call_records = bq_row[ColumnKeyConstants.CALLS]

  sample1_call = vcfio.VariantCall(
      name='Sample1',
      genotype=[0, 1],
      phaseset='*',
      info={'GQ': 20, 'FIR': [10, 20]})
  sample2_call = vcfio.VariantCall(
      name='Sample2',
      genotype=[1, 0],
      info={'GQ': 10, 'FB': True})
  expected = [sample1_call, sample2_call]

  converter = bigquery_to_variant.BigQueryToVariant()
  actual = converter._get_variant_calls(call_records)
  self.assertEqual(expected, actual)
def _bigquery_to_vcf_shards(
    known_args,  # type: argparse.Namespace
    beam_pipeline_options,  # type: pipeline_options.PipelineOptions
    vcf_data_temp_folder,  # type: str
    header_file_path,  # type: str
    ):
  # type: (...) -> None
  """Runs the BigQuery to VCF shards pipeline.

  Variants are read from the BigQuery table and grouped so that each group
  holds the variants of one contiguous region of the genome (the region size
  is controlled by the `--number_of_bases_per_shard` flag). Each group is
  sorted and written to its own VCF data file under `vcf_data_temp_folder`.
  The meta information and the data header line carrying the sample names are
  written to `header_file_path`.

  Args:
    known_args: Parsed command-line flags. This function reads `input_table`,
      `sample_names`, `preserve_sample_order`, `representative_header_file`,
      `number_of_bases_per_shard` and `bq_uses_1_based_coordinate` from it.
    beam_pipeline_options: Options used to construct the Beam pipeline.
    vcf_data_temp_folder: Folder that receives the VCF data shards.
    header_file_path: Destination of the VCF header.
  """
  table_schema = _get_schema(known_args.input_table)
  variants_sql = _get_variant_query(known_args, table_schema)
  logging.info('Processing BigQuery query %s:', variants_sql)
  project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
      known_args.input_table)
  variants_source = bigquery.BigQuerySource(
      query=variants_sql, validate=True, use_standard_sql=True)
  annotation_names = _extract_annotation_names(table_schema)

  # The sample info table name is composed from the variant table's base name
  # and SAMPLE_INFO_TABLE_SUFFIX.
  base_table_id = bigquery_util.get_table_base_name(table_id)
  sample_table_name = bigquery_util.compose_table_name(
      base_table_id, SAMPLE_INFO_TABLE_SUFFIX)
  samples_sql = _SAMPLE_INFO_QUERY_TEMPLATE.format(
      PROJECT_ID=project_id,
      DATASET_ID=dataset_id,
      TABLE_NAME=sample_table_name)
  samples_source = bigquery.BigQuerySource(
      query=samples_sql, validate=True, use_standard_sql=True)

  with beam.Pipeline(options=beam_pipeline_options) as pipeline:
    variant_rows = (
        pipeline | 'ReadFromBigQuery ' >> beam.io.Read(variants_source))
    variants = variant_rows | bigquery_to_variant.BigQueryToVariant(
        annotation_names)
    sample_table_rows = (
        pipeline | 'ReadFromSampleTable' >> beam.io.Read(samples_source))

    if known_args.sample_names:
      temp_sample_names = pipeline | transforms.Create(
          known_args.sample_names, reshuffle=False)
    else:
      # Get sample names from sample IDs in the variants and sort.
      id_to_name = (
          sample_table_rows
          | 'SampleIdToNameDict' >> sample_mapping_table.SampleIdToNameDict())
      ids_in_variants = (
          variants
          | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner(
              known_args.preserve_sample_order))
      names_as_list = (
          ids_in_variants
          | 'GetSampleNames' >> sample_mapping_table.GetSampleNames(
              beam.pvalue.AsSingleton(id_to_name))
          | 'CombineToList' >> beam.combiners.ToList())
      temp_sample_names = (
          names_as_list | 'SortSampleNames' >> beam.ParDo(sorted))

    # Both branches continue from temp_sample_names: map the names back to
    # IDs for densification, and collect the names for the header.
    name_to_id = (
        sample_table_rows
        | 'SampleNameToIdDict' >> sample_mapping_table.SampleNameToIdDict())
    ids_for_names = (
        temp_sample_names
        | 'GetSampleIds' >> sample_mapping_table.GetSampleIds(
            beam.pvalue.AsSingleton(name_to_id)))
    sample_ids = (
        ids_for_names | 'CombineSortedSampleIds' >> beam.combiners.ToList())
    sample_names = temp_sample_names | beam.combiners.ToList()

    write_header = beam.ParDo(
        _write_vcf_header_with_sample_names,
        _VCF_FIXED_COLUMNS,
        known_args.representative_header_file,
        header_file_path)
    _ = sample_names | 'GenerateVcfDataHeader' >> write_header

    # Densify against the sample IDs, key each variant, group by key, and
    # write every sorted group as VCF data lines.
    densified = variants | densify_variants.DensifyVariants(
        beam.pvalue.AsSingleton(sample_ids))
    keyed = densified | 'PairVariantWithKey' >> beam.Map(
        _pair_variant_with_key, known_args.number_of_bases_per_shard)
    grouped = keyed | 'GroupVariantsByKey' >> beam.GroupByKey()
    shards = grouped | beam.ParDo(
        _get_file_path_and_sorted_variants, vcf_data_temp_folder)
    _ = shards | vcfio.WriteVcfDataLines(
        known_args.bq_uses_1_based_coordinate)