def _validate_output_tables(self, client, output_table_base_name,
                            sharding_config_path, append, is_main_output):
  """Checks that the sharded output tables are consistent with `append`.

  Builds the full list of table names this run will produce (the
  sample-info table when this is the main output, plus one table per kept
  shard from the sharding config) and verifies each against the `append`
  flag: every table must already exist when appending, and none may exist
  otherwise.

  Raises:
    ValueError: if the base name or a shard suffix contains the reserved
      table-suffix separator, or if a table's existence contradicts
      `append`, or (via the helper) if the dataset does not exist.
  """
  # The caller-supplied base name must not already carry the reserved
  # separator, since it is used to mark sharded output tables.
  if (output_table_base_name !=
      bigquery_util.get_table_base_name(output_table_base_name)):
    raise ValueError(
        ('Output table cannot contain "{}". we reserve this '
         'string to mark sharded output tables.').format(
             bigquery_util.TABLE_SUFFIX_SEPARATOR))
  project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
      output_table_base_name)
  bigquery_util.raise_error_if_dataset_not_exists(
      client, project_id, dataset_id)

  expected_tables = []
  if is_main_output:
    expected_tables.append(
        bigquery_util.compose_table_name(table_id, SAMPLE_INFO_TABLE_SUFFIX))
  sharding_scheme = variant_sharding.VariantSharding(sharding_config_path)
  shard_count = sharding_scheme.get_num_shards()
  # In case there is no residual in config we will ignore the last shard.
  if not sharding_scheme.should_keep_shard(
      sharding_scheme.get_residual_index()):
    shard_count -= 1
  for shard_index in range(shard_count):
    suffix = sharding_scheme.get_output_table_suffix(shard_index)
    # Shard suffixes must not themselves contain the reserved separator.
    if suffix != bigquery_util.get_table_base_name(suffix):
      raise ValueError(
          ('Table suffix cannot contain "{}" we reserve this '
           'string to mark sharded output tables.').format(
               bigquery_util.TABLE_SUFFIX_SEPARATOR))
    expected_tables.append(
        bigquery_util.compose_table_name(table_id, suffix))

  for table_name in expected_tables:
    exists = bigquery_util.table_exist(
        client, project_id, dataset_id, table_name)
    if append and not exists:
      raise ValueError(
          'Table {}:{}.{} does not exist, cannot append to it.'.format(
              project_id, dataset_id, table_name))
    if not append and exists:
      raise ValueError(
          ('Table {}:{}.{} already exists, cannot overwrite it. Please '
           'set `--append True` if you want to append to it.').format(
              project_id, dataset_id, table_name))
def test_get_table_base_name(self):
  """get_table_base_name strips everything after the first '___' suffix."""
  base_with_dot = 'project_id.dataset_id.table_id'
  base_with_colon = 'project_id:dataset_id.table_id'
  for base in (base_with_dot, base_with_colon):
    # A name without the reserved separator is already the base name.
    self.assertEqual(base, bigquery_util.get_table_base_name(base))
    # A single suffix is stripped back to the base name.
    one_suffix = base + '___chr1'
    self.assertEqual(base, bigquery_util.get_table_base_name(one_suffix))
    # Multiple suffixes are all stripped in one call.
    two_suffixes = one_suffix + '___extra_suffix'
    self.assertEqual(base, bigquery_util.get_table_base_name(two_suffixes))
def _bigquery_to_vcf_shards(
    known_args,  # type: argparse.Namespace
    beam_pipeline_options,  # type: pipeline_options.PipelineOptions
    vcf_data_temp_folder,  # type: str
    header_file_path,  # type: str
    ):
  # type: (...) -> None
  """Runs BigQuery to VCF shards pipelines.

  It reads the variants from BigQuery table, groups a collection of variants
  within a contiguous region of the genome (the size of the collection is
  adjustable through flag `--number_of_bases_per_shard`), sorts them, and
  then writes to one VCF file. All VCF data files are saved in
  `vcf_data_temp_folder`. Also, it writes the meta info and data header
  with the sample names to `header_file_path`.
  """
  # Build the two BigQuery queries up front: one for the variant rows and
  # one for the sample-info table derived from the same base table name.
  schema = _get_schema(known_args.input_table)
  variant_query = _get_variant_query(known_args, schema)
  logging.info('Processing BigQuery query %s:', variant_query)
  project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
      known_args.input_table)
  bq_variant_source = bigquery.BigQuerySource(query=variant_query,
                                              validate=True,
                                              use_standard_sql=True)
  annotation_names = _extract_annotation_names(schema)
  base_table_id = bigquery_util.get_table_base_name(table_id)
  sample_query = _SAMPLE_INFO_QUERY_TEMPLATE.format(
      PROJECT_ID=project_id,
      DATASET_ID=dataset_id,
      TABLE_NAME=bigquery_util.compose_table_name(base_table_id,
                                                  SAMPLE_INFO_TABLE_SUFFIX))
  bq_sample_source = bigquery.BigQuerySource(query=sample_query,
                                             validate=True,
                                             use_standard_sql=True)
  with beam.Pipeline(options=beam_pipeline_options) as p:
    # NOTE(review): the 'ReadFromBigQuery ' label has a trailing space —
    # looks unintentional, but changing it would alter the step name.
    variants = (p
                | 'ReadFromBigQuery ' >> beam.io.Read(bq_variant_source)
                | bigquery_to_variant.BigQueryToVariant(annotation_names))
    sample_table_rows = (
        p
        | 'ReadFromSampleTable' >> beam.io.Read(bq_sample_source))
    if known_args.sample_names:
      # Sample names were given explicitly on the command line; use them
      # as-is (reshuffle=False presumably preserves the given order —
      # TODO confirm against `transforms.Create`).
      temp_sample_names = (p
                           | transforms.Create(known_args.sample_names,
                                               reshuffle=False))
    else:
      # Get sample names from sample IDs in the variants and sort.
      id_to_name_hash_table = (
          sample_table_rows
          | 'SampleIdToNameDict' >> sample_mapping_table.SampleIdToNameDict())
      temp_sample_ids = (
          variants
          | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner(
              known_args.preserve_sample_order))
      # Map the combined IDs back to names via the side-input dict, then
      # collect into a single list and sort it.
      temp_sample_names = (
          temp_sample_ids
          | 'GetSampleNames' >> sample_mapping_table.GetSampleNames(
              beam.pvalue.AsSingleton(id_to_name_hash_table))
          | 'CombineToList' >> beam.combiners.ToList()
          | 'SortSampleNames' >> beam.ParDo(sorted))
    # Regardless of where the names came from, translate them back to IDs
    # so the densify step below can use the ID list as a side input.
    name_to_id_hash_table = (
        sample_table_rows
        | 'SampleNameToIdDict' >> sample_mapping_table.SampleNameToIdDict())
    sample_ids = (temp_sample_names
                  | 'GetSampleIds' >> sample_mapping_table.GetSampleIds(
                      beam.pvalue.AsSingleton(name_to_id_hash_table))
                  | 'CombineSortedSampleIds' >> beam.combiners.ToList())
    sample_names = temp_sample_names | beam.combiners.ToList()
    # Write the VCF meta-info/header (with the sample-name columns) to
    # `header_file_path` as a side effect.
    _ = (sample_names
         | 'GenerateVcfDataHeader' >> beam.ParDo(
             _write_vcf_header_with_sample_names,
             _VCF_FIXED_COLUMNS,
             known_args.representative_header_file,
             header_file_path))
    # Densify variants over all sample IDs, bucket them into genomic
    # regions of `--number_of_bases_per_shard` bases, sort each bucket,
    # and write one VCF data file per bucket under `vcf_data_temp_folder`.
    _ = (variants
         | densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(sample_ids))
         | 'PairVariantWithKey' >> beam.Map(
             _pair_variant_with_key, known_args.number_of_bases_per_shard)
         | 'GroupVariantsByKey' >> beam.GroupByKey()
         | beam.ParDo(_get_file_path_and_sorted_variants,
                      vcf_data_temp_folder)
         | vcfio.WriteVcfDataLines(known_args.bq_uses_1_based_coordinate))