def _shard_variants(known_args, pipeline_args, pipeline_mode):
  # type: (argparse.Namespace, List[str], int) -> List[str]
  """Reads the variants and writes them to VCF shards.

  Returns:
    The VCF shards directory.
  """
  beam_options = pipeline_options.PipelineOptions(pipeline_args)
  gcp_options = beam_options.view_as(pipeline_options.GoogleCloudOptions)
  # Give the shard-files job a unique Dataflow job name.
  _update_google_cloud_job_name(
      gcp_options,
      pipeline_common.generate_unique_name(_SHARD_VCF_FILES_JOB_NAME))
  shards_dir = filesystems.FileSystems.join(
      known_args.annotation_output_dir, _SHARDS_FOLDER)
  with beam.Pipeline(options=beam_options) as pipeline:
    variants = _read_variants(
        known_args.all_patterns, pipeline, known_args, pipeline_mode)
    call_names = (
        variants
        | 'CombineCallNames' >> combine_call_names.CallNamesCombiner())
    # Single side-input view of the combined call names, shared by both
    # downstream transforms.
    call_names_view = beam.pvalue.AsSingleton(call_names)
    _ = (
        variants
        | 'DensifyVariants' >> densify_variants.DensifyVariants(
            call_names_view)
        | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
            shards_dir,
            call_names_view,
            known_args.number_of_variants_per_shard))
  return [vep_runner_util.format_dir_path(shards_dir) +
          _GCS_RECURSIVE_WILDCARD]
def test_call_names_combiner_pipeline_duplicate_call_names(self):
  """A variant whose calls repeat the same name makes the combiner raise."""
  duplicated_call = vcfio.VariantCall(name='sample1')
  input_variants = [vcfio.Variant(calls=[duplicated_call, duplicated_call])]
  test_pipeline = TestPipeline()
  _ = (test_pipeline
       | transforms.Create(input_variants)
       | 'CombineCallNames' >> combine_call_names.CallNamesCombiner())
  # The duplicate name is only detected when the pipeline actually runs.
  with self.assertRaises(ValueError):
    test_pipeline.run()
def _bigquery_to_vcf_shards(
    known_args,  # type: argparse.Namespace
    beam_pipeline_options,  # type: pipeline_options.PipelineOptions
    vcf_data_temp_folder,  # type: str
    header_file_path,  # type: str
    ):
  # type: (...) -> None
  """Runs BigQuery to VCF shards pipelines.

  It reads the variants from BigQuery table, groups a collection of variants
  within a contiguous region of the genome (the size of the collection is
  adjustable through flag `--number_of_bases_per_shard`), sorts them, and
  then writes to one VCF file. All VCF data files are saved in
  `vcf_data_temp_folder`.

  Also, it writes the meta info and data header with the call names to
  `vcf_header_file_path`.

  Args:
    known_args: Parsed tool-specific command-line arguments.
    beam_pipeline_options: Options controlling the Beam pipeline execution.
    vcf_data_temp_folder: Folder that receives the sharded VCF data files.
    header_file_path: Path the VCF meta-info/header file is written to.
  """
  schema = _get_schema(known_args.input_table)
  # TODO(allieychen): Modify the SQL query with the specified call_names.
  query = _get_bigquery_query(known_args, schema)
  # Fixed log message: colon placed before the query, not appended after it.
  logging.info('Processing BigQuery query: %s', query)
  bq_source = bigquery.BigQuerySource(query=query,
                                      validate=True,
                                      use_standard_sql=True)
  annotation_names = _extract_annotation_names(schema)
  with beam.Pipeline(options=beam_pipeline_options) as p:
    # Fixed label typo: removed the trailing space in 'ReadFromBigQuery '.
    variants = (p
                | 'ReadFromBigQuery' >> beam.io.Read(bq_source)
                | bigquery_to_variant.BigQueryToVariant(annotation_names))
    if known_args.call_names:
      # The user supplied an explicit call-name list; use it as-is.
      call_names = (p
                    | transforms.Create(known_args.call_names)
                    | beam.combiners.ToList())
    else:
      # Otherwise derive the call names from the variants themselves.
      call_names = (
          variants
          | 'CombineCallNames' >> combine_call_names.CallNamesCombiner(
              known_args.preserve_call_names_order))
    _ = (call_names
         | 'GenerateVcfDataHeader' >> beam.ParDo(
             _write_vcf_header_with_call_names,
             _VCF_FIXED_COLUMNS,
             known_args.representative_header_file,
             header_file_path))
    _ = (variants
         | densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(call_names))
         | 'PairVariantWithKey' >> beam.Map(
             _pair_variant_with_key,
             known_args.number_of_bases_per_shard)
         | 'GroupVariantsByKey' >> beam.GroupByKey()
         | beam.ParDo(_get_file_path_and_sorted_variants,
                      vcf_data_temp_folder)
         | vcfio.WriteVcfDataLines())
def _bigquery_to_vcf_shards(
    known_args,  # type: argparse.Namespace
    beam_pipeline_options,  # type: pipeline_options.PipelineOptions
    vcf_data_temp_folder,  # type: str
    vcf_data_header_file_path,  # type: str
    ):
  # type: (...) -> None
  """Runs BigQuery to VCF shards pipelines.

  It reads the variants from BigQuery table, groups a collection of variants
  within a contiguous region of the genome (the size of the collection is
  adjustable through flag `--number_of_bases_per_shard`), sorts them, and
  then writes to one VCF file. All VCF data files are saved in
  `vcf_data_temp_folder`.

  Also, it writes the data header to `vcf_data_header_file_path`.
  TODO(allieychen): Eventually, it also generates the meta information file.

  Args:
    known_args: Parsed tool-specific command-line arguments.
    beam_pipeline_options: Options controlling the Beam pipeline execution.
    vcf_data_temp_folder: Folder that receives the sharded VCF data files.
    vcf_data_header_file_path: Path the VCF data header file is written to.
  """
  bq_source = bigquery.BigQuerySource(
      query=_BASE_QUERY_TEMPLATE.format(INPUT_TABLE='.'.join(
          bigquery_util.parse_table_reference(known_args.input_table))),
      validate=True,
      use_standard_sql=True)
  with beam.Pipeline(options=beam_pipeline_options) as p:
    # Fixed label typo: removed the trailing space in 'ReadFromBigQuery '.
    variants = (p
                | 'ReadFromBigQuery' >> beam.io.Read(bq_source)
                | bigquery_to_variant.BigQueryToVariant())
    call_names = (
        variants
        | 'CombineCallNames' >> combine_call_names.CallNamesCombiner())
    _ = (call_names
         | 'GenerateVcfDataHeader' >> beam.ParDo(
             _write_vcf_data_header,
             _VCF_FIXED_COLUMNS,
             vcf_data_header_file_path))
    _ = (variants
         | densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(call_names))
         | 'PairVariantWithKey' >> beam.Map(
             _pair_variant_with_key,
             known_args.number_of_bases_per_shard)
         | 'GroupVariantsByKey' >> beam.GroupByKey()
         | beam.ParDo(_get_file_path_and_sorted_variants,
                      vcf_data_temp_folder)
         | vcfio.WriteVcfDataLines())
def test_call_names_combiner_pipeline(self):
  """Unique call names across all variants combine into a single list."""
  expected_names = ['sample1', 'sample2', 'sample3']
  calls = [vcfio.VariantCall(name=name) for name in expected_names]
  # The middle call appears in both variants; dedup yields all three names.
  input_variants = [
      vcfio.Variant(calls=[calls[0], calls[1]]),
      vcfio.Variant(calls=[calls[1], calls[2]])
  ]
  test_pipeline = TestPipeline()
  combined = (
      test_pipeline
      | transforms.Create(input_variants)
      | 'CombineCallNames' >> combine_call_names.CallNamesCombiner())
  assert_that(combined, equal_to([expected_names]))
  test_pipeline.run()
def test_call_names_combiner_pipeline_preserve_call_names_order_error(
    self):
  """Order-preserving combine fails when variants disagree on ordering."""
  names = ['sample1', 'sample2', 'sample3']
  calls = [vcfio.VariantCall(name=name) for name in names]
  # The two variants list different (overlapping) call sequences, so no
  # single consistent order exists.
  input_variants = [
      vcfio.Variant(calls=[calls[0], calls[1]]),
      vcfio.Variant(calls=[calls[1], calls[2]])
  ]
  test_pipeline = TestPipeline()
  _ = (test_pipeline
       | transforms.Create(input_variants)
       | 'CombineCallNames' >> combine_call_names.CallNamesCombiner(
           preserve_call_names_order=True))
  # The inconsistency is only detected when the pipeline actually runs.
  with self.assertRaises(ValueError):
    test_pipeline.run()