Example #1
    def test_sample_ids_combiner_pipeline_preserve_sample_order(self):
        sample_ids = [
            hash_name('sample2'),
            hash_name('sample1'),
            hash_name('sample3')
        ]
        variant_calls = [
            vcfio.VariantCall(sample_id=sample_ids[0]),
            vcfio.VariantCall(sample_id=sample_ids[1]),
            vcfio.VariantCall(sample_id=sample_ids[2])
        ]
        variants = [
            vcfio.Variant(
                calls=[variant_calls[0], variant_calls[1], variant_calls[2]]),
            vcfio.Variant(
                calls=[variant_calls[0], variant_calls[1], variant_calls[2]])
        ]

        pipeline = TestPipeline()
        combined_sample_ids = (
            pipeline
            | transforms.Create(variants)
            | 'CombineSampleIds' >>
            combine_sample_ids.SampleIdsCombiner(preserve_sample_order=True)
            | combiners.ToList())
        assert_that(combined_sample_ids, equal_to([sample_ids]))
        pipeline.run()
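
Taken together, the tests in this section pin down the combiner's contract: it emits each distinct sample ID once; a variant that lists the same sample twice is an error (Example #2); and with preserve_sample_order=True every variant must list its calls in the same order, otherwise a ValueError is raised (Example #4). The following is a minimal sketch of a transform with that contract, not the actual gcp-variant-transforms implementation; the class and helper names are hypothetical.

import apache_beam as beam


class SampleIdsCombinerSketch(beam.PTransform):
    """Hypothetical stand-in that illustrates the contract tested here."""

    def __init__(self, preserve_sample_order=False):
        self._preserve_sample_order = preserve_sample_order

    def expand(self, variants):
        if self._preserve_sample_order:
            return (variants
                    | beam.Map(lambda v: tuple(_extract_sample_ids(v)))
                    | beam.Distinct()
                    | beam.combiners.ToList()
                    | beam.FlatMap(_require_single_order))
        return (variants
                | beam.FlatMap(_extract_sample_ids)
                | beam.Distinct())


def _extract_sample_ids(variant):
    ids = [call.sample_id for call in variant.calls]
    if len(set(ids)) != len(ids):
        # Mirrors Example #2: duplicate sample IDs in one variant are an error.
        raise ValueError('Duplicate sample IDs found: {}'.format(ids))
    return ids


def _require_single_order(orders):
    # Mirrors Example #4: all variants must agree on a single sample order.
    if len(orders) != 1:
        raise ValueError('Sample order is not consistent across variants.')
    return orders[0]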
Example #2
    def test_sample_ids_combiner_pipeline_duplicate_sample_ids(self):
        variant_call = vcfio.VariantCall(sample_id=hash_name('sample1'))
        variants = [vcfio.Variant(calls=[variant_call, variant_call])]

        pipeline = TestPipeline()
        _ = (pipeline
             | transforms.Create(variants)
             | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner()
             | combiners.ToList())
        with self.assertRaises(ValueError):
            pipeline.run()
Example #3
def _shard_variants(known_args, pipeline_args, pipeline_mode):
    # type: (argparse.Namespace, List[str], int) -> List[str]
    """Reads the variants and writes them to VCF shards.

  Returns:
   The VCF shards directory.
  """
    options = pipeline_options.PipelineOptions(pipeline_args)
    google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
    shard_files_job_name = pipeline_common.generate_unique_name(
        _SHARD_VCF_FILES_JOB_NAME)
    _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
    vcf_shards_output_dir = filesystems.FileSystems.join(
        known_args.annotation_output_dir, _SHARDS_FOLDER)
    with beam.Pipeline(options=options) as p:
        variants = _read_variants(known_args.all_patterns,
                                  p,
                                  known_args,
                                  pipeline_mode,
                                  pre_infer_headers=False,
                                  keep_raw_sample_names=True,
                                  use_1_based_coordinate=False)
        sample_ids = (
            variants
            | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner()
            | 'CombineToList' >> beam.combiners.ToList())
        # TODO(tneymanov): Annotation pipeline currently stores sample IDs
        # instead of sample names in the sharded VCF files, which would lead to
        # double hashing of samples. Needs to be fixed ASAP.
        _ = (variants
             | 'DensifyVariants' >> densify_variants.DensifyVariants(
                 beam.pvalue.AsSingleton(sample_ids))
             | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
                 vcf_shards_output_dir, beam.pvalue.AsSingleton(sample_ids),
                 known_args.number_of_variants_per_shard))

    return [
        vep_runner_util.format_dir_path(vcf_shards_output_dir) +
        _GCS_RECURSIVE_WILDCARD
    ]
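
Note that the single returned element is a recursive wildcard pattern over the shards directory, not the directory itself. A hedged sketch of how such a pattern list could be consumed downstream (the variable names are illustrative; in this project the patterns presumably feed the VEP runner):

from apache_beam.io import filesystems

shard_patterns = _shard_variants(known_args, pipeline_args, pipeline_mode)
match_results = filesystems.FileSystems.match(shard_patterns)
shard_paths = [file_metadata.path
               for result in match_results
               for file_metadata in result.metadata_list]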
Example #4
    def test_sample_ids_combiner_pipeline_preserve_sample_order_error(self):
        sample_ids = [
            hash_name('sample1'),
            hash_name('sample2'),
            hash_name('sample3')
        ]
        variant_calls = [
            vcfio.VariantCall(sample_id=sample_ids[0]),
            vcfio.VariantCall(sample_id=sample_ids[1]),
            vcfio.VariantCall(sample_id=sample_ids[2])
        ]
        variants = [
            vcfio.Variant(calls=[variant_calls[0], variant_calls[1]]),
            vcfio.Variant(calls=[variant_calls[1], variant_calls[2]])
        ]

        pipeline = TestPipeline()
        _ = (pipeline
             | transforms.Create(variants)
             | 'CombineSampleIds' >>
             combine_sample_ids.SampleIdsCombiner(preserve_sample_order=True)
             | combiners.ToList())
        with self.assertRaises(ValueError):
            pipeline.run()
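
Why this test fails: the first variant yields the ordered ID tuple for ('sample1', 'sample2') and the second for ('sample2', 'sample3'). An order-preserving combine cannot reconcile two different orderings, so a ValueError is raised. In terms of the sketch after Example #1 (using plain names in place of hashed IDs for brevity):

orders = {('sample1', 'sample2'), ('sample2', 'sample3')}
assert len(orders) != 1  # two conflicting orders -> ValueError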
def _bigquery_to_vcf_shards(
        known_args,  # type: argparse.Namespace
        beam_pipeline_options,  # type: pipeline_options.PipelineOptions
        vcf_data_temp_folder,  # type: str
        header_file_path,  # type: str
):
    # type: (...) -> None
    """Runs BigQuery to VCF shards pipelines.

  It reads the variants from BigQuery table, groups a collection of variants
  within a contiguous region of the genome (the size of the collection is
  adjustable through flag `--number_of_bases_per_shard`), sorts them, and then
  writes to one VCF file. All VCF data files are saved in
  `vcf_data_temp_folder`.

  Also, it writes the meta info and data header with the sample names to
  `vcf_header_file_path`.
  """
    schema = _get_schema(known_args.input_table)
    variant_query = _get_variant_query(known_args, schema)
    logging.info('Processing BigQuery query: %s', variant_query)
    project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
        known_args.input_table)
    bq_variant_source = bigquery.BigQuerySource(query=variant_query,
                                                validate=True,
                                                use_standard_sql=True)
    annotation_names = _extract_annotation_names(schema)

    base_table_id = bigquery_util.get_table_base_name(table_id)
    sample_query = _SAMPLE_INFO_QUERY_TEMPLATE.format(
        PROJECT_ID=project_id,
        DATASET_ID=dataset_id,
        TABLE_NAME=bigquery_util.compose_table_name(base_table_id,
                                                    SAMPLE_INFO_TABLE_SUFFIX))
    bq_sample_source = bigquery.BigQuerySource(query=sample_query,
                                               validate=True,
                                               use_standard_sql=True)
    with beam.Pipeline(options=beam_pipeline_options) as p:
        variants = (p
                    | 'ReadFromBigQuery' >> beam.io.Read(bq_variant_source)
                    | bigquery_to_variant.BigQueryToVariant(annotation_names))
        sample_table_rows = (
            p
            | 'ReadFromSampleTable' >> beam.io.Read(bq_sample_source))
        if known_args.sample_names:
            temp_sample_names = (p
                                 | transforms.Create(known_args.sample_names,
                                                     reshuffle=False))
        else:
            # Get sample names from sample IDs in the variants and sort.
            id_to_name_hash_table = (sample_table_rows
                                     | 'SampleIdToNameDict' >>
                                     sample_mapping_table.SampleIdToNameDict())
            temp_sample_ids = (
                variants
                | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner(
                    known_args.preserve_sample_order))
            temp_sample_names = (
                temp_sample_ids
                | 'GetSampleNames' >> sample_mapping_table.GetSampleNames(
                    beam.pvalue.AsSingleton(id_to_name_hash_table))
                | 'CombineToList' >> beam.combiners.ToList()
                | 'SortSampleNames' >> beam.ParDo(sorted))

        name_to_id_hash_table = (
            sample_table_rows
            | 'SampleNameToIdDict' >>
            sample_mapping_table.SampleNameToIdDict())
        sample_ids = (temp_sample_names
                      | 'GetSampleIds' >> sample_mapping_table.GetSampleIds(
                          beam.pvalue.AsSingleton(name_to_id_hash_table))
                      | 'CombineSortedSampleIds' >> beam.combiners.ToList())
        sample_names = temp_sample_names | beam.combiners.ToList()

        _ = (sample_names
             | 'GenerateVcfDataHeader' >> beam.ParDo(
                 _write_vcf_header_with_sample_names, _VCF_FIXED_COLUMNS,
                 known_args.representative_header_file, header_file_path))

        _ = (variants
             | densify_variants.DensifyVariants(
                 beam.pvalue.AsSingleton(sample_ids))
             | 'PairVariantWithKey' >> beam.Map(
                 _pair_variant_with_key, known_args.number_of_bases_per_shard)
             | 'GroupVariantsByKey' >> beam.GroupByKey()
             | beam.ParDo(_get_file_path_and_sorted_variants,
                          vcf_data_temp_folder)
             | vcfio.WriteVcfDataLines(known_args.bq_uses_1_based_coordinate))
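
The docstring's contiguous-region grouping hinges on the key produced by `_pair_variant_with_key`, which is not shown here. A minimal sketch of one plausible keying scheme, assuming variants expose `reference_name` and `start` (the actual implementation may differ):

def _pair_variant_with_key_sketch(variant, number_of_bases_per_shard):
    # Bucket variants by reference name and window start so that all
    # variants inside one contiguous window share the same key.
    window_start = (variant.start // number_of_bases_per_shard
                    * number_of_bases_per_shard)
    return ('{}_{:011d}'.format(variant.reference_name, window_start), variant)

Grouping by this key (the 'GroupVariantsByKey' step) then collects every variant of a window into one shard before the variants are sorted and written.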