Example 1
def _shard_variants(known_args, pipeline_args, pipeline_mode):
  # type: (argparse.Namespace, List[str], int) -> List[str]
  """Reads the variants and writes them to VCF shards.

  Returns:
   The VCF shards directory.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  shard_files_job_name = pipeline_common.generate_unique_name(
      _SHARD_VCF_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
  vcf_shards_output_dir = filesystems.FileSystems.join(
      known_args.annotation_output_dir, _SHARDS_FOLDER)
  with beam.Pipeline(options=options) as p:
    variants = _read_variants(
        known_args.all_patterns, p, known_args, pipeline_mode)
    call_names = (variants
                  | 'CombineCallNames' >>
                  combine_call_names.CallNamesCombiner())
    _ = (variants
         | 'DensifyVariants' >> densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(call_names))
         | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
             vcf_shards_output_dir,
             beam.pvalue.AsSingleton(call_names),
             known_args.number_of_variants_per_shard))

  return [vep_runner_util.format_dir_path(vcf_shards_output_dir) +
          _GCS_RECURSIVE_WILDCARD]
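
For orientation, here is a minimal sketch of how a driver might call this helper. The argument parser and the pipeline_mode placeholder below are assumptions for illustration; only _shard_variants itself comes from the example above.

import argparse

def _run_sharding(argv):
  # Hypothetical driver: the flag names mirror the attributes read by
  # _shard_variants, but this parser setup is illustrative only.
  parser = argparse.ArgumentParser()
  parser.add_argument('--annotation_output_dir', required=True)
  parser.add_argument('--all_patterns', nargs='+', required=True)
  parser.add_argument('--number_of_variants_per_shard', type=int,
                      default=20000)
  known_args, pipeline_args = parser.parse_known_args(argv)
  # pipeline_mode selects between small and large pipeline setups in the
  # real code; 0 is a placeholder value here.
  return _shard_variants(known_args, pipeline_args, pipeline_mode=0)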
Example 2
def _check_and_write_to_output_dir(self, output_dir):
  # type: (str) -> None
  real_dir = vep_runner_util.format_dir_path(output_dir)
  # NOTE(bashir2): We cannot use exists() because, for example on GCS,
  # directory names are only symbolic and are not physical files.
  match_results = filesystems.FileSystems.match(['{}*'.format(real_dir)])
  if match_results and match_results[0].metadata_list:
    raise ValueError('Output directory {} already exists.'.format(real_dir))
  log_file = filesystems.FileSystems.create(
      filesystems.FileSystems.join(output_dir, _GLOBAL_LOG_FILE))
  # TODO(bashir2): Instead of just creating an empty file, log some
  # information about how the VEP pipelines are executed.
  log_file.close()
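
The match-based existence test above is worth isolating: on GCS a "directory" is only a key prefix, so FileSystems.match() against 'prefix*' stands in for exists(). A standalone sketch of the same idiom (the function name and example path are illustrative):

from apache_beam.io import filesystems

def _dir_is_nonempty(dir_path):
  # Matching 'prefix*' finds any object stored under the prefix, which works
  # even on GCS where directories are symbolic rather than physical files.
  match_results = filesystems.FileSystems.match(['{}*'.format(dir_path)])
  return bool(match_results and match_results[0].metadata_list)

# e.g. _dir_is_nonempty('gs://my-bucket/vep_output/')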
Example 3
def _validate_annotation_pipeline_args(known_args, pipeline_args):
  # type: (argparse.Namespace, List[str]) -> None
  """Validates the annotation output dir and worker-count pipeline flags."""
  match_results = filesystems.FileSystems.match(['{}*'.format(
      vep_runner_util.format_dir_path(known_args.annotation_output_dir))])
  if match_results and match_results[0].metadata_list:
    raise ValueError('Output directory {} already exists.'.format(
        known_args.annotation_output_dir))

  flags_dict = pipeline_options.PipelineOptions(pipeline_args).get_all_options()
  expected_flags = ['max_num_workers', 'num_workers']
  for flag in expected_flags:
    if flag in flags_dict and flags_dict[flag] > 0:
      return
  raise ValueError('Could not find any of {} with a valid value among pipeline '
                   'flags {}'.format(expected_flags, flags_dict))
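
A hedged usage sketch: num_workers and max_num_workers are standard Dataflow worker flags, but the values and the Namespace construction below are illustrative only.

import argparse

# Illustrative values; in the real pipeline these come from the CLI.
known_args = argparse.Namespace(
    annotation_output_dir='gs://my-bucket/annotation_output')
pipeline_args = ['--runner', 'DataflowRunner', '--num_workers', '4']
# Passes silently when num_workers is positive; raises ValueError if the
# output directory is non-empty or neither worker flag has a positive value.
_validate_annotation_pipeline_args(known_args, pipeline_args)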
Example 4
def _shard_variants(known_args, pipeline_args, pipeline_mode):
    # type: (argparse.Namespace, List[str], int) -> List[str]
    """Reads the variants and writes them to VCF shards.

  Returns:
   The VCF shards directory.
  """
    options = pipeline_options.PipelineOptions(pipeline_args)
    google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
    shard_files_job_name = pipeline_common.generate_unique_name(
        _SHARD_VCF_FILES_JOB_NAME)
    _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
    vcf_shards_output_dir = filesystems.FileSystems.join(
        known_args.annotation_output_dir, _SHARDS_FOLDER)
    with beam.Pipeline(options=options) as p:
        variants = _read_variants(known_args.all_patterns,
                                  p,
                                  known_args,
                                  pipeline_mode,
                                  pre_infer_headers=False,
                                  keep_raw_sample_names=True,
                                  use_1_based_coordinate=False)
        sample_ids = (
            variants
            | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner()
            | 'CombineToList' >> beam.combiners.ToList())
        # TODO(tneymanov): The annotation pipeline currently stores sample IDs
        # instead of sample names in the sharded VCF files, which would lead
        # to double hashing of samples. Needs to be fixed ASAP.
        _ = (variants
             | 'DensifyVariants' >> densify_variants.DensifyVariants(
                 beam.pvalue.AsSingleton(sample_ids))
             | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
                 vcf_shards_output_dir, beam.pvalue.AsSingleton(sample_ids),
                 known_args.number_of_variants_per_shard))

    return [
        vep_runner_util.format_dir_path(vcf_shards_output_dir) +
        _GCS_RECURSIVE_WILDCARD
    ]
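
This revision differs from Example 1 mainly in combining sample IDs (rather than call names) and feeding them to downstream transforms as a singleton side input. A minimal sketch of that side-input pattern using plain Beam primitives, independent of the variant-specific transforms (all data and labels are illustrative):

import apache_beam as beam

with beam.Pipeline() as p:
  records = p | 'Create' >> beam.Create([('s1', 1), ('s2', 2), ('s1', 3)])
  # One branch collapses the distinct keys into a single list...
  sample_ids = (records
                | 'Keys' >> beam.Keys()
                | 'Distinct' >> beam.Distinct()
                | 'ToList' >> beam.combiners.ToList())
  # ...which another branch consumes as a singleton side input, the same
  # shape as beam.pvalue.AsSingleton(sample_ids) in the pipeline above.
  _ = (records
       | 'CountIds' >> beam.Map(
           lambda kv, ids: (kv, len(ids)),
           beam.pvalue.AsSingleton(sample_ids))
       | 'Print' >> beam.Map(print))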