예제 #1
0
def _annotate_vcf_files(all_patterns, known_args, pipeline_args):
  # type: (List[str], argparse.Namespace, List[str]) -> str
  """Annotates the VCF files using VEP.

  Returns:
    The annotated VCF files directory.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  annotate_files_job_name = pipeline_common.generate_unique_name(
      _ANNOTATE_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, annotate_files_job_name)

  with beam.Pipeline(options=options) as p:
    _ = (p
         | beam.Create(all_patterns)
         | 'AnnotateShards' >> beam.ParDo(
             annotate_files.AnnotateFile(known_args, pipeline_args)))
  if known_args.annotation_fields:
    known_args.annotation_fields.append(known_args.vep_info_field)
  else:
    known_args.annotation_fields = [known_args.vep_info_field]
  # TODO(bashir2): The VEP runner by default runs VEP with --allele_number hence
  # we turn on this feature here. However, this might be inconsistent with other
  # annotation fields that are originally present in input files, if they do not
  # have ALLELE_NUM annotation. The fix is to make annotation ALT matching
  # smarter to fall back on other matching methods if ALLELE_NUM is not present.
  # When this is implemented, we may even consider removing use_allele_num flag
  # and always start by checking if ALLELE_NUM is present.
  known_args.use_allele_num = True
  return vep_runner_util.get_output_pattern(known_args.annotation_output_dir)
 def get_output_pattern(self):
     # type: () -> str
     return vep_runner_util.get_output_pattern(self._output_dir)