def _shard_variants(known_args, pipeline_args, pipeline_mode):
  # type: (argparse.Namespace, List[str], int) -> List[str]
  """Reads the variants and writes them to VCF shards.

  Returns:
    The pattern that matches the written VCF shards.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  shard_files_job_name = pipeline_common.generate_unique_name(
      _SHARD_VCF_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
  vcf_shards_output_dir = filesystems.FileSystems.join(
      known_args.annotation_output_dir, _SHARDS_FOLDER)
  with beam.Pipeline(options=options) as p:
    variants = _read_variants(
        known_args.all_patterns, p, known_args, pipeline_mode)
    call_names = (variants
                  | 'CombineCallNames' >>
                  combine_call_names.CallNamesCombiner())
    _ = (variants
         | 'DensifyVariants' >> densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(call_names))
         | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
             vcf_shards_output_dir,
             beam.pvalue.AsSingleton(call_names),
             known_args.number_of_variants_per_shard))

  return [vep_runner_util.format_dir_path(vcf_shards_output_dir) +
          _GCS_RECURSIVE_WILDCARD]
def _annotate_vcf_files(all_patterns, known_args, pipeline_args):
  # type: (List[str], argparse.Namespace, List[str]) -> str
  """Annotates the VCF files using VEP.

  Returns:
    The pattern that matches the annotated VCF files.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  annotate_files_job_name = pipeline_common.generate_unique_name(
      _ANNOTATE_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, annotate_files_job_name)

  with beam.Pipeline(options=options) as p:
    _ = (p
         | beam.Create(all_patterns)
         | 'AnnotateShards' >> beam.ParDo(
             annotate_files.AnnotateFile(known_args, pipeline_args)))
  if known_args.annotation_fields:
    known_args.annotation_fields.append(known_args.vep_info_field)
  else:
    known_args.annotation_fields = [known_args.vep_info_field]
  # TODO(bashir2): The VEP runner by default runs VEP with --allele_number,
  # hence we turn on this feature here. However, this might be inconsistent
  # with other annotation fields that are originally present in the input
  # files, if they do not have the ALLELE_NUM annotation. The fix is to make
  # annotation ALT matching smarter, so that it falls back on other matching
  # methods if ALLELE_NUM is not present. When this is implemented, we may
  # even consider removing the use_allele_num flag and always start by
  # checking whether ALLELE_NUM is present.
  known_args.use_allele_num = True
  return vep_runner_util.get_output_pattern(known_args.annotation_output_dir)
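# The sketch below is illustrative only and not part of the original module.
# It shows, under the assumption that a single caller drives all three steps,
# how the sharding, annotation and header-merging functions in this file could
# be chained: the shard patterns returned by _shard_variants feed
# _annotate_vcf_files, whose output pattern is then handed to _merge_headers.
def _example_annotation_flow(known_args, pipeline_args, pipeline_mode):
  # type: (argparse.Namespace, List[str], int) -> str
  """Hypothetical driver sketch; the name and exact flow are assumptions."""
  shard_patterns = _shard_variants(known_args, pipeline_args, pipeline_mode)
  annotated_vcf_pattern = _annotate_vcf_files(
      shard_patterns, known_args, pipeline_args)
  _merge_headers(
      known_args, pipeline_args, pipeline_mode, annotated_vcf_pattern)
  return annotated_vcf_pattern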
def _merge_headers(known_args, pipeline_args, pipeline_mode,
                   annotated_vcf_pattern=None):
  # type: (argparse.Namespace, List[str], int, str) -> None
  """Merges VCF headers using beam based on pipeline_mode."""
  options = pipeline_options.PipelineOptions(pipeline_args)

  # Always run pipeline locally if data is small.
  if (pipeline_mode == pipeline_common.PipelineModes.SMALL and
      not known_args.infer_headers and not known_args.infer_annotation_types):
    options.view_as(pipeline_options.StandardOptions).runner = 'DirectRunner'

  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  merge_headers_job_name = pipeline_common.generate_unique_name(
      _MERGE_HEADERS_JOB_NAME)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + merge_headers_job_name
  else:
    google_cloud_options.job_name = merge_headers_job_name

  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  temp_merged_headers_file_name = '-'.join(
      [google_cloud_options.job_name, _MERGE_HEADERS_FILE_NAME])
  temp_merged_headers_file_path = filesystems.FileSystems.join(
      temp_directory, temp_merged_headers_file_name)

  if not known_args.append:
    bigquery_util.create_sample_info_table(known_args.output_table)

  with beam.Pipeline(options=options) as p:
    headers = pipeline_common.read_headers(
        p, pipeline_mode, known_args.all_patterns)
    _ = (headers
         | 'SampleInfoToBigQuery' >>
         sample_info_to_bigquery.SampleInfoToBigQuery(
             known_args.output_table,
             SampleNameEncoding[known_args.sample_name_encoding],
             known_args.append))
    if known_args.representative_header_file:
      return
    merged_header = pipeline_common.get_merged_headers(
        headers,
        known_args.split_alternate_allele_info_fields,
        known_args.allow_incompatible_records)
    if annotated_vcf_pattern:
      merged_header = pipeline_common.add_annotation_headers(
          p, known_args, pipeline_mode, merged_header, annotated_vcf_pattern)
    if known_args.infer_headers or known_args.infer_annotation_types:
      infer_headers_input_pattern = (
          [annotated_vcf_pattern]
          if annotated_vcf_pattern else known_args.all_patterns)
      merged_header = _add_inferred_headers(
          infer_headers_input_pattern, p, known_args, merged_header,
          pipeline_mode)

    pipeline_common.write_headers(merged_header, temp_merged_headers_file_path)
    known_args.representative_header_file = temp_merged_headers_file_path
def _get_input_dimensions(known_args, pipeline_args):
  # type: (argparse.Namespace, List[str]) -> None
  """Runs a pipeline that estimates the input size and stores the results.

  The estimated variant, sample and value counts, the total file size, and
  the file count are written to a temporary file and copied onto known_args.
  """
  pipeline_mode = pipeline_common.get_pipeline_mode(known_args.all_patterns)
  beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = beam_pipeline_options.view_as(
      pipeline_options.GoogleCloudOptions)

  estimate_sizes_job_name = pipeline_common.generate_unique_name(
      _ESTIMATE_SIZES_JOB_NAME)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + estimate_sizes_job_name
  else:
    google_cloud_options.job_name = estimate_sizes_job_name
  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  temp_estimated_input_size_file_name = '-'.join(
      [google_cloud_options.job_name, _ESTIMATE_SIZES_FILE_NAME])
  temp_estimated_input_size_file_path = filesystems.FileSystems.join(
      temp_directory, temp_estimated_input_size_file_name)

  with beam.Pipeline(options=beam_pipeline_options) as p:
    estimates = pipeline_common.get_estimates(
        p, pipeline_mode, known_args.all_patterns)

    files_size = (estimates
                  | 'GetFilesSize' >> extract_input_size.GetFilesSize())
    file_count = (estimates
                  | 'CountAllFiles' >> beam.combiners.Count.Globally())
    sample_map = (estimates
                  | 'ExtractSampleMap' >> extract_input_size.GetSampleMap())
    estimated_value_count = (sample_map
                             | extract_input_size.GetEstimatedValueCount())
    estimated_sample_count = (sample_map
                              | extract_input_size.GetEstimatedSampleCount())
    estimated_variant_count = (estimates
                               | 'GetEstimatedVariantCount' >>
                               extract_input_size.GetEstimatedVariantCount())
    _ = (estimated_variant_count
         | beam.ParDo(extract_input_size.print_estimates_to_file,
                      beam.pvalue.AsSingleton(estimated_sample_count),
                      beam.pvalue.AsSingleton(estimated_value_count),
                      beam.pvalue.AsSingleton(files_size),
                      beam.pvalue.AsSingleton(file_count),
                      temp_estimated_input_size_file_path))

  with filesystems.FileSystems.open(temp_estimated_input_size_file_path) as f:
    estimates = f.readlines()
  if len(estimates) != 5:
    raise ValueError('Exactly 5 estimates were expected in {}.'.format(
        temp_estimated_input_size_file_path))

  known_args.estimated_variant_count = int(estimates[0].strip())
  known_args.estimated_sample_count = int(estimates[1].strip())
  known_args.estimated_value_count = int(estimates[2].strip())
  known_args.files_size = int(estimates[3].strip())
  known_args.file_count = int(estimates[4].strip())
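# For illustration only: the temporary estimates file parsed above is expected
# to contain exactly five integers, one per line, in the order shown below
# (the values here are invented):
#
#   5000000        # estimated_variant_count
#   2504           # estimated_sample_count
#   12500000000    # estimated_value_count
#   1099511627776  # files_size
#   3000           # file_count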
def run(argv=None):
  # type: (List[str]) -> None
  """Runs BigQuery to VCF pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  options = pipeline_options.PipelineOptions(pipeline_args)
  is_direct_runner = pipeline_common.is_pipeline_direct_runner(
      beam.Pipeline(options=options))
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  if not google_cloud_options.project:
    raise ValueError('project must be set.')
  if not is_direct_runner and not known_args.output_file.startswith('gs://'):
    raise ValueError('Please set the output file {} to GCS when running with '
                     'DataflowRunner.'.format(known_args.output_file))
  if is_direct_runner:
    known_args.number_of_bases_per_shard = sys.maxsize

  temp_folder = google_cloud_options.temp_location or tempfile.mkdtemp()
  unique_temp_id = pipeline_common.generate_unique_name(
      google_cloud_options.job_name or _BQ_TO_VCF_SHARDS_JOB_NAME)
  vcf_data_temp_folder = filesystems.FileSystems.join(
      temp_folder,
      '{}_data_temp_files'.format(unique_temp_id))
  # Create the directory manually. FileSystems cannot create a file if the
  # directory does not exist when using Direct Runner.
  filesystems.FileSystems.mkdirs(vcf_data_temp_folder)
  vcf_header_file_path = filesystems.FileSystems.join(
      temp_folder,
      '{}_header_with_sample_ids.vcf'.format(unique_temp_id))

  if not known_args.representative_header_file:
    known_args.representative_header_file = filesystems.FileSystems.join(
        temp_folder,
        '{}_meta_info.vcf'.format(unique_temp_id))
    _write_vcf_meta_info(known_args.input_table,
                         known_args.representative_header_file,
                         known_args.allow_incompatible_schema)

  _bigquery_to_vcf_shards(known_args, options, vcf_data_temp_folder,
                          vcf_header_file_path)
  if is_direct_runner:
    vcf_file_composer.compose_local_vcf_shards(vcf_header_file_path,
                                               vcf_data_temp_folder,
                                               known_args.output_file)
  else:
    vcf_file_composer.compose_gcs_vcf_shards(google_cloud_options.project,
                                             vcf_header_file_path,
                                             vcf_data_temp_folder,
                                             known_args.output_file)
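# A minimal entry-point sketch (assumed, not taken from this section) showing
# how run() would typically be wired up when the file is executed as a script:
if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()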
def _shard_variants(known_args, pipeline_args, pipeline_mode):
  # type: (argparse.Namespace, List[str], int) -> List[str]
  """Reads the variants and writes them to VCF shards.

  Returns:
    The pattern that matches the written VCF shards.
  """
  options = pipeline_options.PipelineOptions(pipeline_args)
  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  shard_files_job_name = pipeline_common.generate_unique_name(
      _SHARD_VCF_FILES_JOB_NAME)
  _update_google_cloud_job_name(google_cloud_options, shard_files_job_name)
  vcf_shards_output_dir = filesystems.FileSystems.join(
      known_args.annotation_output_dir, _SHARDS_FOLDER)
  with beam.Pipeline(options=options) as p:
    variants = _read_variants(known_args.all_patterns,
                              p,
                              known_args,
                              pipeline_mode,
                              pre_infer_headers=False,
                              keep_raw_sample_names=True,
                              use_1_based_coordinate=False)
    sample_ids = (variants
                  | 'CombineSampleIds' >>
                  combine_sample_ids.SampleIdsCombiner()
                  | 'CombineToList' >> beam.combiners.ToList())
    # TODO(tneymanov): The annotation pipeline currently stores sample IDs
    # instead of sample names in the sharded VCF files, which would lead to
    # double hashing of samples. Needs to be fixed ASAP.
    _ = (variants
         | 'DensifyVariants' >> densify_variants.DensifyVariants(
             beam.pvalue.AsSingleton(sample_ids))
         | 'WriteToShards' >> write_variants_to_shards.WriteToShards(
             vcf_shards_output_dir,
             beam.pvalue.AsSingleton(sample_ids),
             known_args.number_of_variants_per_shard))

  return [vep_runner_util.format_dir_path(vcf_shards_output_dir) +
          _GCS_RECURSIVE_WILDCARD]
def _merge_headers(known_args, pipeline_args, pipeline_mode):
  # type: (argparse.Namespace, List[str], int) -> None
  """Merges VCF headers using beam based on pipeline_mode."""
  if known_args.representative_header_file:
    return

  options = pipeline_options.PipelineOptions(pipeline_args)

  # Always run pipeline locally if data is small.
  if (pipeline_mode == pipeline_common.PipelineModes.SMALL and
      not known_args.infer_headers and not known_args.infer_annotation_types):
    options.view_as(pipeline_options.StandardOptions).runner = 'DirectRunner'

  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  merge_headers_job_name = pipeline_common.generate_unique_name(
      _MERGE_HEADERS_JOB_NAME)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + merge_headers_job_name
  else:
    google_cloud_options.job_name = merge_headers_job_name

  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  temp_merged_headers_file_name = '-'.join(
      [google_cloud_options.job_name, _MERGE_HEADERS_FILE_NAME])
  temp_merged_headers_file_path = filesystems.FileSystems.join(
      temp_directory, temp_merged_headers_file_name)

  with beam.Pipeline(options=options) as p:
    headers = pipeline_common.read_headers(p, pipeline_mode, known_args)
    merged_header = pipeline_common.get_merged_headers(
        headers,
        known_args.split_alternate_allele_info_fields,
        known_args.allow_incompatible_records)
    if known_args.infer_headers or known_args.infer_annotation_types:
      merged_header = _add_inferred_headers(p, known_args, merged_header)
    pipeline_common.write_headers(merged_header, temp_merged_headers_file_path)
    known_args.representative_header_file = temp_merged_headers_file_path
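# Hypothetical usage sketch (not from the original section): the caller is
# assumed to resolve the representative header before building the main
# pipeline, e.g.:
#
#   pipeline_mode = pipeline_common.get_pipeline_mode(known_args.all_patterns)
#   _merge_headers(known_args, pipeline_args, pipeline_mode)
#   # known_args.representative_header_file now points either at the file the
#   # user supplied or at the merged headers written to the temp location.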