def _merge_headers(known_args, pipeline_args, pipeline_mode):
  """Merges VCF headers using beam based on pipeline_mode."""
  if known_args.representative_header_file:
    return

  options = PipelineOptions(pipeline_args)

  # Always run pipeline locally if data is small.
  if (pipeline_mode == PipelineModes.SMALL and
      not known_args.infer_undefined_headers):
    options.view_as(StandardOptions).runner = 'DirectRunner'

  google_cloud_options = options.view_as(GoogleCloudOptions)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + _MERGE_HEADERS_JOB_NAME
  else:
    google_cloud_options.job_name = _MERGE_HEADERS_JOB_NAME

  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  # Add a time prefix to ensure files are unique in case multiple
  # pipelines are run at the same time.
  temp_merged_headers_file_name = '-'.join([
      datetime.datetime.now().strftime('%Y%m%d-%H%M%S'),
      google_cloud_options.job_name,
      _MERGE_HEADERS_FILE_NAME])
  known_args.representative_header_file = FileSystems.join(
      temp_directory, temp_merged_headers_file_name)

  with beam.Pipeline(options=options) as p:
    headers = p
    if pipeline_mode == PipelineModes.LARGE:
      headers |= (beam.Create([known_args.input_pattern])
                  | vcf_header_io.ReadAllVcfHeaders())
    else:
      headers |= vcf_header_io.ReadVcfHeaders(known_args.input_pattern)

    merged_header = (headers
                     | 'MergeHeaders' >> merge_headers.MergeHeaders(
                         known_args.split_alternate_allele_info_fields))

    if known_args.infer_undefined_headers:
      merged_header = _add_inferred_headers(p, known_args, merged_header)

    _ = (merged_header
         | 'WriteHeaders' >> vcf_header_io.WriteVcfHeaders(
             known_args.representative_header_file))
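
# A minimal, hypothetical driver sketch showing how `_merge_headers` could be
# invoked from a pipeline entry point. The flag names below are assumptions
# mirroring the attributes `_merge_headers` reads (`input_pattern`,
# `representative_header_file`, `infer_undefined_headers`,
# `split_alternate_allele_info_fields`); they are not taken from this module.
def _example_merge_headers_driver():
  import argparse
  parser = argparse.ArgumentParser()
  parser.add_argument('--input_pattern', required=True)
  parser.add_argument('--representative_header_file', default='')
  parser.add_argument('--infer_undefined_headers', action='store_true')
  parser.add_argument('--split_alternate_allele_info_fields',
                      action='store_true')
  # Unrecognized flags (e.g. --runner, --project) are passed through to Beam
  # via pipeline_args.
  known_args, pipeline_args = parser.parse_known_args()
  _merge_headers(known_args, pipeline_args, PipelineModes.SMALL)
  # On return, known_args.representative_header_file points at the merged
  # header file the pipeline wrote (unless it was supplied on the command
  # line, in which case _merge_headers returns immediately).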
def write_headers(merged_header, file_path):
  # type: (pvalue.PCollection, str) -> None
  """Writes a PCollection of ``VcfHeader`` to location ``file_path``."""
  _ = (merged_header
       | 'WriteHeaders' >> vcf_header_io.WriteVcfHeaders(file_path))
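
# Usage sketch for `write_headers`: merge the headers from a glob of VCF
# files in a caller-owned pipeline and write the result. The input pattern
# and output path are placeholders, and MergeHeaders is given its
# split_alternate_allele_info_fields argument positionally, matching the
# call in `_merge_headers` above.
def _example_write_headers(input_pattern, output_path):
  with beam.Pipeline(options=PipelineOptions(['--runner=DirectRunner'])) as p:
    merged = (p
              | vcf_header_io.ReadVcfHeaders(input_pattern)
              | 'MergeHeaders' >> merge_headers.MergeHeaders(True))
    write_headers(merged, output_path)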