def _merge_headers(known_args, pipeline_args,
                   pipeline_mode, annotated_vcf_pattern=None):
  # type: (argparse.Namespace, List[str], int, str) -> None
  """Merges VCF headers using beam based on pipeline_mode."""
  options = pipeline_options.PipelineOptions(pipeline_args)

  # Always run pipeline locally if data is small.
  if (pipeline_mode == pipeline_common.PipelineModes.SMALL and
      not known_args.infer_headers and not known_args.infer_annotation_types):
    options.view_as(pipeline_options.StandardOptions).runner = 'DirectRunner'

  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  # Suffix the job name so the header-merge job is uniquely identifiable.
  merge_headers_job_name = pipeline_common.generate_unique_name(
      _MERGE_HEADERS_JOB_NAME)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + merge_headers_job_name
  else:
    google_cloud_options.job_name = merge_headers_job_name
  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  temp_merged_headers_file_name = '-'.join(
      [google_cloud_options.job_name, _MERGE_HEADERS_FILE_NAME])
  temp_merged_headers_file_path = filesystems.FileSystems.join(
      temp_directory, temp_merged_headers_file_name)

  if not known_args.append:
    bigquery_util.create_sample_info_table(known_args.output_table)

  with beam.Pipeline(options=options) as p:
    headers = pipeline_common.read_headers(
        p, pipeline_mode, known_args.all_patterns)
    # Sample info is written to BigQuery regardless of whether a
    # representative header file was supplied.
    _ = (headers
         | 'SampleInfoToBigQuery' >>
         sample_info_to_bigquery.SampleInfoToBigQuery(
             known_args.output_table,
             SampleNameEncoding[known_args.sample_name_encoding],
             known_args.append))
    if known_args.representative_header_file:
      return
    merged_header = pipeline_common.get_merged_headers(
        headers,
        known_args.split_alternate_allele_info_fields,
        known_args.allow_incompatible_records)
    if annotated_vcf_pattern:
      merged_header = pipeline_common.add_annotation_headers(
          p, known_args, pipeline_mode, merged_header,
          annotated_vcf_pattern)
    if known_args.infer_headers or known_args.infer_annotation_types:
      infer_headers_input_pattern = (
          [annotated_vcf_pattern] if annotated_vcf_pattern
          else known_args.all_patterns)
      merged_header = _add_inferred_headers(
          infer_headers_input_pattern, p, known_args, merged_header,
          pipeline_mode)

    pipeline_common.write_headers(merged_header, temp_merged_headers_file_path)
    known_args.representative_header_file = temp_merged_headers_file_path
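# A minimal, hypothetical driver for the function above; this is a sketch,
# not the repository's actual caller. It assumes parse_args and
# get_pipeline_mode behave as they do in run() below and that
# _COMMAND_LINE_OPTIONS is defined at module level.
def _run_merge_headers_example(argv=None):
  # type: (List[str]) -> None
  known_args, pipeline_args = pipeline_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  pipeline_mode = pipeline_common.get_pipeline_mode(known_args.all_patterns)
  _merge_headers(known_args, pipeline_args, pipeline_mode,
                 annotated_vcf_pattern=None)
  # After the call, known_args.representative_header_file points at the
  # merged header file written by the pipeline.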
def run(argv=None):
  # type: (List[str]) -> None
  """Runs preprocess pipeline."""
  logging.info('Command: %s', ' '.join(argv or sys.argv))
  known_args, pipeline_args = pipeline_common.parse_args(
      argv, _COMMAND_LINE_OPTIONS)
  options = pipeline_options.PipelineOptions(pipeline_args)
  all_patterns = known_args.all_patterns
  pipeline_mode = pipeline_common.get_pipeline_mode(all_patterns)

  with beam.Pipeline(options=options) as p:
    headers = pipeline_common.read_headers(p, pipeline_mode, all_patterns)
    merged_headers = pipeline_common.get_merged_headers(headers)
    merged_definitions = (
        headers
        | 'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
    if known_args.report_all_conflicts:
      # Reading full records is only needed when reporting all conflicts,
      # since malformed variants and inferred headers come from the records.
      if len(all_patterns) == 1:
        variants = p | 'ReadFromVcf' >> vcfio.ReadFromVcf(
            all_patterns[0], allow_malformed_records=True)
      else:
        variants = (
            p
            | 'InputFilePattern' >> beam.Create(all_patterns)
            | 'ReadAllFromVcf' >> vcfio.ReadAllFromVcf(
                allow_malformed_records=True))

      malformed_records = variants | filter_variants.ExtractMalformedVariants()
      inferred_headers, merged_headers = _get_inferred_headers(
          variants, merged_headers)
      _ = (merged_definitions
           | 'GenerateConflictsReport' >> beam.ParDo(
               preprocess_reporter.generate_report,
               known_args.report_path,
               beam.pvalue.AsSingleton(merged_headers),
               beam.pvalue.AsSingleton(inferred_headers),
               beam.pvalue.AsIter(malformed_records)))
    else:
      _ = (merged_definitions
           | 'GenerateConflictsReport' >> beam.ParDo(
               preprocess_reporter.generate_report,
               known_args.report_path,
               beam.pvalue.AsSingleton(merged_headers)))

    if known_args.resolved_headers_path:
      pipeline_common.write_headers(merged_headers,
                                    known_args.resolved_headers_path)
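# Standard Beam-style module entry point; a minimal sketch that mirrors the
# usual convention for these pipeline modules rather than quoting this
# module's exact main block.
if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()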
def _merge_headers(known_args, pipeline_args, pipeline_mode):
  # type: (argparse.Namespace, List[str], int) -> None
  """Merges VCF headers using beam based on pipeline_mode."""
  if known_args.representative_header_file:
    return

  options = pipeline_options.PipelineOptions(pipeline_args)

  # Always run pipeline locally if data is small.
  if (pipeline_mode == pipeline_common.PipelineModes.SMALL and
      not known_args.infer_headers and not known_args.infer_annotation_types):
    options.view_as(pipeline_options.StandardOptions).runner = 'DirectRunner'

  google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  merge_headers_job_name = pipeline_common.generate_unique_name(
      _MERGE_HEADERS_JOB_NAME)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + merge_headers_job_name
  else:
    google_cloud_options.job_name = merge_headers_job_name
  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  temp_merged_headers_file_name = '-'.join(
      [google_cloud_options.job_name, _MERGE_HEADERS_FILE_NAME])
  temp_merged_headers_file_path = filesystems.FileSystems.join(
      temp_directory, temp_merged_headers_file_name)

  with beam.Pipeline(options=options) as p:
    headers = pipeline_common.read_headers(p, pipeline_mode, known_args)
    merged_header = pipeline_common.get_merged_headers(
        headers,
        known_args.split_alternate_allele_info_fields,
        known_args.allow_incompatible_records)
    if known_args.infer_headers or known_args.infer_annotation_types:
      merged_header = _add_inferred_headers(p, known_args, merged_header)
    pipeline_common.write_headers(merged_header, temp_merged_headers_file_path)
    known_args.representative_header_file = temp_merged_headers_file_path
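# Hypothetical helper for illustration only: reads back the merged headers
# that _merge_headers wrote via pipeline_common.write_headers.
# FileSystems.open is the real Beam filesystem API; the helper itself is an
# assumption, not part of the module.
def _read_merged_headers(path):
  # type: (str) -> bytes
  with filesystems.FileSystems.open(path) as f:
    return f.read()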