# Example #1
# 0
def _merge_headers(known_args, pipeline_args, pipeline_mode):
    # type: (argparse.Namespace, List[str], int) -> None
    """Merges VCF headers using beam based on pipeline_mode.

    No-op when a representative header file was already supplied; otherwise
    runs a (possibly local) Beam pipeline that merges headers into a temp
    file and records that file path on ``known_args``.
    """
    if known_args.representative_header_file:
        return

    options = pipeline_options.PipelineOptions(pipeline_args)

    # Small inputs that don't need header inference are cheap enough to
    # merge locally instead of launching a remote job.
    is_small = pipeline_mode == vcf_to_bq_common.PipelineModes.SMALL
    if is_small and not known_args.infer_headers:
        standard_options = options.view_as(pipeline_options.StandardOptions)
        standard_options.runner = 'DirectRunner'

    gcp_options = options.view_as(pipeline_options.GoogleCloudOptions)
    # A timestamp suffix keeps job names and merged-header file paths unique
    # when multiple pipelines run concurrently.
    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    merge_headers_job_name = '-'.join([_MERGE_HEADERS_JOB_NAME, timestamp])
    existing_job_name = gcp_options.job_name
    if existing_job_name:
        gcp_options.job_name = existing_job_name + '-' + merge_headers_job_name
    else:
        gcp_options.job_name = merge_headers_job_name

    # Fall back to a local scratch dir when no GCS temp location is set.
    output_directory = gcp_options.temp_location or tempfile.mkdtemp()
    output_file_name = '-'.join(
        [gcp_options.job_name, _MERGE_HEADERS_FILE_NAME])
    merged_headers_path = filesystems.FileSystems.join(
        output_directory, output_file_name)

    with beam.Pipeline(options=options) as p:
        headers = vcf_to_bq_common.read_headers(p, pipeline_mode, known_args)
        merged_header = vcf_to_bq_common.get_merged_headers(
            headers, known_args.split_alternate_allele_info_fields,
            known_args.allow_incompatible_records)
        if known_args.infer_headers or known_args.infer_annotation_types:
            merged_header = _add_inferred_headers(p, known_args, merged_header)
        vcf_to_bq_common.write_headers(merged_header, merged_headers_path)
        # Downstream stages read the merged headers from this path.
        known_args.representative_header_file = merged_headers_path
def run(argv=None):
    # type: (List[str]) -> (str, str)
    """Runs preprocess pipeline."""
    # NOTE(review): the type comment promises a (str, str) return, but no
    # visible line returns one — confirm against the full file.
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = vcf_to_bq_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    options = pipeline_options.PipelineOptions(pipeline_args)
    # Pipeline mode (e.g. small vs. large input) is chosen from the input
    # pattern and forwarded to the header readers below.
    pipeline_mode = vcf_to_bq_common.get_pipeline_mode(
        known_args.input_pattern)

    with beam.Pipeline(options=options) as p:
        # Read all VCF headers and merge them into a single representative
        # header; separately merge the per-field definitions so conflicting
        # definitions can be reported.
        headers = vcf_to_bq_common.read_headers(p, pipeline_mode, known_args)
        merged_headers = vcf_to_bq_common.get_merged_headers(headers)
        merged_definitions = (
            headers
            |
            'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
        if known_args.report_all_conflicts:
            # Full report: also scan the variant records themselves so the
            # report can include inferred headers and malformed records.
            variants = p | 'ReadFromVcf' >> vcfio.ReadFromVcf(
                known_args.input_pattern, allow_malformed_records=True)
            malformed_records = variants | filter_variants.ExtractMalformedVariants(
            )
            inferred_headers, merged_headers = (_get_inferred_headers(
                variants, merged_headers))
            _ = (merged_definitions
                 | 'GenerateConflictsReport' >> beam.ParDo(
                     preprocess_reporter.generate_report,
                     known_args.report_path,
                     beam.pvalue.AsSingleton(merged_headers),
                     beam.pvalue.AsSingleton(inferred_headers),
                     beam.pvalue.AsIter(malformed_records)))
        else:
            # Header-only report: no variant scan, so only definition
            # conflicts are reported.
            _ = (merged_definitions
                 | 'GenerateConflictsReport' >> beam.ParDo(
                     preprocess_reporter.generate_report,
                     known_args.report_path,
                     beam.pvalue.AsSingleton(merged_headers)))

        if known_args.resolved_headers_path:
            # Optionally persist the conflict-resolved headers for reuse.
            vcf_to_bq_common.write_headers(merged_headers,
                                           known_args.resolved_headers_path)