Example #1
def run(argv=None):
    # type: (List[str]) -> None
    """Runs BigQuery to VCF pipeline."""
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = vcf_to_bq_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    options = pipeline_options.PipelineOptions(pipeline_args)
    google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
    # TODO(allieychen): Add support for local location.
    if not google_cloud_options.temp_location or not google_cloud_options.project:
        raise ValueError('temp_location and project must be set.')

    timestamp_str = datetime.now().strftime('%Y%m%d_%H%M%S')
    vcf_data_temp_folder = filesystems.FileSystems.join(
        google_cloud_options.temp_location,
        'bq_to_vcf_data_temp_files_{}'.format(timestamp_str))
    vcf_data_header_file_path = filesystems.FileSystems.join(
        google_cloud_options.temp_location,
        'bq_to_vcf_data_header_{}'.format(timestamp_str))

    _bigquery_to_vcf_shards(known_args, options, vcf_data_temp_folder,
                            vcf_data_header_file_path)
    vcf_file_composer.compose_vcf_shards(google_cloud_options.project,
                                         vcf_data_header_file_path,
                                         vcf_data_temp_folder,
                                         known_args.output_file)
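
The run() above relies on Beam's GoogleCloudOptions view for --project and
--temp_location. A minimal, self-contained illustration of that mechanism (not
part of this pipeline, with hypothetical flag values) might look like:

from apache_beam.options import pipeline_options

# Hypothetical flag values for illustration only.
opts = pipeline_options.PipelineOptions(
    ['--project=my-project', '--temp_location=gs://my-bucket/tmp'])
gcloud = opts.view_as(pipeline_options.GoogleCloudOptions)
print(gcloud.project, gcloud.temp_location)  # my-project gs://my-bucket/tmp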
Example #2
def run(argv=None):
    # type: (List[str]) -> None
    """Runs BigQuery to VCF pipeline."""
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = vcf_to_bq_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    options = pipeline_options.PipelineOptions(pipeline_args)
    is_direct_runner = _is_direct_runner(beam.Pipeline(options=options))
    google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
    if not google_cloud_options.project:
        raise ValueError('project must be set.')
    if not is_direct_runner and not known_args.output_file.startswith('gs://'):
        raise ValueError(
            'Please set the output file {} to GCS when running with '
            'DataflowRunner.'.format(known_args.output_file))
    if is_direct_runner:
        known_args.number_of_bases_per_shard = sys.maxsize

    temp_folder = google_cloud_options.temp_location or tempfile.mkdtemp()
    # TODO(allieychen): Refactor the generation of the unique temp id to a common
    # lib.
    unique_temp_id = '-'.join([
        google_cloud_options.job_name or _BQ_TO_VCF_SHARDS_JOB_NAME,
        datetime.now().strftime('%Y%m%d-%H%M%S'),
        str(uuid.uuid4())
    ])
    vcf_data_temp_folder = filesystems.FileSystems.join(
        temp_folder, '{}_data_temp_files'.format(unique_temp_id))
    # Create the directory manually. FileSystems cannot create a file if the
    # directory does not exist when using Direct Runner.
    filesystems.FileSystems.mkdirs(vcf_data_temp_folder)
    vcf_header_file_path = filesystems.FileSystems.join(
        temp_folder, '{}_header_with_call_names.vcf'.format(unique_temp_id))

    if not known_args.representative_header_file:
        known_args.representative_header_file = filesystems.FileSystems.join(
            temp_folder, '{}_meta_info.vcf'.format(unique_temp_id))
        _write_vcf_meta_info(known_args.input_table,
                             known_args.representative_header_file,
                             known_args.allow_incompatible_schema)

    _bigquery_to_vcf_shards(known_args, options, vcf_data_temp_folder,
                            vcf_header_file_path)
    if is_direct_runner:
        vcf_file_composer.compose_local_vcf_shards(vcf_header_file_path,
                                                   vcf_data_temp_folder,
                                                   known_args.output_file)
    else:
        vcf_file_composer.compose_gcs_vcf_shards(google_cloud_options.project,
                                                 vcf_header_file_path,
                                                 vcf_data_temp_folder,
                                                 known_args.output_file)
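
The _is_direct_runner helper used above is not shown in this example. A
plausible sketch, assuming it only inspects the runner attached to the
pipeline, could be:

import apache_beam as beam
from apache_beam.runners.direct import direct_runner

def _is_direct_runner(pipeline):
    # type: (beam.Pipeline) -> bool
    # Sketch only; the real helper may handle other runner aliases as well.
    return isinstance(pipeline.runner, direct_runner.DirectRunner)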
Example #3
def run(argv=None):
    # type: (List[str]) -> None
    """Runs BigQuery to VCF pipeline."""
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = vcf_to_bq_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    bq_source = bigquery.BigQuerySource(
        query=_BASE_QUERY_TEMPLATE.format(INPUT_TABLE='.'.join(
            bigquery_util.parse_table_reference(known_args.input_table))),
        validate=True,
        use_standard_sql=True)

    options = pipeline_options.PipelineOptions(pipeline_args)
    with beam.Pipeline(options=options) as p:
        _ = (p | 'ReadFromBigQuery' >> beam.io.Read(bq_source)
             | bigquery_to_variant.BigQueryToVariant()
             | densify_variants.DensifyVariants()
             | vcfio.WriteToVcf(known_args.output_file))
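
_BASE_QUERY_TEMPLATE and bigquery_util.parse_table_reference are defined
elsewhere in the module and are not shown here. Assuming the template simply
selects every row of the fully qualified table, the query construction above
behaves roughly like the following (hypothetical values):

# Hypothetical template and table parts; the real definitions may differ.
_BASE_QUERY_TEMPLATE = 'SELECT * FROM `{INPUT_TABLE}`'
project, dataset, table = 'my-project', 'my_dataset', 'my_table'
query = _BASE_QUERY_TEMPLATE.format(
    INPUT_TABLE='.'.join([project, dataset, table]))
# SELECT * FROM `my-project.my_dataset.my_table`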
Example #4
def run(argv=None):
    # type: (List[str]) -> None
    """Runs preprocess pipeline."""
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = vcf_to_bq_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    options = pipeline_options.PipelineOptions(pipeline_args)
    pipeline_mode = vcf_to_bq_common.get_pipeline_mode(
        known_args.input_pattern)

    with beam.Pipeline(options=options) as p:
        headers = vcf_to_bq_common.read_headers(p, pipeline_mode, known_args)
        merged_headers = vcf_to_bq_common.get_merged_headers(headers)
        merged_definitions = (
            headers
            | 'MergeDefinitions' >> merge_header_definitions.MergeDefinitions())
        if known_args.report_all_conflicts:
            variants = p | 'ReadFromVcf' >> vcfio.ReadFromVcf(
                known_args.input_pattern, allow_malformed_records=True)
            malformed_records = (
                variants | filter_variants.ExtractMalformedVariants())
            inferred_headers, merged_headers = _get_inferred_headers(
                variants, merged_headers)
            _ = (merged_definitions
                 | 'GenerateConflictsReport' >> beam.ParDo(
                     preprocess_reporter.generate_report,
                     known_args.report_path,
                     beam.pvalue.AsSingleton(merged_headers),
                     beam.pvalue.AsSingleton(inferred_headers),
                     beam.pvalue.AsIter(malformed_records)))
        else:
            _ = (merged_definitions
                 | 'GenerateConflictsReport' >> beam.ParDo(
                     preprocess_reporter.generate_report,
                     known_args.report_path,
                     beam.pvalue.AsSingleton(merged_headers)))

        if known_args.resolved_headers_path:
            vcf_to_bq_common.write_headers(merged_headers,
                                           known_args.resolved_headers_path)
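
The report generation above hands merged_headers, inferred_headers, and
malformed_records to the DoFn as Beam side inputs. A minimal, standalone
illustration of that side-input pattern (not taken from this codebase) is:

import apache_beam as beam

def report(element, header, records):
    # Side inputs arrive as ordinary arguments once their PCollections are
    # wrapped in AsSingleton / AsIter.
    print(element, header, list(records))
    return []

with beam.Pipeline() as p:
    main = p | 'Main' >> beam.Create(['definitions'])
    header = p | 'Header' >> beam.Create(['merged header'])
    records = p | 'Records' >> beam.Create(['rec1', 'rec2'])
    _ = main | beam.ParDo(report,
                          beam.pvalue.AsSingleton(header),
                          beam.pvalue.AsIter(records))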
Example #5
def run(argv=None):
    # type: (List[str]) -> None
    """Runs VCF to BigQuery pipeline."""
    logging.info('Command: %s', ' '.join(argv or sys.argv))
    known_args, pipeline_args = vcf_to_bq_common.parse_args(
        argv, _COMMAND_LINE_OPTIONS)
    # Note VepRunner creates new input files, so it should be run before any
    # other access to known_args.input_pattern.
    if known_args.run_annotation_pipeline:
        runner = vep_runner.create_runner_and_update_args(
            known_args, pipeline_args)
        runner.run_on_all_files()
        runner.wait_until_done()
        logging.info('Using VEP processed files: %s', known_args.input_pattern)

    variant_merger = _get_variant_merge_strategy(known_args)
    pipeline_mode = vcf_to_bq_common.get_pipeline_mode(
        known_args.input_pattern, known_args.optimize_for_large_inputs)

    # Starts a pipeline to merge VCF headers in Beam if the number of files
    # that match the input pattern exceeds _SMALL_DATA_THRESHOLD.
    _merge_headers(known_args, pipeline_args, pipeline_mode)

    # Retrieve merged headers prior to launching the pipeline. This is needed
    # since the BigQuery schema cannot yet be dynamically created based on input.
    # See https://issues.apache.org/jira/browse/BEAM-2801.
    header_fields = vcf_header_parser.get_vcf_headers(
        known_args.representative_header_file)
    counter_factory = metrics_util.CounterFactory()
    processed_variant_factory = processed_variant.ProcessedVariantFactory(
        header_fields, known_args.split_alternate_allele_info_fields,
        known_args.annotation_fields, known_args.use_allele_num,
        known_args.minimal_vep_alt_matching, counter_factory)

    partitioner = None
    if known_args.optimize_for_large_inputs or known_args.partition_config_path:
        partitioner = variant_partition.VariantPartition(
            known_args.partition_config_path)

    beam_pipeline_options = pipeline_options.PipelineOptions(pipeline_args)
    pipeline = beam.Pipeline(options=beam_pipeline_options)
    variants = _read_variants(pipeline, known_args)
    variants |= 'FilterVariants' >> filter_variants.FilterVariants(
        reference_names=known_args.reference_names)
    if partitioner:
        num_partitions = partitioner.get_num_partitions()
        partitioned_variants = variants | 'PartitionVariants' >> beam.Partition(
            partition_variants.PartitionVariants(partitioner), num_partitions)
        variants = []
        for i in range(num_partitions):
            if partitioner.should_keep_partition(i):
                variants.append(partitioned_variants[i])
            else:
                num_partitions -= 1
    else:
        # By default we don't partition the data, so we have only 1 partition.
        num_partitions = 1
        variants = [variants]

    for i in range(num_partitions):
        if variant_merger:
            variants[i] |= ('MergeVariants' + str(i) >>
                            merge_variants.MergeVariants(variant_merger))
        variants[i] |= (
            'ProcessVariants' + str(i) >>
            beam.Map(processed_variant_factory.create_processed_variant)
            .with_output_types(processed_variant.ProcessedVariant))
    if partitioner and partitioner.should_flatten():
        variants = [variants | 'FlattenPartitions' >> beam.Flatten()]
        num_partitions = 1

    for i in range(num_partitions):
        table_suffix = ''
        if partitioner and partitioner.get_partition_name(i):
            table_suffix = '_' + partitioner.get_partition_name(i)
        table_name = known_args.output_table + table_suffix
        _ = (
            variants[i] | 'VariantToBigQuery' + table_suffix >>
            variant_to_bigquery.VariantToBigQuery(
                table_name,
                header_fields,
                variant_merger,
                processed_variant_factory,
                append=known_args.append,
                allow_incompatible_records=known_args.allow_incompatible_records,
                omit_empty_sample_calls=known_args.omit_empty_sample_calls,
                num_bigquery_write_shards=known_args.num_bigquery_write_shards)
        )

    result = pipeline.run()
    result.wait_until_finish()

    metrics_util.log_all_counters(result)
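
The partition handling in Example #5 relies on beam.Partition returning one
output PCollection per partition index. A small, self-contained sketch of that
behavior (independent of this pipeline) is:

import apache_beam as beam

with beam.Pipeline() as p:
    # Route each element to one of three output PCollections according to the
    # index returned by the partition function.
    parts = (p
             | beam.Create(range(10))
             | beam.Partition(lambda x, num_partitions: x % num_partitions, 3))
    for i, part in enumerate(parts):
        _ = part | 'Tag{}'.format(i) >> beam.Map(lambda x, i=i: (i, x))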