def step2_export_to_elasticsearch(hc, vds, args):
    if args.start_with_step > 2 or args.stop_after_step < 2 or args.only_export_to_elasticsearch_at_the_end:
        return hc, vds

    logger.info(
        "\n\n=============================== pipeline - step 2 - export to elasticsearch ==============================="
    )

    if vds is None or not args.skip_writing_intermediate_vds:
        stop_hail_context(hc)
        hc = create_hail_context()
        vds = read_in_dataset(hc,
                              args.step1_output_vds,
                              dataset_type=args.dataset_type,
                              skip_summary=True,
                              num_partitions=args.cpu_limit)

    export_to_elasticsearch(
        vds,
        args,
        operation=ELASTICSEARCH_UPSERT,
        delete_index_before_exporting=True,
        export_genotypes=True,
        disable_doc_values_for_fields=(
            () if args.use_nested_objects_for_vep else ("sortedTranscriptConsequences",)),
        disable_index_for_fields=(
            () if args.use_nested_objects_for_vep else ("sortedTranscriptConsequences",)),
        run_after_index_exists=(
            (lambda: route_index_to_temp_es_cluster(True, args))
            if args.use_temp_loading_nodes else None),
    )

    args.start_with_step = 3  # step 2 finished; if an error triggers a retry, resume from the next step

    return hc, vds
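
# Illustration, not part of the pipeline: a hedged sketch of the Elasticsearch
# mapping fragment that the disable_doc_values_for_fields / disable_index_for_fields
# arguments above are expected to produce. It keeps the large JSON-serialized
# "sortedTranscriptConsequences" string retrievable from _source while skipping
# the inverted index (not searchable) and doc values (not sortable/aggregatable).
EXAMPLE_DISABLED_FIELD_MAPPING = {
    "sortedTranscriptConsequences": {
        "type": "keyword",
        "index": False,       # don't build an inverted index for this field
        "doc_values": False,  # don't build columnar storage for this field
    },
}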
    "Elasticsearch index name. If specified, only this index will be updated.")
p.add_argument(
    "--dataset-path",
    help="(optional) Path of variant callset. If not specified, the original "
    "vcf/vds path from which the data was loaded will be used.")
p.add_argument("--genome-version",
               help="Genome build: 37 or 38",
               choices=["37", "38"])

p.add_argument(
    "--all",
    help="Update all elasticsearch indices. This option is mutually-exclusive "
    "with --index-name, --dataset-path, and --genome-version.",
    action="store_true")

args = p.parse_args()

hc = create_hail_context()

if args.download_latest_clinvar_vcf:
    for genome_version in ["37", "38"]:
        vds = download_and_import_latest_clinvar_vcf(hc, genome_version)
        write_vds(vds, CLINVAR_VDS_PATH.format(genome_version=genome_version))

if args.index_name and args.all:
    p.error("--index-name and --all are mutually exclusive")
elif args.index_name:
    update_dataset(hc, args.index_name, args)
elif args.all:
    update_all_datasets(hc, args)
else:
    p.exit("ERROR: must specify either --index-name or --all")
def step3_add_reference_datasets(hc, vds, args):
    if args.start_with_step > 3 or args.stop_after_step < 3:
        return hc, vds

    logger.info(
        "\n\n=============================== pipeline - step 3 - add reference datasets ==============================="
    )

    if vds is None or not args.skip_writing_intermediate_vds:
        stop_hail_context(hc)
        hc = create_hail_context()
        vds = read_in_dataset(hc,
                              args.step1_output_vds,
                              dataset_type=args.dataset_type,
                              skip_summary=True)

    if not args.only_export_to_elasticsearch_at_the_end:
        vds = compute_minimal_schema(vds, args.dataset_type)

    if args.dataset_type == "VARIANTS":
        # annotate with the combined reference data file which was generated using
        # ../download_and_create_reference_datasets/v01/hail_scripts/combine_all_variant_level_reference_data.py
        # and contains all these annotations in one .vds

        if not (args.exclude_dbnsfp or args.exclude_cadd or args.exclude_1kg
                or args.exclude_exac or args.exclude_topmed or args.exclude_mpc
                or args.exclude_gnomad or args.exclude_eigen
                or args.exclude_primate_ai or args.exclude_splice_ai):

            logger.info("\n==> add combined variant-level reference data")
            vds = add_combined_reference_data_to_vds(
                hc, vds, args.genome_version, subset=args.filter_interval)

        else:
            # annotate with each reference data file - one-by-one
            if not args.skip_annotations and not args.exclude_dbnsfp:
                logger.info("\n==> add dbnsfp")
                vds = add_dbnsfp_to_vds(hc,
                                        vds,
                                        args.genome_version,
                                        root="va.dbnsfp",
                                        subset=args.filter_interval)

            if not args.skip_annotations and not args.exclude_cadd:
                logger.info("\n==> add cadd")
                vds = add_cadd_to_vds(hc,
                                      vds,
                                      args.genome_version,
                                      root="va.cadd",
                                      subset=args.filter_interval)

            if not args.skip_annotations and not args.exclude_1kg:
                logger.info("\n==> add 1kg")
                vds = add_1kg_phase3_to_vds(hc,
                                            vds,
                                            args.genome_version,
                                            root="va.g1k",
                                            subset=args.filter_interval)

            if not args.skip_annotations and not args.exclude_exac:
                logger.info("\n==> add exac")
                vds = add_exac_to_vds(hc,
                                      vds,
                                      args.genome_version,
                                      root="va.exac",
                                      subset=args.filter_interval)

            if not args.skip_annotations and not args.exclude_topmed:
                logger.info("\n==> add topmed")
                vds = add_topmed_to_vds(hc,
                                        vds,
                                        args.genome_version,
                                        root="va.topmed",
                                        subset=args.filter_interval)

            if not args.skip_annotations and not args.exclude_mpc:
                logger.info("\n==> add mpc")
                vds = add_mpc_to_vds(hc,
                                     vds,
                                     args.genome_version,
                                     root="va.mpc",
                                     subset=args.filter_interval)

            if not args.skip_annotations and not args.exclude_gnomad:
                logger.info("\n==> add gnomad exomes")
                vds = add_gnomad_to_vds(hc,
                                        vds,
                                        args.genome_version,
                                        exomes_or_genomes="exomes",
                                        root="va.gnomad_exomes",
                                        subset=args.filter_interval)

            if not args.skip_annotations and not args.exclude_gnomad:
                logger.info("\n==> add gnomad genomes")
                vds = add_gnomad_to_vds(hc,
                                        vds,
                                        args.genome_version,
                                        exomes_or_genomes="genomes",
                                        root="va.gnomad_genomes",
                                        subset=args.filter_interval)

            if not args.skip_annotations and not args.exclude_eigen:
                logger.info("\n==> add eigen")
                vds = add_eigen_to_vds(hc,
                                       vds,
                                       args.genome_version,
                                       root="va.eigen",
                                       subset=args.filter_interval)

            if not args.exclude_primate_ai:
                logger.info("\n==> add primate_ai")
                vds = add_primate_ai_to_vds(hc,
                                            vds,
                                            args.genome_version,
                                            root="va.primate_ai",
                                            subset=args.filter_interval)

            if not args.exclude_splice_ai:
                logger.info("\n==> add splice_ai")
                vds = add_splice_ai_to_vds(hc,
                                           vds,
                                           args.genome_version,
                                           root="va.splice_ai",
                                           subset=args.filter_interval)

    if not args.skip_annotations and not args.exclude_clinvar:
        logger.info("\n==> add clinvar")
        vds = add_clinvar_to_vds(hc,
                                 vds,
                                 args.genome_version,
                                 root="va.clinvar",
                                 subset=args.filter_interval)

    if not args.skip_annotations and not args.exclude_hgmd:
        logger.info("\n==> add hgmd")
        vds = add_hgmd_to_vds(hc,
                              vds,
                              args.genome_version,
                              root="va.hgmd",
                              subset=args.filter_interval)

    if not args.is_running_locally and not args.skip_writing_intermediate_vds:
        write_vds(vds, args.step3_output_vds)

    args.start_with_step = 4  # step 3 finished; if an error triggers a retry, resume from the next step

    return hc, vds
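
# The one-by-one annotation blocks in step3 above all follow the same pattern.
# A hedged sketch of a table-driven equivalent (the add_*_to_vds helpers are the
# pipeline's own; this loop and its table are illustrative, not pipeline code -
# gnomAD is omitted because it also needs an exomes_or_genomes argument):
def add_reference_datasets_one_by_one(hc, vds, args):
    reference_datasets = [
        # (args attribute that excludes the dataset, label, helper, va root)
        ("exclude_dbnsfp", "dbnsfp", add_dbnsfp_to_vds, "va.dbnsfp"),
        ("exclude_cadd", "cadd", add_cadd_to_vds, "va.cadd"),
        ("exclude_1kg", "1kg", add_1kg_phase3_to_vds, "va.g1k"),
        ("exclude_exac", "exac", add_exac_to_vds, "va.exac"),
        ("exclude_topmed", "topmed", add_topmed_to_vds, "va.topmed"),
        ("exclude_mpc", "mpc", add_mpc_to_vds, "va.mpc"),
        ("exclude_eigen", "eigen", add_eigen_to_vds, "va.eigen"),
    ]
    for exclude_attr, label, add_fn, root in reference_datasets:
        if not args.skip_annotations and not getattr(args, exclude_attr):
            logger.info("\n==> add %s", label)
            vds = add_fn(hc, vds, args.genome_version, root=root,
                         subset=args.filter_interval)
    return vds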
def step1_compute_derived_fields(hc, vds, args):
    if args.start_with_step > 1 or args.stop_after_step < 1:
        return hc, vds

    logger.info(
        "\n\n=============================== pipeline - step 1 - compute derived fields ==============================="
    )

    if vds is None or not args.skip_writing_intermediate_vds:
        stop_hail_context(hc)
        hc = create_hail_context()
        vds = read_in_dataset(hc,
                              args.step0_output_vds,
                              dataset_type=args.dataset_type,
                              skip_summary=True,
                              num_partitions=args.cpu_limit)

    parallel_computed_annotation_exprs = [
        "va.docId = %s" % get_expr_for_variant_id(512),
        "va.variantId = %s" % get_expr_for_variant_id(),
        "va.variantType= %s" % get_expr_for_variant_type(),
        "va.contig = %s" % get_expr_for_contig(),
        "va.pos = %s" % get_expr_for_start_pos(),
        "va.start = %s" % get_expr_for_start_pos(),
        "va.end = %s" % get_expr_for_end_pos(),
        "va.ref = %s" % get_expr_for_ref_allele(),
        "va.alt = %s" % get_expr_for_alt_allele(),
        "va.xpos = %s" % get_expr_for_xpos(pos_field="start"),
        "va.xstart = %s" % get_expr_for_xpos(pos_field="start"),
        "va.sortedTranscriptConsequences = %s" %
        get_expr_for_vep_sorted_transcript_consequences_array(
            vep_root="va.vep",
            include_coding_annotations=True,
            add_transcript_rank=bool(args.use_nested_objects_for_vep)),
    ]
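
    # Note on xpos/xstart above: seqr encodes (contig, position) as one sortable
    # integer, conventionally contig_number * 1e9 + position (with X=23, Y=24 and
    # M/MT=25), so e.g. contig "2", pos 12345 -> 2000012345. This description is
    # inferred from the field names; the actual encoding lives in get_expr_for_xpos().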

    if args.dataset_type == "VARIANTS":
        FAF_CONFIDENCE_INTERVAL = 0.95  # based on https://macarthurlab.slack.com/archives/C027LHMPP/p1528132141000430

        parallel_computed_annotation_exprs += [
            "va.FAF = %s" % get_expr_for_filtering_allele_frequency(
                "va.info.AC[va.aIndex - 1]", "va.info.AN",
                FAF_CONFIDENCE_INTERVAL),
        ]
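
        # Background on FAF (not the pipeline's exact math): the "filtering
        # allele frequency" is the lower bound of a one-sided 95% confidence
        # interval on the true allele frequency given the observed AC and AN,
        # i.e. the highest population AF the variant is still consistent with.
        # One standard approximation is the Clopper-Pearson beta quantile:
        #     faf ~ scipy.stats.beta.ppf(0.05, AC, AN - AC + 1)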

    serial_computed_annotation_exprs = [
        "va.xstop = %s" %
        get_expr_for_xpos(field_prefix="va.", pos_field="end"),
        "va.transcriptIds = %s" % get_expr_for_vep_transcript_ids_set(
            vep_transcript_consequences_root="va.sortedTranscriptConsequences"
        ),
        "va.domains = %s" % get_expr_for_vep_protein_domains_set(
            vep_transcript_consequences_root="va.sortedTranscriptConsequences"
        ),
        "va.transcriptConsequenceTerms = %s" %
        get_expr_for_vep_consequence_terms_set(
            vep_transcript_consequences_root="va.sortedTranscriptConsequences"
        ),
        "va.mainTranscript = %s" %
        get_expr_for_worst_transcript_consequence_annotations_struct(
            "va.sortedTranscriptConsequences"),
        "va.geneIds = %s" % get_expr_for_vep_gene_ids_set(
            vep_transcript_consequences_root="va.sortedTranscriptConsequences"
        ),
        "va.codingGeneIds = %s" % get_expr_for_vep_gene_ids_set(
            vep_transcript_consequences_root="va.sortedTranscriptConsequences",
            only_coding_genes=True),
    ]

    # serial_computed_annotation_exprs += [
    #     "va.sortedTranscriptConsequences = va.sortedTranscriptConsequences.map(c => drop(c, amino_acids, biotype))"
    # ]

    if not bool(args.use_nested_objects_for_vep):
        serial_computed_annotation_exprs += [
            "va.sortedTranscriptConsequences = json(va.sortedTranscriptConsequences)"
        ]
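
    # Why serialize to a JSON string when nested objects are disabled: each
    # Elasticsearch document then carries one opaque string field instead of an
    # array of nested sub-documents. Illustrative document shapes (field values
    # are made up):
    #   nested:  "sortedTranscriptConsequences": [{"gene_id": "ENSG...", ...}, ...]
    #   string:  "sortedTranscriptConsequences": "[{\"gene_id\": \"ENSG...\", ...}]"
    # Nested objects are individually queryable but cost one hidden Lucene
    # document per array element; the string is cheap to store but is only
    # retrievable as a blob.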

    vds = vds.annotate_variants_expr(parallel_computed_annotation_exprs)

    for expr in serial_computed_annotation_exprs:
        vds = vds.annotate_variants_expr(expr)

    pprint(vds.variant_schema)

    INPUT_SCHEMA = {}
    if args.dataset_type == "VARIANTS":
        INPUT_SCHEMA["top_level_fields"] = """
            docId: String,
            variantId: String,
            originalAltAlleles: Set[String],

            contig: String,
            start: Int,
            pos: Int,
            end: Int,
            ref: String,
            alt: String,

            xpos: Long,
            xstart: Long,
            xstop: Long,

            rsid: String,
            --- qual: Double,
            filters: Set[String],
            aIndex: Int,

            geneIds: Set[String],
            transcriptIds: Set[String],
            codingGeneIds: Set[String],
            domains: Set[String],
            transcriptConsequenceTerms: Set[String],
            sortedTranscriptConsequences: String,
            mainTranscript: Struct,
        """

        # NOTE: the info field schema is currently identical for GATK and
        # non-GATK genotype callsets, so there's no need to branch on
        # args.not_gatk_genotypes here
        INPUT_SCHEMA["info_fields"] = """
            AC: Array[Int],
            AF: Array[Double],
            AN: Int,
            --- BaseQRankSum: Double,
            --- ClippingRankSum: Double,
            --- DP: Int,
            --- FS: Double,
            --- InbreedingCoeff: Double,
            --- MQ: Double,
            --- MQRankSum: Double,
            --- QD: Double,
            --- ReadPosRankSum: Double,
            --- VQSLOD: Double,
            --- culprit: String,
        """
    elif args.dataset_type == "SV":
        INPUT_SCHEMA["top_level_fields"] = """
            docId: String,
            variantId: String,

            contig: String,
            start: Int,
            pos: Int,
            end: Int,
            ref: String,
            alt: String,

            xpos: Long,
            xstart: Long,
            xstop: Long,

            rsid: String,
            --- qual: Double,
            filters: Set[String],
            aIndex: Int,
            
            geneIds: Set[String],
            transcriptIds: Set[String],
            codingGeneIds: Set[String],
            domains: Set[String],
            transcriptConsequenceTerms: Set[String],
            sortedTranscriptConsequences: String,
            mainTranscript: Struct,
        """

        # example SV callset INFO and FORMAT fields:
        #     END=100371979;SVTYPE=DEL;SVLEN=-70;CIGAR=1M70D	GT:FT:GQ:PL:PR:SR
        INPUT_SCHEMA["info_fields"] = """
            IMPRECISE: Boolean,
            SVTYPE: String,
            SVLEN: Int,
            END: Int,
            --- OCC: Int,
            --- FRQ: Double,
        """
    else:
        raise ValueError("Unexpected dataset_type: %s" % args.dataset_type)

    if args.exclude_vcf_info_field:
        INPUT_SCHEMA["info_fields"] = ""

    expr = convert_vds_schema_string_to_annotate_variants_expr(root="va.clean",
                                                               **INPUT_SCHEMA)

    vds = vds.annotate_variants_expr(expr=expr)
    vds = vds.annotate_variants_expr("va = va.clean")

    if not args.skip_writing_intermediate_vds:
        write_vds(vds, args.step1_output_vds)

    args.start_with_step = 2  # step 1 finished; if an error triggers a retry, resume from the next step

    return hc, vds
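
# A hedged sketch of what the convert_vds_schema_string_to_annotate_variants_expr()
# call in step1 above might expand the schema strings into (inferred from the
# strings' shape, not from the helper's actual code): each "name: Type" line
# becomes a copy into the va.clean root, and "---"-prefixed lines document fields
# that are intentionally dropped.
def schema_string_to_exprs(schema_string, source_root, dest_root):
    exprs = []
    for line in schema_string.strip().split("\n"):
        line = line.strip().rstrip(",")
        if not line or line.startswith("---"):
            continue  # blank lines and "---" lines contribute no annotation
        name = line.split(":", 1)[0].strip()
        exprs.append("%s.%s = %s.%s" % (dest_root, name, source_root, name))
    return exprs

# e.g. schema_string_to_exprs("docId: String,\n--- qual: Double,", "va", "va.clean")
# == ["va.clean.docId = va.docId"]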
def run_pipeline():
    args = init_command_line_args()

    # compute additional derived params and add them to the args object for convenience
    args.output_vds_prefix = compute_output_vds_prefix(args)

    args.step0_output_vcf = args.output_vds_prefix + (
        ".vep.vcf.bgz" if ".vep" not in args.output_vds_prefix
        and not args.skip_vep else ".vcf.bgz")
    args.step0_output_vds = args.output_vds_prefix + (
        ".vep.vds" if ".vep" not in args.output_vds_prefix
        and not args.skip_vep else ".vds")
    args.step1_output_vds = args.output_vds_prefix + ".vep_and_computed_annotations.vds"
    args.step3_output_vds = args.output_vds_prefix + ".vep_and_all_annotations.vds"
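
    # e.g. with output_vds_prefix "gs://bucket/mydata" and VEP enabled, the steps
    # write gs://bucket/mydata.vep.vds, gs://bucket/mydata.vep_and_computed_annotations.vds
    # and gs://bucket/mydata.vep_and_all_annotations.vds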

    hc = create_hail_context()

    args.is_running_locally = hc.sc.master.startswith(
        "local")  # whether the pipeline is running locally or on dataproc
    logger.info("is_running_locally = %s", args.is_running_locally)

    # pipeline steps
    vds = None
    hc, vds = step0_init_and_run_vep(hc, vds, args)
    hc, vds = step1_compute_derived_fields(hc, vds, args)
    hc, vds = step2_export_to_elasticsearch(hc, vds, args)
    hc, vds = step3_add_reference_datasets(hc, vds, args)
    hc, vds = step4_export_to_elasticsearch(hc, vds, args)

    if args.stop_after_step > 4:
        update_operations_log(args)
        cleanup_steps(args)

        if args.use_temp_loading_nodes:
            # move data off of the loading nodes
            route_index_to_temp_es_cluster(False, args)

    logger.info("==> Pipeline completed")
    logger.info("")
    logger.info("")
    logger.info("")
    logger.info(
        "========================================================================================================"
    )
    logger.info("")
    logger.info(
        "==> To add this dataset to a seqr project, click 'Edit Datasets' on the seqr project page "
    )
    logger.info(
        "    (https://seqr.broadinstitute.org/project/{}/project_page)  and enter: "
        .format(args.project_guid))
    logger.info("")
    logger.info("        Elasticsearch Index: {} ".format(args.index))
    logger.info("        Sample Type: {} ".format(args.sample_type))
    logger.info("        Dataset Path: {} ".format(args.input_dataset))
    logger.info("")
    logger.info(
        "========================================================================================================"
    )
    logger.info("")
    logger.info("")