Пример #1
0
def add_gnomad_to_vds(hail_context,
                      vds,
                      genome_version,
                      exomes_or_genomes,
                      root=None,
                      top_level_fields=USEFUL_TOP_LEVEL_FIELDS,
                      info_fields=USEFUL_INFO_FIELDS,
                      subset=None,
                      verbose=True):
    """Annotate `vds` with fields copied from the gnomAD exomes or genomes VDS.

    Args:
        hail_context: passed through to `read_gnomad_vds`.
        vds: dataset to annotate; must provide `annotate_variants_vds`.
        genome_version: "37" or "38".
        exomes_or_genomes: "exomes" or "genomes".
        root: annotation root to write to. Defaults to
            "va.gnomad_exomes" or "va.gnomad_genomes".
        top_level_fields: schema string of fields read with
            `other_source_root="vds"`.
        info_fields: newline-separated schema string of fields read with
            `other_source_root="vds.info"`.
        subset: passed through to `read_gnomad_vds`.
        verbose: when true, print the generated annotation expressions.

    Returns:
        The result of `vds.annotate_variants_vds(...)`.

    Raises:
        ValueError: if `genome_version` or `exomes_or_genomes` is not one of
            the accepted values.
    """
    if genome_version not in ("37", "38"):
        raise ValueError("Invalid genome_version: %s. Must be '37' or '38'" %
                         str(genome_version))

    if exomes_or_genomes not in ("exomes", "genomes"):
        # Report the argument that actually failed validation (the message
        # previously named and echoed genome_version).
        raise ValueError(
            "Invalid exomes_or_genomes: %s. Must be 'exomes' or 'genomes'" %
            str(exomes_or_genomes))

    if root is None:
        root = "va.gnomad_%s" % exomes_or_genomes

    gnomad_vds = read_gnomad_vds(hail_context,
                                 genome_version,
                                 exomes_or_genomes,
                                 subset=subset)

    if exomes_or_genomes == "genomes":
        # remove any *SAS* fields from genomes since South Asian population only defined for exomes
        info_fields = "\n".join(field for field in info_fields.split("\n")
                                if "SAS" not in field)

    top_fields_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=top_level_fields,
        other_source_root="vds",
    )
    if verbose:
        print(top_fields_expr)

    info_fields_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=info_fields,
        other_source_root="vds.info",
    )
    if verbose:
        print(info_fields_expr)

    # Either expression may be empty (e.g. an empty schema string); only the
    # non-empty ones are joined into the final annotation expression.
    expr = [e for e in (top_fields_expr, info_fields_expr) if e]
    return vds.annotate_variants_vds(gnomad_vds, expr=", ".join(expr))
def compute_minimal_schema(vds, dataset_type="VARIANTS"):
    """Trim the variant annotations of `vds` down to `docId` and `aIndex`.

    `va.docId` is computed first, then the schema fields are copied into
    `va.clean`, and finally `va` is replaced by `va.clean`.

    Raises:
        ValueError: if `dataset_type` is neither "VARIANTS" nor "SV".
    """
    # add the computed document id before the schema is applied
    doc_id_expr = "va.docId = %s" % get_expr_for_variant_id(512)
    vds = vds.annotate_variants_expr([doc_id_expr])

    # both supported dataset types keep the same minimal set of fields
    if dataset_type not in ("VARIANTS", "SV"):
        raise ValueError("Unexpected dataset_type: %s" % dataset_type)

    schema_kwargs = {
        "top_level_fields": """
            docId: String,
            aIndex: Int,
        """,
        "info_fields": "",
    }

    clean_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root="va.clean", **schema_kwargs)
    vds = vds.annotate_variants_expr(expr=clean_expr)
    return vds.annotate_variants_expr("va = va.clean")
def add_dbnsfp_to_vds(hail_context,
                      vds,
                      genome_version,
                      root="va.dbnsfp",
                      subset=None,
                      verbose=True):
    """Annotate `vds` with dbNSFP fields for the given genome build.

    The schema is chosen by `genome_version` ("37" or "38"), the annotation
    expression is printed when `verbose` is true, and the dbNSFP dataset is
    read via `read_dbnsfp_vds` before being joined onto `vds`.

    Raises:
        ValueError: if `genome_version` is not "37" or "38".
    """
    if genome_version not in ("37", "38"):
        raise ValueError("Invalid genome_version: " + str(genome_version))

    schema_for_build = (DBNSFP_SCHEMA_37
                        if genome_version == "37" else DBNSFP_SCHEMA_38)

    annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        other_source_root="vds",
        other_source_fields=schema_for_build,
        root=root,
    )
    if verbose:
        print(annotation_expr)

    source_vds = read_dbnsfp_vds(hail_context, genome_version, subset=subset)
    return vds.annotate_variants_vds(source_vds, expr=annotation_expr)
Пример #4
0
def add_cadd_to_vds(hail_context,
                    vds,
                    genome_version,
                    root="va.cadd",
                    info_fields=CADD_FIELDS,
                    subset=None,
                    verbose=True):
    """Annotate `vds` with CADD fields.

    Fields listed in `info_fields` are read with
    `other_source_root="vds.info"` and written under `root`. The generated
    expression is printed when `verbose` is true.

    Returns:
        The result of `vds.annotate_variants_vds(...)`.
    """
    cadd_annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        other_source_root="vds.info",
        other_source_fields=info_fields,
        root=root,
    )
    if verbose:
        print(cadd_annotation_expr)

    # the CADD dataset is only read after the expression has been built
    source_vds = read_cadd_vds(hail_context, genome_version, subset=subset)
    return vds.annotate_variants_vds(source_vds, expr=cadd_annotation_expr)
def add_exac_to_vds(hail_context,
                    vds,
                    genome_version,
                    root="va.exac",
                    top_level_fields=USEFUL_TOP_LEVEL_FIELDS,
                    info_fields=USEFUL_INFO_FIELDS,
                    subset=None,
                    verbose=True):
    """Annotate `vds` with ExAC fields, computing per-population AF first.

    The ExAC dataset is read via `read_exac_vds`, gets `va.info.AF_*`
    annotations derived from its AC/AN fields, and is then joined onto `vds`
    under `root`. With `verbose` set, the two generated expressions and the
    resulting variant schema are printed.

    Returns:
        The annotated vds.
    """
    # ExAC VCF doesn't contain AF fields, so they are derived from AC / AN
    allele_frequency_exprs = """
          va.info.AF_AFR = if(va.info.AN_AFR > 0) va.info.AC_AFR[va.aIndex-1] / va.info.AN_AFR else NA:Float,
          va.info.AF_AMR = if(va.info.AN_AMR > 0) va.info.AC_AMR[va.aIndex-1] / va.info.AN_AMR else NA:Float,
          va.info.AF_EAS = if(va.info.AN_EAS > 0) va.info.AC_EAS[va.aIndex-1] / va.info.AN_EAS else NA:Float,
          va.info.AF_FIN = if(va.info.AN_FIN > 0) va.info.AC_FIN[va.aIndex-1] / va.info.AN_FIN else NA:Float,
          va.info.AF_NFE = if(va.info.AN_NFE > 0) va.info.AC_NFE[va.aIndex-1] / va.info.AN_NFE else NA:Float,
          va.info.AF_OTH = if(va.info.AN_OTH > 0) va.info.AC_OTH[va.aIndex-1] / va.info.AN_OTH else NA:Float,
          va.info.AF_SAS = if(va.info.AN_SAS > 0) va.info.AC_SAS[va.aIndex-1] / va.info.AN_SAS else NA:Float,
          va.info.AF_POPMAX = if(va.info.AN_POPMAX[va.aIndex-1] != "NA" && va.info.AN_POPMAX[va.aIndex-1].toInt() > 0) va.info.AC_POPMAX[va.aIndex-1].toInt() / va.info.AN_POPMAX[va.aIndex-1].toInt() else NA:Float
    """

    source_vds = read_exac_vds(hail_context, genome_version, subset=subset)
    source_vds = source_vds.annotate_variants_expr(allele_frequency_exprs)

    top_level_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=top_level_fields,
        other_source_root="vds",
    )
    if verbose:
        print(top_level_expr)

    info_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=info_fields,
        other_source_root="vds.info",
    )

    # skip whichever expression came back empty
    non_empty_exprs = [e for e in (top_level_expr, info_expr) if e]
    vds = vds.annotate_variants_vds(source_vds,
                                    expr=", ".join(non_empty_exprs))

    if verbose:
        # the info expression is reported after the join, together with the schema
        print(info_expr)
        pprint(vds.variant_schema)

    return vds
def add_topmed_to_vds(hail_context,
                      vds,
                      genome_version,
                      root="va.topmed",
                      fields=TOPMED_FIELDS,
                      subset=None,
                      verbose=True):
    """Annotate `vds` with TOPMed fields.

    Fields listed in `fields` are read with `other_source_root="vds.info"`
    and written under `root`. The generated expression is printed when
    `verbose` is true.

    Returns:
        The result of `vds.annotate_variants_vds(...)`.
    """
    topmed_annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        other_source_root="vds.info",
        other_source_fields=fields,
        root=root,
    )
    if verbose:
        print(topmed_annotation_expr)

    source_vds = read_topmed_vds(hail_context, genome_version, subset=subset)
    annotated_vds = vds.annotate_variants_vds(source_vds,
                                              expr=topmed_annotation_expr)
    return annotated_vds
Пример #7
0
def add_combined_reference_data_to_vds(hail_context,
                                       vds,
                                       genome_version,
                                       fields=COMBINED_REFERENCE_DATA_FIELDS,
                                       subset=None,
                                       verbose=True):
    """Annotate `vds` with fields from the combined reference data VDS.

    Unlike the other `add_*_to_vds` helpers in this file, the fields are
    written directly under the `va` root rather than a dataset-specific one.
    The generated expression is printed when `verbose` is true.

    Returns:
        The result of `vds.annotate_variants_vds(...)`.
    """
    # the reference dataset is read before the expression is generated
    reference_vds = read_combined_reference_data_vds(hail_context,
                                                     genome_version,
                                                     subset=subset)

    reference_expr = convert_vds_schema_string_to_annotate_variants_expr(
        other_source_root="vds",
        other_source_fields=fields,
        root="va",
    )
    if verbose:
        print(reference_expr)

    annotated_vds = vds.annotate_variants_vds(reference_vds,
                                              expr=reference_expr)
    return annotated_vds
Пример #8
0
def add_1kg_phase3_to_vds(hail_context,
                          vds,
                          genome_version,
                          root="va.g1k",
                          fields=G1K_FIELDS,
                          subset=None,
                          verbose=True):
    """Annotate `vds` with 1000 Genomes phase 3 fields.

    Fields listed in `fields` are read with `other_source_root="vds.info"`
    and written under `root`. The generated expression is printed when
    `verbose` is true.

    Returns:
        The result of `vds.annotate_variants_vds(...)`.
    """
    # the 1kg dataset is read before the expression is generated
    source_vds = read_1kg_phase3_vds(hail_context,
                                     genome_version,
                                     subset=subset)

    g1k_annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        other_source_root="vds.info",
        other_source_fields=fields,
        root=root,
    )
    if verbose:
        print(g1k_annotation_expr)

    annotated_vds = vds.annotate_variants_vds(source_vds,
                                              expr=g1k_annotation_expr)
    return annotated_vds
Пример #9
0
def add_mpc_to_vds(hail_context,
                   vds,
                   genome_version,
                   root="va.mpc",
                   info_fields=MPC_INFO_FIELDS,
                   subset=None,
                   verbose=True):
    """Annotate `vds` with MPC fields [Samocha 2017].

    Fields listed in `info_fields` are read with
    `other_source_root="vds.info"` and written under `root`. The generated
    expression is printed when `verbose` is true.

    Returns:
        The result of `vds.annotate_variants_vds(...)`.
    """
    mpc_annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        other_source_root="vds.info",
        other_source_fields=info_fields,
        root=root,
    )
    if verbose:
        print(mpc_annotation_expr)

    # the MPC dataset is only read after the expression has been built
    source_vds = read_mpc_vds(hail_context, genome_version, subset=subset)
    annotated_vds = vds.annotate_variants_vds(source_vds,
                                              expr=mpc_annotation_expr)
    return annotated_vds
def step1_compute_derived_fields(hc, vds, args):
    """Pipeline step 1: add computed annotations and trim `va` to a fixed schema.

    The step is skipped, returning `hc` and `vds` untouched, when
    `args.start_with_step` is past 1 or `args.stop_after_step` is before 1.
    Otherwise it:

      1. restarts the hail context and re-reads `args.step0_output_vds` when
         no `vds` was passed in or intermediate VDS files are being written;
      2. applies a first batch of annotation expressions in a single call,
         then a second group one expression at a time (several of these
         reference `va.sortedTranscriptConsequences`, which the first batch
         creates);
      3. copies the fields named by the dataset-type schema into `va.clean`
         and replaces `va` with it;
      4. writes the result to `args.step1_output_vds` unless
         `args.skip_writing_intermediate_vds` is set.

    On completion `args.start_with_step` is set to 2.

    Returns:
        The `(hc, vds)` pair; `hc` may be a newly created context.

    Raises:
        ValueError: if `args.dataset_type` is neither "VARIANTS" nor "SV".
    """
    outside_requested_steps = (args.start_with_step > 1
                               or args.stop_after_step < 1)
    if outside_requested_steps:
        return hc, vds

    logger.info(
        "\n\n=============================== pipeline - step 1 - compute derived fields ==============================="
    )

    needs_reload = vds is None or not args.skip_writing_intermediate_vds
    if needs_reload:
        stop_hail_context(hc)
        hc = create_hail_context()
        vds = read_in_dataset(
            hc,
            args.step0_output_vds,
            dataset_type=args.dataset_type,
            skip_summary=True,
            num_partitions=args.cpu_limit,
        )

    keep_vep_nested = bool(args.use_nested_objects_for_vep)
    vep_consequences_root = "va.sortedTranscriptConsequences"

    # first batch: applied together in one annotate_variants_expr call
    batch_exprs = [
        "va.docId = %s" % get_expr_for_variant_id(512),
        "va.variantId = %s" % get_expr_for_variant_id(),
        "va.variantType= %s" % get_expr_for_variant_type(),
        "va.contig = %s" % get_expr_for_contig(),
        "va.pos = %s" % get_expr_for_start_pos(),
        "va.start = %s" % get_expr_for_start_pos(),
        "va.end = %s" % get_expr_for_end_pos(),
        "va.ref = %s" % get_expr_for_ref_allele(),
        "va.alt = %s" % get_expr_for_alt_allele(),
        "va.xpos = %s" % get_expr_for_xpos(pos_field="start"),
        "va.xstart = %s" % get_expr_for_xpos(pos_field="start"),
        "va.sortedTranscriptConsequences = %s" %
        get_expr_for_vep_sorted_transcript_consequences_array(
            vep_root="va.vep",
            include_coding_annotations=True,
            add_transcript_rank=keep_vep_nested),
    ]

    if args.dataset_type == "VARIANTS":
        # based on https://macarthurlab.slack.com/archives/C027LHMPP/p1528132141000430
        faf_confidence_interval = 0.95
        batch_exprs.append(
            "va.FAF = %s" % get_expr_for_filtering_allele_frequency(
                "va.info.AC[va.aIndex - 1]", "va.info.AN",
                faf_confidence_interval))

    # second group: applied one expression per call, in this order
    ordered_exprs = [
        "va.xstop = %s" %
        get_expr_for_xpos(field_prefix="va.", pos_field="end"),
        "va.transcriptIds = %s" % get_expr_for_vep_transcript_ids_set(
            vep_transcript_consequences_root=vep_consequences_root),
        "va.domains = %s" % get_expr_for_vep_protein_domains_set(
            vep_transcript_consequences_root=vep_consequences_root),
        "va.transcriptConsequenceTerms = %s" %
        get_expr_for_vep_consequence_terms_set(
            vep_transcript_consequences_root=vep_consequences_root),
        "va.mainTranscript = %s" %
        get_expr_for_worst_transcript_consequence_annotations_struct(
            vep_consequences_root),
        "va.geneIds = %s" % get_expr_for_vep_gene_ids_set(
            vep_transcript_consequences_root=vep_consequences_root),
        "va.codingGeneIds = %s" % get_expr_for_vep_gene_ids_set(
            vep_transcript_consequences_root=vep_consequences_root,
            only_coding_genes=True),
    ]

    if not keep_vep_nested:
        # without nested objects the consequences array is stored as a JSON string
        ordered_exprs.append(
            "va.sortedTranscriptConsequences = json(va.sortedTranscriptConsequences)"
        )

    vds = vds.annotate_variants_expr(batch_exprs)
    for single_expr in ordered_exprs:
        vds = vds.annotate_variants_expr(single_expr)

    pprint(vds.variant_schema)

    # pick the schema strings that define which fields survive into va.clean
    if args.dataset_type == "VARIANTS":
        top_level_fields = """
            docId: String,
            variantId: String,
            originalAltAlleles: Set[String],

            contig: String,
            start: Int,
            pos: Int,
            end: Int,
            ref: String,
            alt: String,

            xpos: Long,
            xstart: Long,
            xstop: Long,

            rsid: String,
            --- qual: Double,
            filters: Set[String],
            aIndex: Int,

            geneIds: Set[String],
            transcriptIds: Set[String],
            codingGeneIds: Set[String],
            domains: Set[String],
            transcriptConsequenceTerms: Set[String],
            sortedTranscriptConsequences: String,
            mainTranscript: Struct,
        """

        # NOTE: both branches currently select the same INFO schema text.
        if args.not_gatk_genotypes:
            info_fields = """
                AC: Array[Int],
                AF: Array[Double],
                AN: Int,
                --- BaseQRankSum: Double,
                --- ClippingRankSum: Double,
                --- DP: Int,
                --- FS: Double,
                --- InbreedingCoeff: Double,
                --- MQ: Double,
                --- MQRankSum: Double,
                --- QD: Double,
                --- ReadPosRankSum: Double,
                --- VQSLOD: Double,
                --- culprit: String,
            """
        else:
            info_fields = """
                AC: Array[Int],
                AF: Array[Double],
                AN: Int,
                --- BaseQRankSum: Double,
                --- ClippingRankSum: Double,
                --- DP: Int,
                --- FS: Double,
                --- InbreedingCoeff: Double,
                --- MQ: Double,
                --- MQRankSum: Double,
                --- QD: Double,
                --- ReadPosRankSum: Double,
                --- VQSLOD: Double,
                --- culprit: String,
            """
    elif args.dataset_type == "SV":
        top_level_fields = """
            docId: String,
            variantId: String,

            contig: String,
            start: Int,
            pos: Int,
            end: Int,
            ref: String,
            alt: String,

            xpos: Long,
            xstart: Long,
            xstop: Long,

            rsid: String,
            --- qual: Double,
            filters: Set[String],
            aIndex: Int,
            
            geneIds: Set[String],
            transcriptIds: Set[String],
            codingGeneIds: Set[String],
            domains: Set[String],
            transcriptConsequenceTerms: Set[String],
            sortedTranscriptConsequences: String,
            mainTranscript: Struct,
        """

        # example SV record: END=100371979;SVTYPE=DEL;SVLEN=-70;CIGAR=1M70D  GT:FT:GQ:PL:PR:SR
        info_fields = """
            IMPRECISE: Boolean,
            SVTYPE: String,
            SVLEN: Int,
            END: Int,
            --- OCC: Int,
            --- FRQ: Double,
        """
    else:
        raise ValueError("Unexpected dataset_type: %s" % args.dataset_type)

    schema_kwargs = {
        "top_level_fields": top_level_fields,
        "info_fields": info_fields,
    }
    if args.exclude_vcf_info_field:
        schema_kwargs["info_fields"] = ""

    clean_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root="va.clean", **schema_kwargs)

    vds = vds.annotate_variants_expr(expr=clean_expr)
    vds = vds.annotate_variants_expr("va = va.clean")

    if not args.skip_writing_intermediate_vds:
        write_vds(vds, args.step1_output_vds)

    # step 1 finished, so, if an error occurs and it goes to retry, start with the next step
    args.start_with_step = 2

    return hc, vds
        subpoulations = ["AFR", "AMR", "EAS", "NFE"]
        if exomes_or_genomes == "exomes":
            subpoulations.append("SAS")  # only gnomad exomes have SAS defined

        subpopulation_exprs = ", ".join([
            "if(va.info.AN_{subpop} > 2000) va.info.AF_{subpop}[va.aIndex-1] else NA:Double"
            .format(**locals()) for subpop in subpoulations
        ])

        vds = vds.annotate_variants_expr(
            "va.info.AF_POPMAX_OR_GLOBAL = [ va.info.AF[va.aIndex-1], {subpopulation_exprs} ].max()"
            .format(**locals()))

        top_fields_expr = convert_vds_schema_string_to_annotate_variants_expr(
            root="va.clean",
            other_source_fields=USEFUL_TOP_LEVEL_FIELDS,
            other_source_root="va",
        )
        info_fields_expr = convert_vds_schema_string_to_annotate_variants_expr(
            root="va.clean.info",
            other_source_fields=USEFUL_INFO_FIELDS,
            other_source_root="va.info",
        )

        expr = []
        if top_fields_expr:
            expr.append(top_fields_expr)
        if info_fields_expr:
            expr.append(info_fields_expr)

        vds = vds.annotate_variants_expr(expr=expr)