예제 #1
0
def get_liftover_genome(t: Union[hl.MatrixTable, hl.Table]) -> list:
    """
    Infer the genome build of the input and prepare both reference genomes for liftover.

    The source build is read from ``t.locus``. A GRCh38 input is paired with a GRCh37
    destination; any other build is treated as GRCh37 and paired with a GRCh38 destination.
    The destination gets its fasta sequence loaded and the source gets the matching
    liftover chain. Both steps are skipped when the reference already has them, so the
    function can be called more than once in the same session.

    :param t: Input Table or MatrixTable
    :return: List of source build (with liftover chain added) and destination build (with sequence loaded)
    """
    logger.info("Inferring build of input")
    build = get_reference_genome(t.locus).name

    logger.info(
        "Loading reference genomes, adding chain file, and loading fasta sequence for destination build"
    )
    if build == "GRCh38":
        source = hl.get_reference("GRCh38")
        target = hl.get_reference("GRCh37")
        chain = "gs://hail-common/references/grch38_to_grch37.over.chain.gz"
        fasta = "gs://hail-common/references/human_g1k_v37.fasta.gz"
        fasta_index = "gs://hail-common/references/human_g1k_v37.fasta.fai"
    else:
        source = hl.get_reference("GRCh37")
        target = hl.get_reference("GRCh38")
        chain = "gs://hail-common/references/grch37_to_grch38.over.chain.gz"
        fasta = "gs://hail-common/references/Homo_sapiens_assembly38.fasta.gz"
        fasta_index = "gs://hail-common/references/Homo_sapiens_assembly38.fasta.fai"

    # Reference genomes are session-wide objects; registering a second sequence or a
    # second chain for the same destination is rejected by Hail, so only add what is missing.
    if not target.has_sequence():
        target.add_sequence(fasta, fasta_index)
    if not source.has_liftover(target):
        source.add_liftover(chain, target)

    return [source, target]
예제 #2
0
def filter_low_conf_regions(
    mt: Union[hl.MatrixTable, hl.Table],
    filter_lcr: bool = True,
    filter_decoy: bool = True,
    filter_segdup: bool = True,
    filter_exome_low_coverage_regions: bool = False,
    high_conf_regions: Optional[List[str]] = None,
) -> Union[hl.MatrixTable, hl.Table]:
    """
    Filter low-confidence regions out of a MatrixTable or Table.

    Each enabled option contributes one boolean criterion on ``mt.locus``; all criteria
    are combined with logical AND and applied to the rows. When no option is enabled the
    input is returned unchanged.

    :param mt: MatrixTable or Table to filter
    :param filter_lcr: Whether to remove loci found in the LCR intervals resource
    :param filter_decoy: Whether to remove loci found in the decoy intervals resource
    :param filter_segdup: Whether to remove loci found in the segmental duplication intervals resource
    :param filter_exome_low_coverage_regions: Whether to apply the exome coverage criterion. As written,
        this keeps loci that are missing from the `high_coverage_intervals` resource.
    :param high_conf_regions: Paths to locus interval files of high confidence regions to restrict to.
        Each file adds its own AND-ed criterion, so a locus is kept only if it is present in every
        listed file.
    :return: MatrixTable or Table with low confidence regions removed
    :raises ValueError: If a resource-backed filter is requested for a build other than GRCh37 or GRCh38
    """
    build = get_reference_genome(mt.locus).name
    if build == "GRCh37":
        import gnomad.resources.grch37.reference_data as resources
    elif build == "GRCh38":
        import gnomad.resources.grch38.reference_data as resources
    else:
        resources = None

    uses_resources = (
        filter_lcr
        or filter_decoy
        or filter_segdup
        or filter_exome_low_coverage_regions
    )
    # Fail with a clear message instead of an unbound-name error further down.
    if resources is None and uses_resources:
        raise ValueError(
            f"No low-confidence region resources available for reference genome {build}"
        )

    criteria = []
    if filter_lcr:
        lcr = resources.lcr_intervals.ht()
        criteria.append(hl.is_missing(lcr[mt.locus]))

    if filter_decoy:
        decoy = resources.decoy_intervals.ht()
        criteria.append(hl.is_missing(decoy[mt.locus]))

    if filter_segdup:
        segdup = resources.seg_dup_intervals.ht()
        criteria.append(hl.is_missing(segdup[mt.locus]))

    if filter_exome_low_coverage_regions:
        # NOTE(review): is_missing keeps loci *outside* the high coverage intervals, which
        # reads as the opposite of "filter low coverage regions" - confirm the intended sense.
        high_cov = resources.high_coverage_intervals.ht()
        criteria.append(hl.is_missing(high_cov[mt.locus]))

    if high_conf_regions is not None:
        # NOTE(review): the criteria are AND-ed, so multiple files act as an intersection
        # even though earlier documentation described a union - confirm the intended sense.
        for region_path in high_conf_regions:
            region_ht = hl.import_locus_intervals(region_path)
            criteria.append(hl.is_defined(region_ht[mt.locus]))

    if criteria:
        filter_criteria = functools.reduce(operator.and_, criteria)
        if isinstance(mt, hl.MatrixTable):
            mt = mt.filter_rows(filter_criteria)
        else:
            mt = mt.filter(filter_criteria)

    return mt
예제 #3
0
def filter_to_autosomes(
        t: Union[hl.MatrixTable, hl.Table]) -> Union[hl.MatrixTable, hl.Table]:
    """
    Restrict a Table or MatrixTable to autosomal loci.

    The input must have a field named `locus` of type Locus. The kept interval runs from
    the reference genome's contig at index 0 to its contig at index 21, i.e. the first 22
    contigs of the reference are treated as the autosomes.

    :param t: Input MT/HT
    :return: MT/HT restricted to the autosome interval
    """
    ref_genome = get_reference_genome(t.locus)
    first_contig = ref_genome.contigs[0]
    last_contig = ref_genome.contigs[21]
    autosome_interval = hl.parse_locus_interval(
        f"{first_contig}-{last_contig}", reference_genome=ref_genome)
    return hl.filter_intervals(t, [autosome_interval])
예제 #4
0
def filter_to_autosomes(
    mtds: Union[hl.Table, hl.MatrixTable, hl.vds.VariantDataset]
) -> Union[hl.Table, hl.MatrixTable, hl.vds.VariantDataset]:
    """
    Restrict a Table, MatrixTable or VariantDataset to autosomal contigs.

    The input (or, for a VariantDataset, its `variant_data` MatrixTable) must have a field
    named `locus` of type Locus. The kept interval runs from the reference genome's contig
    at index 0 to its contig at index 21, i.e. the first 22 contigs are treated as autosomes.

    :param mtds: Input MatrixTable/Table/VariantDataset
    :return: MatrixTable/Table/VariantDataset subset to autosomes
    """
    is_vds = isinstance(mtds, hl.vds.VariantDataset)

    # A VariantDataset carries its locus on the variant_data component.
    locus_expr = mtds.variant_data.locus if is_vds else mtds.locus
    ref_genome = get_reference_genome(locus_expr)

    first_contig = ref_genome.contigs[0]
    last_contig = ref_genome.contigs[21]
    autosome_interval = hl.parse_locus_interval(
        f"{first_contig}-{last_contig}", reference_genome=ref_genome
    )

    interval_filter = hl.vds.filter_intervals if is_vds else hl.filter_intervals
    return interval_filter(mtds, [autosome_interval], keep=True)
예제 #5
0
def get_liftover_genome(
    t: Union[hl.MatrixTable, hl.Table]
) -> Tuple[hl.genetics.ReferenceGenome, hl.genetics.ReferenceGenome]:
    """
    Work out the source and destination reference genomes for a liftover.

    The source build is inferred from ``t.locus``. A GRCh38 source is paired with GRCh37 as
    destination; any other source is paired with GRCh38. The liftover chain is added to the
    source unless it already has one for that destination, and the destination is passed
    through `add_reference_sequence` before being returned.

    :param t: Input Table or MatrixTable.
    :return: Tuple of source reference genome (with liftover chain added)
        and destination reference genome (with sequence loaded)
    """
    logger.info("Inferring reference genome of input...")
    input_build = get_reference_genome(t.locus).name
    source = hl.get_reference(input_build)

    logger.info("Loading fasta sequence for destination build...")
    if input_build == "GRCh38":
        target_build, chain = "GRCh37", GRCH38_TO_GRCH37_CHAIN
    else:
        target_build, chain = "GRCh38", GRCH37_to_GRCH38_CHAIN
    target = hl.get_reference(target_build)

    logger.info("Adding liftover chain to input build...")
    if not source.has_liftover(target):
        source.add_liftover(chain, target)
    else:
        # Keep whatever chain was registered earlier rather than replacing it.
        logger.warning(
            "Source reference build %s already has a chain file: %s! Using whichever chain has already been added.",
            source.name,
            source._liftovers,
        )

    target_with_sequence = add_reference_sequence(target)
    return (source, target_with_sequence)
예제 #6
0
def impute_sex_ploidy(
    mt: hl.MatrixTable,
    excluded_calling_intervals: Optional[hl.Table] = None,
    included_calling_intervals: Optional[hl.Table] = None,
    normalization_contig: str = "chr20",
    chr_x: Optional[str] = None,
    chr_y: Optional[str] = None,
) -> hl.Table:
    """
    Impute sex ploidy from a sparse Matrix Table.

    The coverage of chromosomes X and Y is normalized by the coverage of an autosomal
    chromosome (by default chr20).

    Per-sample depth on a contig is the sum of DP over entries, where hom-ref entries
    contribute DP multiplied by the block length (``1 + END - position``) and all other
    entries contribute DP once. That sum is divided by the number of usable bases on the contig.

    :param mt: Input sparse Matrix Table
    :param excluded_calling_intervals: Optional table of intervals to exclude from the computation.
        Used only when determining contig size (not used when computing chromosome depth).
    :param included_calling_intervals: Optional table of intervals to use in the computation.
        Used only when determining contig size (not used when computing chromosome depth).
    :param normalization_contig: Which chromosome to normalize by
    :param chr_x: Optional X Chromosome contig name (by default uses the X contig in the reference)
    :param chr_y: Optional Y Chromosome contig name (by default uses the Y contig in the reference)
    :return: Table with mean coverage over the normalization contig, X and Y and sex chromosomes
        ploidy based on normalized coverage.
    :raises NotImplementedError: If `chr_x` / `chr_y` is not given and the reference does not have
        exactly one X / Y contig.
    """
    ref = get_reference_genome(mt.locus, add_sequence=True)
    if chr_x is None:
        if len(ref.x_contigs) != 1:
            raise NotImplementedError(
                "Found {0} X chromosome contigs ({1}) in Genome reference. sparse_impute_sex_ploidy currently only supports a single X chromosome contig. Please use the `chr_x` argument to  specify which X chromosome contig to use ".format(
                    len(ref.x_contigs), ",".join(ref.x_contigs)
                )
            )
        chr_x = ref.x_contigs[0]
    if chr_y is None:
        if len(ref.y_contigs) != 1:
            raise NotImplementedError(
                "Found {0} Y chromosome contigs ({1}) in Genome reference. sparse_impute_sex_ploidy currently only supports a single Y chromosome contig. Please use the `chr_y` argument to  specify which Y chromosome contig to use ".format(
                    len(ref.y_contigs), ",".join(ref.y_contigs)
                )
            )
        chr_y = ref.y_contigs[0]

    def get_contig_size(contig: str) -> int:
        """
        Count the bases of `contig` that are usable for coverage.

        Bases whose reference sequence is "n" are dropped, X/Y contigs are restricted to
        their non-PAR regions, and the included/excluded calling intervals are applied.

        :param contig: Contig to measure
        :return: Number of remaining bases
        """
        logger.info("Working on %s", contig)
        contig_length = ref.contig_length(contig)
        contig_ht = hl.utils.range_table(
            contig_length,
            # Contigs shorter than 500kb would otherwise request zero partitions.
            n_partitions=max(1, contig_length // 500_000),
        )
        contig_ht = contig_ht.annotate(
            locus=hl.locus(contig=contig, pos=contig_ht.idx + 1, reference_genome=ref)
        )
        contig_ht = contig_ht.filter(contig_ht.locus.sequence_context().lower() != "n")

        if contig in ref.x_contigs:
            contig_ht = contig_ht.filter(contig_ht.locus.in_x_nonpar())
        if contig in ref.y_contigs:
            contig_ht = contig_ht.filter(contig_ht.locus.in_y_nonpar())

        contig_ht = contig_ht.key_by("locus")
        if included_calling_intervals is not None:
            contig_ht = contig_ht.filter(
                hl.is_defined(included_calling_intervals[contig_ht.key])
            )
        if excluded_calling_intervals is not None:
            contig_ht = contig_ht.filter(
                hl.is_missing(excluded_calling_intervals[contig_ht.key])
            )
        contig_size = contig_ht.count()
        logger.info("Contig %s has %s bases for coverage.", contig, contig_size)
        return contig_size

    def get_chr_dp_ann(chrom: str) -> hl.Table:
        """
        Compute the per-sample mean depth over `chrom`.

        :param chrom: Chromosome to compute the mean depth of
        :return: Columns table with a `{chrom}_mean_dp` field
        """
        contig_size = get_contig_size(chrom)
        # Parse against the data's own reference; the session default may be a different build.
        chr_mt = hl.filter_intervals(
            mt, [hl.parse_locus_interval(chrom, reference_genome=ref)]
        )

        if chrom in ref.x_contigs:
            chr_mt = chr_mt.filter_rows(chr_mt.locus.in_x_nonpar())
        if chrom in ref.y_contigs:
            chr_mt = chr_mt.filter_rows(chr_mt.locus.in_y_nonpar())

        return chr_mt.select_cols(
            **{
                f"{chrom}_mean_dp": hl.agg.sum(
                    hl.if_else(
                        chr_mt.LGT.is_hom_ref(),
                        # Hom-ref entries are reference blocks: weight DP by block length.
                        chr_mt.DP * (1 + chr_mt.END - chr_mt.locus.position),
                        chr_mt.DP,
                    )
                )
                / contig_size
            }
        ).cols()

    normalization_chrom_dp = get_chr_dp_ann(normalization_contig)
    chrX_dp = get_chr_dp_ann(chr_x)
    chrY_dp = get_chr_dp_ann(chr_y)

    ht = normalization_chrom_dp.annotate(
        **chrX_dp[normalization_chrom_dp.key],
        **chrY_dp[normalization_chrom_dp.key],
    )

    # Ploidy is the sex chromosome depth relative to half the (diploid) autosomal depth.
    return ht.annotate(
        **{
            f"{chr_x}_ploidy": ht[f"{chr_x}_mean_dp"]
            / (ht[f"{normalization_contig}_mean_dp"] / 2),
            f"{chr_y}_ploidy": ht[f"{chr_y}_mean_dp"]
            / (ht[f"{normalization_contig}_mean_dp"] / 2),
        }
    )
예제 #7
0
def score_bin_agg(
    ht: hl.GroupedTable, fam_stats_ht: hl.Table
) -> Dict[str, hl.expr.Aggregation]:
    """
    Build the default per-bin aggregations for a grouped, binned Table.

    The aggregations cover min/max of score, variant type counts, number of ClinVar variants,
    number of truth variants (omni, mills, hapmap, and kgp_phase1_hc), and family statistics.

    .. note::

        This function uses `ht._parent` to get the origin Table from the GroupedTable for the aggregation

    This can be combined with the GroupedTable returned by `compute_grouped_binned_ht`

    Example:

    .. code-block:: python

        binned_ht = create_binned_ht(...)
        grouped_binned_ht = compute_grouped_binned_ht(binned_ht)
        agg_ht = grouped_binned_ht.aggregate(**score_bin_agg(grouped_binned_ht, fam_stats_ht))

    .. note::

        The following annotations should be present:

        In ht:
            - score
            - singleton
            - positive_train_site
            - negative_train_site
            - ac_raw - expected that this is the raw allele count before adj filtering
            - ac - expected that this is the allele count after adj filtering
            - ac_qc_samples_unrelated_raw - allele count before adj filtering for unrelated samples passing sample QC
            - info - struct that includes QD, FS, and MQ in order to add an annotation for fail_hard_filters

        In the truth Table (loaded with `get_truth_ht()` for the build of `ht`):
            - omni
            - mills
            - hapmap
            - kgp_phase1_hc

        In fam_stats_ht:
            - n_de_novos_adj
            - n_de_novos_raw
            - n_transmitted_raw
            - n_untransmitted_raw

    Aggregations returned are:
        - `min_score` - minimum of score annotation per group
        - `max_score` - maximum of score annotation per group
        - `n` - count of variants per group
        - `n_ins` - count of insertions per group
        - `n_del` - count of deletions per group
        - `n_ti` - count of transitions per group
        - `n_tv` - count of transversions per group
        - `n_1bp_indel` - count of variants with a one base pair allele length difference per group
        - `n_mod3bp_indel` - count of variants whose allele length difference is divisible by three per group
        - `n_singleton` - count of singletons per group
        - `fail_hard_filters` - count of variants per group with QD < 2 | FS > 60 | MQ < 30
        - `n_pos_train` - count of variants flagged `positive_train_site` per group
        - `n_neg_train` - count of variants flagged `negative_train_site` per group
        - `n_clinvar` - count of clinvar variants
        - `n_de_novos_singleton_adj` - sum of `n_de_novos_adj` over variants with `ac == 1`
        - `n_de_novo_singleton` - sum of `n_de_novos_raw` over variants with `ac_raw == 1`
        - `n_de_novos_adj` - sum of `n_de_novos_adj`
        - `n_de_novo` - sum of `n_de_novos_raw`
        - `n_trans_singletons` - sum of `n_transmitted_raw` over variants with `ac_raw == 2`
        - `n_untrans_singletons` - sum of `n_untransmitted_raw` over variants with `ac_raw < 3`
          and `ac_qc_samples_unrelated_raw == 1`
        - `n_train_trans_singletons` - sum of `n_transmitted_raw` over positive train sites with `ac_raw == 2`
        - `n_omni` - count of omni truth variants
        - `n_mills` - count of mills truth variants
        - `n_hapmap` - count of hapmap truth variants
        - `n_kgp_phase1_hc` - count of 1000 genomes phase 1 high confidence truth variants

    :param ht: GroupedTable whose parent Table the aggregations are defined on
    :param fam_stats_ht: Table of family statistics, joined on the key of the parent Table
    :return: a dictionary containing aggregations to perform on ht
    """
    # Aggregation expressions have to be built on the ungrouped parent Table.
    parent_ht = ht._parent
    ref_allele = parent_ht.alleles[0]
    alt_allele = parent_ht.alleles[1]
    indel_length = hl.abs(ref_allele.length() - alt_allele.length())

    # Pick the reference data matching the build of the input, then join it on the key.
    build = get_reference_genome(parent_ht.locus).name
    if build == "GRCh37":
        reference_data = grch37_resources.reference_data
    else:
        reference_data = grch38_resources.reference_data
    clinvar = reference_data.clinvar.ht()[parent_ht.key]
    truth_data = reference_data.get_truth_ht()[parent_ht.key]
    fam = fam_stats_ht[parent_ht.key]

    return {
        "min_score": hl.agg.min(parent_ht.score),
        "max_score": hl.agg.max(parent_ht.score),
        "n": hl.agg.count(),
        "n_ins": hl.agg.count_where(hl.is_insertion(ref_allele, alt_allele)),
        "n_del": hl.agg.count_where(hl.is_deletion(ref_allele, alt_allele)),
        "n_ti": hl.agg.count_where(hl.is_transition(ref_allele, alt_allele)),
        "n_tv": hl.agg.count_where(hl.is_transversion(ref_allele, alt_allele)),
        "n_1bp_indel": hl.agg.count_where(indel_length == 1),
        "n_mod3bp_indel": hl.agg.count_where((indel_length % 3) == 0),
        "n_singleton": hl.agg.count_where(parent_ht.singleton),
        "fail_hard_filters": hl.agg.count_where(
            (parent_ht.info.QD < 2)
            | (parent_ht.info.FS > 60)
            | (parent_ht.info.MQ < 30)
        ),
        "n_pos_train": hl.agg.count_where(parent_ht.positive_train_site),
        "n_neg_train": hl.agg.count_where(parent_ht.negative_train_site),
        "n_clinvar": hl.agg.count_where(hl.is_defined(clinvar)),
        "n_de_novos_singleton_adj": hl.agg.filter(
            parent_ht.ac == 1, hl.agg.sum(fam.n_de_novos_adj)
        ),
        "n_de_novo_singleton": hl.agg.filter(
            parent_ht.ac_raw == 1, hl.agg.sum(fam.n_de_novos_raw)
        ),
        "n_de_novos_adj": hl.agg.sum(fam.n_de_novos_adj),
        "n_de_novo": hl.agg.sum(fam.n_de_novos_raw),
        "n_trans_singletons": hl.agg.filter(
            parent_ht.ac_raw == 2, hl.agg.sum(fam.n_transmitted_raw)
        ),
        "n_untrans_singletons": hl.agg.filter(
            (parent_ht.ac_raw < 3) & (parent_ht.ac_qc_samples_unrelated_raw == 1),
            hl.agg.sum(fam.n_untransmitted_raw),
        ),
        "n_train_trans_singletons": hl.agg.filter(
            (parent_ht.ac_raw == 2) & parent_ht.positive_train_site,
            hl.agg.sum(fam.n_transmitted_raw),
        ),
        "n_omni": hl.agg.count_where(truth_data.omni),
        "n_mills": hl.agg.count_where(truth_data.mills),
        "n_hapmap": hl.agg.count_where(truth_data.hapmap),
        "n_kgp_phase1_hc": hl.agg.count_where(truth_data.kgp_phase1_hc),
    }
예제 #8
0
def impute_sex_ploidy(
    mt: hl.MatrixTable,
    excluded_calling_intervals: Optional[hl.Table] = None,
    included_calling_intervals: Optional[hl.Table] = None,
    normalization_contig: str = "chr20",
    chr_x: Optional[str] = None,
    chr_y: Optional[str] = None,
    use_only_variants: bool = False,
) -> hl.Table:
    """
    Impute sex ploidy from a sparse MatrixTable.

    Sex ploidy is imputed by normalizing the coverage of chromosomes X and Y using the coverage of an autosomal
    chromosome (by default chr20).

    By default, per-sample depth on a contig is the sum of DP over entries (hom-ref entries weighted by
    their block length, ``1 + END - position``) divided by the number of usable bases on the contig. When
    `use_only_variants` is True, it is instead the mean DP over non-ref genotypes.

    :param mt: Input sparse Matrix Table
    :param excluded_calling_intervals: Optional table of intervals to exclude from the computation. Used only when
        determining contig size (not used when computing chromosome depth) when `use_only_variants` is False.
    :param included_calling_intervals: Optional table of intervals to use in the computation. Used only when
        determining contig size (not used when computing chromosome depth) when `use_only_variants` is False.
    :param normalization_contig: Which chromosome to normalize by
    :param chr_x: Optional X Chromosome contig name (by default uses the X contig in the reference)
    :param chr_y: Optional Y Chromosome contig name (by default uses the Y contig in the reference)
    :param use_only_variants: Whether to use depth of variant data within calling intervals instead of reference data.
        Default will only use reference data.

    :return: Table with mean coverage over the normalization contig, X and Y and sex chromosomes ploidy based on
        normalized coverage.
    :raises NotImplementedError: If `chr_x` / `chr_y` is not given and the reference does not have
        exactly one X / Y contig.
    """
    ref = get_reference_genome(mt.locus, add_sequence=True)
    if chr_x is None:
        if len(ref.x_contigs) != 1:
            raise NotImplementedError(
                "Found {0} X chromosome contigs ({1}) in Genome reference. sparse_impute_sex_ploidy currently only supports a single X chromosome contig. Please use the `chr_x` argument to  specify which X chromosome contig to use "
                .format(len(ref.x_contigs), ",".join(ref.x_contigs)))
        chr_x = ref.x_contigs[0]
    if chr_y is None:
        if len(ref.y_contigs) != 1:
            raise NotImplementedError(
                "Found {0} Y chromosome contigs ({1}) in Genome reference. sparse_impute_sex_ploidy currently only supports a single Y chromosome contig. Please use the `chr_y` argument to  specify which Y chromosome contig to use "
                .format(len(ref.y_contigs), ",".join(ref.y_contigs)))
        chr_y = ref.y_contigs[0]

    def get_contig_size(contig: str) -> int:
        """
        Count the bases of the specified `contig` that are usable for coverage.

        Bases whose reference sequence is "n" are dropped. The size is determined using only non par
        regions if the contig is an X or Y reference contig, using the intervals specified by
        `included_calling_intervals` and excluding intervals specified by `excluded_calling_intervals`
        if either is defined in the outer function.

        :param contig: Contig to compute the size of
        :return: Integer of the contig size
        """
        logger.info("Working on %s", contig)
        contig_length = ref.contig_length(contig)
        contig_ht = hl.utils.range_table(
            contig_length,
            # Contigs shorter than 500kb would otherwise request zero partitions.
            n_partitions=max(1, contig_length // 500_000),
        )
        contig_ht = contig_ht.annotate(locus=hl.locus(
            contig=contig, pos=contig_ht.idx + 1, reference_genome=ref))
        contig_ht = contig_ht.filter(
            contig_ht.locus.sequence_context().lower() != "n")

        if contig in ref.x_contigs:
            contig_ht = contig_ht.filter(contig_ht.locus.in_x_nonpar())
        if contig in ref.y_contigs:
            contig_ht = contig_ht.filter(contig_ht.locus.in_y_nonpar())

        contig_ht = contig_ht.key_by("locus")
        if included_calling_intervals is not None:
            contig_ht = contig_ht.filter(
                hl.is_defined(included_calling_intervals[contig_ht.key]))
        if excluded_calling_intervals is not None:
            contig_ht = contig_ht.filter(
                hl.is_missing(excluded_calling_intervals[contig_ht.key]))
        contig_size = contig_ht.count()
        logger.info("Contig %s has %d bases for coverage.", contig,
                    contig_size)
        return contig_size

    def get_chr_dp_ann(chrom: str) -> hl.Table:
        """
        Compute the mean depth of the specified chromosome.

        If `use_only_variants` is False, the value is the sum of DP (hom-ref entries weighted by block
        length) divided by the contig size from `get_contig_size`. If `use_only_variants` is True, the
        value is the sum of DP over non-ref genotypes divided by the number of non-ref genotypes.

        The depth calculations use only non par regions if the contig is an X or Y reference contig.
        `included_calling_intervals` and `excluded_calling_intervals` filter the rows only when
        `use_only_variants` is True; otherwise they affect only the contig size estimate.

        :param chrom: Chromosome to compute the mean depth of
        :return: Table of a per sample mean depth of `chrom`
        """
        # Parse against the data's own reference; the session default may be a different build.
        chr_mt = hl.filter_intervals(
            mt, [hl.parse_locus_interval(chrom, reference_genome=ref)])

        if chrom in ref.x_contigs:
            chr_mt = chr_mt.filter_rows(chr_mt.locus.in_x_nonpar())
        if chrom in ref.y_contigs:
            chr_mt = chr_mt.filter_rows(chr_mt.locus.in_y_nonpar())

        if use_only_variants:
            if included_calling_intervals is not None:
                chr_mt = chr_mt.filter_rows(
                    hl.is_defined(included_calling_intervals[chr_mt.row_key]))
            if excluded_calling_intervals is not None:
                chr_mt = chr_mt.filter_rows(
                    hl.is_missing(excluded_calling_intervals[chr_mt.row_key]))
            return chr_mt.select_cols(
                **{
                    f"{chrom}_mean_dp":
                    hl.agg.filter(
                        chr_mt.LGT.is_non_ref(),
                        hl.agg.sum(chr_mt.DP),
                    ) / hl.agg.filter(chr_mt.LGT.is_non_ref(), hl.agg.count())
                }).cols()

        # The contig size needs a full per-base count of the contig, so only compute it
        # on the path that actually divides by it.
        contig_size = get_contig_size(chrom)
        return chr_mt.select_cols(
            **{
                f"{chrom}_mean_dp":
                hl.agg.sum(
                    hl.if_else(
                        chr_mt.LGT.is_hom_ref(),
                        chr_mt.DP *
                        (1 + chr_mt.END - chr_mt.locus.position),
                        chr_mt.DP,
                    )) / contig_size
            }).cols()

    normalization_chrom_dp = get_chr_dp_ann(normalization_contig)
    chrX_dp = get_chr_dp_ann(chr_x)
    chrY_dp = get_chr_dp_ann(chr_y)

    ht = normalization_chrom_dp.annotate(
        **chrX_dp[normalization_chrom_dp.key],
        **chrY_dp[normalization_chrom_dp.key],
    )

    # Ploidy is the sex chromosome depth relative to half the (diploid) autosomal depth.
    return ht.annotate(
        **{
            f"{chr_x}_ploidy":
            ht[f"{chr_x}_mean_dp"] /
            (ht[f"{normalization_contig}_mean_dp"] / 2),
            f"{chr_y}_ploidy":
            ht[f"{chr_y}_mean_dp"] /
            (ht[f"{normalization_contig}_mean_dp"] / 2),
        })
예제 #9
0
def annotate_sex(
    mt: hl.MatrixTable,
    is_sparse: bool = True,
    excluded_intervals: Optional[hl.Table] = None,
    included_intervals: Optional[hl.Table] = None,
    normalization_contig: str = "chr20",
    sites_ht: Optional[hl.Table] = None,
    aaf_expr: Optional[str] = None,
    gt_expr: str = "GT",
    f_stat_cutoff: float = 0.5,
    aaf_threshold: float = 0.001,
) -> hl.Table:
    """
    Impute sample sex based on X-chromosome heterozygosity and sex chromosome ploidy.

    Returns Table with the following fields (contig names shown for a reference whose sex
    contigs are chrX/chrY and the default `normalization_contig`):
        - s (str): Sample
        - chr20_mean_dp (float32): Sample's mean coverage over `normalization_contig`.
        - chrX_mean_dp (float32): Sample's mean coverage over chromosome X.
        - chrY_mean_dp (float32): Sample's mean coverage over chromosome Y.
        - chrX_ploidy (float32): Sample's imputed ploidy over chromosome X.
        - chrY_ploidy (float32): Sample's imputed ploidy over chromosome Y.
        - f_stat (float64): Sample f-stat. Calculated using hl.impute_sex.
        - n_called (int64): Number of variants with a genotype call. Calculated using hl.impute_sex.
        - expected_homs (float64): Expected number of homozygotes. Calculated using hl.impute_sex.
        - observed_homs (int64): Observed number of homozygotes. Calculated using hl.impute_sex.
        - X_karyotype (str): Sample's chromosome X karyotype.
        - Y_karyotype (str): Sample's chromosome Y karyotype.
        - sex_karyotype (str): Sample's sex karyotype.

    :param mt: Input MatrixTable
    :param bool is_sparse: Whether input MatrixTable is in sparse data format
    :param excluded_intervals: Optional table of intervals to exclude from the computation.
    :param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes.
    :param normalization_contig: Which chromosome to use to normalize sex chromosome coverage. Used in determining sex chromosome ploidies.
    :param sites_ht: Optional Table to use. If present, filters input MatrixTable to sites in this Table prior to imputing sex,
                    and pulls alternate allele frequency from this Table.
    :param aaf_expr: Optional. Name of field in input MatrixTable with alternate allele frequency.
    :param gt_expr: Name of entry field storing the genotype. Default: 'GT'
    :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY are above cutoff.
    :param float aaf_threshold: Minimum alternate allele frequency to be used in f-stat calculations.
    :return: Table of samples and their imputed sex karyotypes.
    :raises NotImplementedError: If `is_sparse` is False.
    """
    logger.info("Imputing sex chromosome ploidies...")
    if not is_sparse:
        raise NotImplementedError(
            "Imputing sex ploidy does not exist yet for dense data.")
    ploidy_ht = impute_sex_ploidy(mt, excluded_intervals,
                                  included_intervals, normalization_contig)

    ref = get_reference_genome(mt.locus)
    x_contigs = ref.x_contigs
    # impute_sex_ploidy is called without chr_x/chr_y, so it names its ploidy fields after
    # the reference's single X and Y contig; look them up by the same names.
    x_ploidy_field = f"{x_contigs[0]}_ploidy"
    y_ploidy_field = f"{ref.y_contigs[0]}_ploidy"

    logger.info("Filtering mt to biallelic SNPs in X contigs: %s", x_contigs)
    if "was_split" in list(mt.row):
        mt = mt.filter_rows((~mt.was_split)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1]))
    else:
        mt = mt.filter_rows((hl.len(mt.alleles) == 2)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1]))
    # Parse against the data's own reference; the session default may be a different build.
    mt = hl.filter_intervals(
        mt,
        [
            hl.parse_locus_interval(contig, reference_genome=ref)
            for contig in x_contigs
        ],
    )

    if sites_ht is not None:
        if aaf_expr is None:
            logger.warning(
                "sites_ht was provided, but aaf_expr is missing. Assuming name of field with alternate allele frequency is 'AF'."
            )
            aaf_expr = "AF"
        logger.info("Filtering to provided sites")
        mt = mt.annotate_rows(**sites_ht[mt.row_key])
        # Rows absent from sites_ht have a missing frequency after the join, which drops them.
        mt = mt.filter_rows(hl.is_defined(mt[aaf_expr]))

    logger.info("Calculating inbreeding coefficient on chrX")
    sex_ht = hl.impute_sex(
        mt[gt_expr],
        aaf_threshold=aaf_threshold,
        male_threshold=f_stat_cutoff,
        female_threshold=f_stat_cutoff,
        aaf=aaf_expr,
    )

    logger.info("Annotating sex ht with sex chromosome ploidies")
    sex_ht = sex_ht.annotate(**ploidy_ht[sex_ht.key])

    logger.info("Inferring sex karyotypes")
    # NOTE(review): get_ploidy_cutoffs is defined elsewhere and may make its own
    # assumptions about the ploidy field names - confirm for non-chr-prefixed references.
    x_ploidy_cutoffs, y_ploidy_cutoffs = get_ploidy_cutoffs(
        sex_ht, f_stat_cutoff)
    sex_ht = sex_ht.annotate_globals(
        x_ploidy_cutoffs=hl.struct(
            upper_cutoff_X=x_ploidy_cutoffs[0],
            lower_cutoff_XX=x_ploidy_cutoffs[1][0],
            upper_cutoff_XX=x_ploidy_cutoffs[1][1],
            lower_cutoff_XXX=x_ploidy_cutoffs[2],
        ),
        y_ploidy_cutoffs=hl.struct(
            lower_cutoff_Y=y_ploidy_cutoffs[0][0],
            upper_cutoff_Y=y_ploidy_cutoffs[0][1],
            lower_cutoff_YY=y_ploidy_cutoffs[1],
        ),
        f_stat_cutoff=f_stat_cutoff,
    )
    return sex_ht.annotate(
        **get_sex_expr(sex_ht[x_ploidy_field], sex_ht[y_ploidy_field],
                       x_ploidy_cutoffs, y_ploidy_cutoffs))
예제 #10
0
def annotate_sex(
    mtds: Union[hl.MatrixTable, hl.vds.VariantDataset],
    is_sparse: bool = True,
    excluded_intervals: Optional[hl.Table] = None,
    included_intervals: Optional[hl.Table] = None,
    normalization_contig: str = "chr20",
    sites_ht: Optional[hl.Table] = None,
    aaf_expr: Optional[str] = None,
    gt_expr: str = "GT",
    f_stat_cutoff: float = 0.5,
    aaf_threshold: float = 0.001,
    variants_only_x_ploidy: bool = False,
    variants_only_y_ploidy: bool = False,
) -> hl.Table:
    """
    Impute sample sex based on X-chromosome heterozygosity and sex chromosome ploidy.

    The function first estimates sex chromosome ploidies (with `hl.vds.impute_sex_chromosome_ploidy` for a
    VariantDataset, or `impute_sex_ploidy` for a sparse MatrixTable), then runs `hl.impute_sex` on biallelic SNPs
    in the X contigs, and finally derives karyotypes from ploidy cutoffs computed by `get_ploidy_cutoffs`.

    Return Table with the following fields:
        - s (str): Sample
        - `normalization_contig`_mean_dp (float32): Sample's mean coverage over the specified `normalization_contig`.
          Named var_data_`normalization_contig`_mean_dp when it was computed from variant data only.
        - chrX_mean_dp (float32): Sample's mean coverage over chromosome X.
        - chrY_mean_dp (float32): Sample's mean coverage over chromosome Y.
        - chrX_ploidy (float32): Sample's imputed ploidy over chromosome X.
        - chrY_ploidy (float32): Sample's imputed ploidy over chromosome Y.
        - f_stat (float64): Sample f-stat. Calculated using hl.impute_sex.
        - n_called (int64): Number of variants with a genotype call. Calculated using hl.impute_sex.
        - expected_homs (float64): Expected number of homozygotes. Calculated using hl.impute_sex.
        - observed_homs (int64): Observed number of homozygotes. Calculated using hl.impute_sex.
        - X_karyotype (str): Sample's chromosome X karyotype.
        - Y_karyotype (str): Sample's chromosome Y karyotype.
        - sex_karyotype (str): Sample's sex karyotype.

    The returned Table also carries the globals `x_ploidy_cutoffs`, `y_ploidy_cutoffs`, `f_stat_cutoff`,
    `variants_only_x_ploidy` and `variants_only_y_ploidy`.

    :param mtds: Input MatrixTable or VariantDataset
    :param bool is_sparse: Whether input MatrixTable is in sparse data format. Only read when `mtds` is a MatrixTable.
    :param excluded_intervals: Optional table of intervals to exclude from the computation. Not supported for a VDS.
    :param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes.
    :param normalization_contig: Which chromosome to use to normalize sex chromosome coverage. Used in determining sex chromosome ploidies.
    :param sites_ht: Optional Table to use. If present, filters input MatrixTable to sites in this Table prior to imputing sex,
                    and pulls alternate allele frequency from this Table.
    :param aaf_expr: Optional. Name of field in input MatrixTable with alternate allele frequency. Defaults to 'AF'
                    when `sites_ht` is given and this is None; otherwise passed unchanged as `aaf` to hl.impute_sex.
    :param gt_expr: Name of entry field storing the genotype. Default: 'GT'
    :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY are above cutoff.
    :param float aaf_threshold: Minimum alternate allele frequency to be used in f-stat calculations.
    :param variants_only_x_ploidy: Whether to use depth of only variant data for the x ploidy estimation.
    :param variants_only_y_ploidy: Whether to use depth of only variant data for the y ploidy estimation.
    :return: Table of samples and their imputed sex karyotypes.
    :raises NotImplementedError: If `excluded_intervals` is given together with a VDS, or if a dense
        (non-sparse) MatrixTable is supplied.
    """
    logger.info("Imputing sex chromosome ploidies...")

    is_vds = isinstance(mtds, hl.vds.VariantDataset)
    if is_vds:
        if excluded_intervals is not None:
            raise NotImplementedError(
                "The use of the parameter 'excluded_intervals' is currently not implemented for imputing sex chromosome ploidy on a VDS!"
            )
        # Begin by creating a ploidy estimate HT using the method defined by 'variants_only_x_ploidy'
        ploidy_ht = hl.vds.impute_sex_chromosome_ploidy(
            mtds,
            calling_intervals=included_intervals,
            normalization_contig=normalization_contig,
            use_variant_dataset=variants_only_x_ploidy,
        )
        ploidy_ht = ploidy_ht.rename({
            "x_ploidy":
            "chrX_ploidy",
            "y_ploidy":
            "chrY_ploidy",
            "x_mean_dp":
            "chrX_mean_dp",
            "y_mean_dp":
            "chrY_mean_dp",
            "autosomal_mean_dp":
            f"var_data_{normalization_contig}_mean_dp"
            if variants_only_x_ploidy else f"{normalization_contig}_mean_dp",
        })
        # If 'variants_only_y_ploidy' is different from 'variants_only_x_ploidy' then re-run the ploidy estimation using
        # the method defined by 'variants_only_y_ploidy' and re-annotate with the modified ploidy estimates.
        if variants_only_y_ploidy != variants_only_x_ploidy:
            y_ploidy_ht = hl.vds.impute_sex_chromosome_ploidy(
                mtds,
                calling_intervals=included_intervals,
                normalization_contig=normalization_contig,
                use_variant_dataset=variants_only_y_ploidy,
            )
            y_ploidy_idx = y_ploidy_ht[ploidy_ht.key]
            ploidy_ht = ploidy_ht.annotate(
                chrY_ploidy=y_ploidy_idx.y_ploidy,
                chrY_mean_dp=y_ploidy_idx.y_mean_dp,
            )

            # If the `variants_only_y_ploidy' is True modify the name of the normalization contig mean DP to indicate
            # that this is the variant dataset only mean DP (this will have already been added if
            # 'variants_only_x_ploidy' was also True).
            if variants_only_y_ploidy:
                ploidy_ht = ploidy_ht.annotate(
                    **{
                        f"var_data_{normalization_contig}_mean_dp":
                        y_ploidy_idx.autosomal_mean_dp
                    })

        # The f-stat below is computed from the variant data component of the VDS.
        mt = mtds.variant_data
    else:
        mt = mtds
        if is_sparse:
            ploidy_ht = impute_sex_ploidy(
                mt,
                excluded_intervals,
                included_intervals,
                normalization_contig,
                use_only_variants=variants_only_x_ploidy,
            )
            # NOTE(review): this rename expects an "autosomal_mean_dp" field, while the Y re-run below selects
            # f"{normalization_contig}_mean_dp" from the same helper's output -- confirm which name
            # impute_sex_ploidy actually emits.
            ploidy_ht = ploidy_ht.rename({
                "autosomal_mean_dp":
                f"var_data_{normalization_contig}_mean_dp" if
                variants_only_x_ploidy else f"{normalization_contig}_mean_dp",
            })
            # If 'variants_only_y_ploidy' is different from 'variants_only_x_ploidy' then re-run the ploidy estimation
            # using the method defined by 'variants_only_y_ploidy' and re-annotate with the modified ploidy estimates.
            if variants_only_y_ploidy != variants_only_x_ploidy:
                y_ploidy_ht = impute_sex_ploidy(
                    mt,
                    excluded_intervals,
                    included_intervals,
                    normalization_contig,
                    use_only_variants=variants_only_y_ploidy,
                )
                # Hail Tables are immutable: keep the result of `select` so that only the Y-specific fields are
                # carried over and the X ploidy estimates computed above are not overwritten by the annotate below.
                y_ploidy_ht = y_ploidy_ht.select(
                    "chrY_ploidy",
                    "chrY_mean_dp",
                    f"{normalization_contig}_mean_dp",
                )
                # If the `variants_only_y_ploidy' is True modify the name of the normalization contig mean DP to indicate
                # that this is the variant dataset only mean DP (this will have already been added if
                # 'variants_only_x_ploidy' was also True). The rename is applied to the Y re-run, whose mean DP is
                # the variant-only one; the X run's mean DP keeps its name, mirroring the VDS branch above.
                if variants_only_y_ploidy:
                    y_ploidy_ht = y_ploidy_ht.rename({
                        f"{normalization_contig}_mean_dp":
                        f"var_data_{normalization_contig}_mean_dp"
                    })
                # Re-annotate the ploidy HT with modified Y ploidy annotations
                ploidy_ht = ploidy_ht.annotate(**y_ploidy_ht[ploidy_ht.key])

        else:
            raise NotImplementedError(
                "Imputing sex ploidy does not exist yet for dense data.")

    x_contigs = get_reference_genome(mt.locus).x_contigs
    logger.info("Filtering mt to biallelic SNPs in X contigs: %s", x_contigs)
    # When the rows carry a `was_split` flag, rows that came from splitting a multi-allelic site are dropped;
    # otherwise biallelic sites are identified by having exactly two alleles.
    if "was_split" in list(mt.row):
        mt = mt.filter_rows((~mt.was_split)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1]))
    else:
        mt = mt.filter_rows((hl.len(mt.alleles) == 2)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1]))

    # Parse the X contig intervals against the build inferred from the data rather than the default reference.
    build = get_reference_genome(mt.locus).name
    mt = hl.filter_intervals(
        mt,
        [
            hl.parse_locus_interval(contig, reference_genome=build)
            for contig in x_contigs
        ],
        keep=True,
    )

    if sites_ht is not None:
        if aaf_expr is None:
            logger.warning(
                "sites_ht was provided, but aaf_expr is missing. Assuming name of field with alternate allele frequency is 'AF'."
            )
            aaf_expr = "AF"
        logger.info("Filtering to provided sites")
        # Join all row fields of sites_ht onto the MT, then keep only rows where the allele frequency field is defined.
        mt = mt.annotate_rows(**sites_ht[mt.row_key])
        mt = mt.filter_rows(hl.is_defined(mt[aaf_expr]))

    logger.info("Calculating inbreeding coefficient on chrX")
    # The same cutoff is used for both thresholds, so hl.impute_sex leaves no undetermined band between them.
    sex_ht = hl.impute_sex(
        mt[gt_expr],
        aaf_threshold=aaf_threshold,
        male_threshold=f_stat_cutoff,
        female_threshold=f_stat_cutoff,
        aaf=aaf_expr,
    )

    logger.info("Annotating sex ht with sex chromosome ploidies")
    sex_ht = sex_ht.annotate(**ploidy_ht[sex_ht.key])

    logger.info("Inferring sex karyotypes")
    x_ploidy_cutoffs, y_ploidy_cutoffs = get_ploidy_cutoffs(
        sex_ht, f_stat_cutoff)
    # Record the cutoffs and the run configuration as globals so the karyotype assignment can be traced later.
    sex_ht = sex_ht.annotate_globals(
        x_ploidy_cutoffs=hl.struct(
            upper_cutoff_X=x_ploidy_cutoffs[0],
            lower_cutoff_XX=x_ploidy_cutoffs[1][0],
            upper_cutoff_XX=x_ploidy_cutoffs[1][1],
            lower_cutoff_XXX=x_ploidy_cutoffs[2],
        ),
        y_ploidy_cutoffs=hl.struct(
            lower_cutoff_Y=y_ploidy_cutoffs[0][0],
            upper_cutoff_Y=y_ploidy_cutoffs[0][1],
            lower_cutoff_YY=y_ploidy_cutoffs[1],
        ),
        f_stat_cutoff=f_stat_cutoff,
        variants_only_x_ploidy=variants_only_x_ploidy,
        variants_only_y_ploidy=variants_only_y_ploidy,
    )
    return sex_ht.annotate(
        **get_sex_expr(sex_ht.chrX_ploidy, sex_ht.chrY_ploidy,
                       x_ploidy_cutoffs, y_ploidy_cutoffs))
예제 #11
0
def main(args):
    output_dir = args.output_dir
    output_name = args.output_name
    inferred_sex = args.inferred_sex
    mt_path = args.mt_path
    input_pedigree = args.input_pedigree

    gnomad_ld = args.gnomad_ld
    run_ibd = args.run_ibd
    first_degree_pi_hat = args.first_degree_pi_hat
    grandparent_pi_hat = args.grandparent_pi_hat
    grandparent_ibd1 = args.grandparent_ibd1
    grandparent_ibd2 = args.grandparent_ibd2
    filter_kinship_ht = args.filter_kinship_ht

    logger.info("Reading in inputs...")
    mt = hl.read_matrix_table(mt_path)
    pedigree = hl.import_table(input_pedigree, impute=True)

    # Infer build of the MatrixTable
    build = get_reference_genome(mt.locus).name

    logger.info(
        "Filtering to biallelic SNVs on autosomes and performing LD pruning..."
    )
    mt = filter_rows_for_qc(mt,
                            min_af=0.001,
                            min_callrate=0.99,
                            apply_hard_filters=False)
    mt = ld_prune(mt, build, gnomad_ld)
    out_mt = f"{output_dir}/{output_name}_processed_mt.mt"

    logger.info("Remapping sample names...")
    mt, sex_ht = remap_samples(mt_path, mt, pedigree, inferred_sex)

    mt = mt.checkpoint(out_mt, overwrite=True)

    if run_ibd:
        logger.info("Running identity by descent...")
        ibd_results_ht = hl.identity_by_descent(mt,
                                                maf=mt.AF,
                                                min=0.10,
                                                max=1.0)
        ibd_results_ht = ibd_results_ht.annotate(
            ibd0=ibd_results_ht.ibd.Z0,
            ibd1=ibd_results_ht.ibd.Z1,
            ibd2=ibd_results_ht.ibd.Z2,
            pi_hat=ibd_results_ht.ibd.PI_HAT,
        ).drop("ibs0", "ibs1", "ibs2", "ibd")
        out_ht = f"{output_dir}/{output_name}_ibd_kinship.tsv"
        ibd_results_ht.export(out_ht)

    else:
        logger.warn("Skipping IBD - using previous calculations...")
        if not file_exists(f"{output_dir}/{output_name}_ibd_kinship.tsv"):
            logger.warning(
                "IBD calculation was skipped but no file with previous calculations was found...",
                sample,
            )

    logger.info("Reading in kinship ht...")
    kin_ht = hl.import_table(f"{output_dir}/{output_name}_ibd_kinship.tsv",
                             impute=True)

    # Subset MatrixTable and sex ht to the samples in the pedigree
    mt_subset, sex_ht, expected_samples, vcf_samples = subset_samples(
        mt, pedigree, sex_ht, output_dir, output_name)

    # Subset Table to the samples in the pedigree
    subset = hl.set(expected_samples)
    kin_ht = kin_ht.filter(
        subset.contains(kin_ht.i) | subset.contains(kin_ht.j))

    # Key the Table
    kin_ht = kin_ht.key_by("i", "j")

    # Setup output file
    out_summary = hl.hadoop_open(
        f"{output_dir}/{output_name}_ped_check_summary.txt", "w")

    if filter_kinship_ht:
        logger.info(
            "Filtering kinship table to remove unrelated individuals from analysis..."
        )
        kin_ht = filter_kin_ht(kin_ht, out_summary)

    # Output basic stats
    out_summary.write("Number individuals in pedigree: " +
                      str(len(expected_samples)) + "\n")
    out_summary.write("Number individuals in subset from the VCF: " +
                      str(len(vcf_samples)) + "\n")
    out_summary.write("Number of relationships in the kinship table: " +
                      str(kin_ht.count()) + "\n\n")
    out_summary.close()

    seqr_projects, family_ids, given_sex = write_functional_pedigree(
        input_pedigree, vcf_samples, output_dir, output_name)

    # Compare inferred and given sex
    check_sex(sex_ht, output_dir, output_name)

    kin_ht = add_project_and_family_annotations(kin_ht, seqr_projects,
                                                family_ids)

    logger.info("Writing kinship ht per project...")
    # Output original ht per project
    for project in set(seqr_projects.values()):
        full_ht = kin_ht.filter((kin_ht.seqr_proj_i == project)
                                | (kin_ht.seqr_proj_j == project))
        full_ht.drop("seqr_proj_i", "seqr_proj_j").export(
            f"{output_dir}/{project}/{output_name}_{project}_annotated_kin.txt"
        )