Exemplo n.º 1
0
def compute_sex() -> hl.Table:
    """
    Build the per-sample sex imputation Table for gnomAD v3.

    Steps performed, in order:
        1. Sex chromosome ploidy is computed with `impute_sex_ploidy` on the v3 MT
           (hard-filtered samples kept), excluding the `telomeres_and_centromeres`
           intervals, and checkpointed to a temporary location.
        2. An F-stat is computed with `hl.impute_sex` on the `LGT` entries of chrX rows
           that have exactly two alleles. The alternate allele frequency is
           approximated as AC / (2 * number of samples) using the unsplit info Table.
        3. Ploidy cutoffs are derived with `get_ploidy_cutoffs` (f-stat cutoff of 0.5)
           and the fields produced by `get_sex_expr` are annotated onto the Table.

    :return: Table with ploidy, `hl.impute_sex` and `get_sex_expr` annotations.
    """

    def _keep_chrx(dataset):
        # Restrict a Table / MatrixTable to the chrX interval.
        return hl.filter_intervals(dataset, [hl.parse_locus_interval('chrX')])

    # Sex chromosome ploidy, checkpointed before the F-stat computation.
    ploidy_ht = impute_sex_ploidy(
        get_gnomad_v3_mt(remove_hard_filtered_samples=False),
        excluded_calling_intervals=telomeres_and_centromeres.ht(),
    )
    ploidy_ht = ploidy_ht.checkpoint('gs://gnomad-tmp/sex_depth.ht',
                                     overwrite=True)

    # F-stat input: chrX rows with exactly two alleles.
    x_mt = get_gnomad_v3_mt(
        key_by_locus_and_alleles=True,
        remove_hard_filtered_samples=False,
    )
    # The sample count is taken on the full MT, before any row filtering.
    n_samples = x_mt.count_cols()
    x_mt = _keep_chrx(x_mt)
    is_two_allele_site = hl.len(x_mt.alleles) == 2
    x_mt = x_mt.filter_rows(is_two_allele_site)

    # AF is approximated as AC / (2 * n_samples). This ignores missing genotypes
    # but avoids densifying the MT, which should be fine for this purpose.
    x_info_ht = _keep_chrx(get_info(split=False).ht())
    allele_count = x_info_ht[x_mt.row_key].info.AC[0]
    x_mt = x_mt.annotate_rows(aaf=allele_count / (2 * n_samples))

    f_stat_ht = hl.impute_sex(x_mt.LGT, aaf_threshold=0.001, aaf='aaf')
    sex_ht = ploidy_ht.annotate(**f_stat_ht[ploidy_ht.key])

    x_cutoff, y_cutoff = get_ploidy_cutoffs(sex_ht, f_stat_cutoff=0.5)
    karyotype_fields = get_sex_expr(
        sex_ht.chrX_ploidy,
        sex_ht.chrY_ploidy,
        x_cutoff,
        y_cutoff,
    )
    return sex_ht.annotate(**karyotype_fields)
Exemplo n.º 2
0
def annotate_sex(
    mtds: Union[hl.MatrixTable, hl.vds.VariantDataset],
    is_sparse: bool = True,
    excluded_intervals: Optional[hl.Table] = None,
    included_intervals: Optional[hl.Table] = None,
    normalization_contig: str = "chr20",
    reference_genome: str = "GRCh38",
    sites_ht: Optional[hl.Table] = None,
    aaf_expr: Optional[str] = None,
    gt_expr: str = "GT",
    f_stat_cutoff: float = 0.5,
    aaf_threshold: float = 0.001,
) -> hl.Table:
    """
    Impute sample sex based on X-chromosome heterozygosity and sex chromosome ploidy.

    Return Table with the following fields:
        - s (str): Sample
        - chr20_mean_dp (float32): Sample's mean coverage over chromosome 20.
        - chrX_mean_dp (float32): Sample's mean coverage over chromosome X.
        - chrY_mean_dp (float32): Sample's mean coverage over chromosome Y.
        - chrX_ploidy (float32): Sample's imputed ploidy over chromosome X.
        - chrY_ploidy (float32): Sample's imputed ploidy over chromosome Y.
        - f_stat (float64): Sample f-stat. Calculated using hl.impute_sex.
        - n_called (int64): Number of variants with a genotype call. Calculated using hl.impute_sex.
        - expected_homs (float64): Expected number of homozygotes. Calculated using hl.impute_sex.
        - observed_homs (int64): Observed number of homozygotes. Calculated using hl.impute_sex.
        - X_karyotype (str): Sample's chromosome X karyotype.
        - Y_karyotype (str): Sample's chromosome Y karyotype.
        - sex_karyotype (str): Sample's sex karyotype.

    The ploidy cutoffs and the f-stat cutoff that were used are stored as globals
    (`x_ploidy_cutoffs`, `y_ploidy_cutoffs`, `f_stat_cutoff`) on the returned Table.

    :param mtds: Input MatrixTable or VariantDataset
    :param bool is_sparse: Whether input MatrixTable is in sparse data format
    :param excluded_intervals: Optional table of intervals to exclude from the computation. Not supported for a VDS.
    :param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes.
    :param normalization_contig: Which chromosome to use to normalize sex chromosome coverage. Used in determining sex chromosome ploidies.
    :param reference_genome: Reference genome used for constructing interval list. Default: 'GRCh38'
    :param sites_ht: Optional Table to use. If present, filters input MatrixTable to sites in this Table prior to imputing sex,
                    and pulls alternate allele frequency from this Table.
    :param aaf_expr: Optional. Name of field in input MatrixTable with alternate allele frequency.
                    When `sites_ht` is given and this is None, 'AF' is assumed.
    :param gt_expr: Name of entry field storing the genotype. Default: 'GT'
    :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY are above cutoff.
    :param float aaf_threshold: Minimum alternate allele frequency to be used in f-stat calculations.
    :return: Table of samples and their imputed sex karyotypes.
    :raises NotImplementedError: If `excluded_intervals` is passed together with a VDS, or if a
        MatrixTable is passed with `is_sparse=False`.
    """
    logger.info("Imputing sex chromosome ploidies...")

    is_vds = isinstance(mtds, hl.vds.VariantDataset)
    if is_vds:
        if excluded_intervals is not None:
            raise NotImplementedError(
                "excluded_intervals is not used when imputing sex chromosome ploidy for VDS"
            )
        ploidy_ht = hl.vds.impute_sex_chromosome_ploidy(
            mtds,
            calling_intervals=included_intervals,
            normalization_contig=normalization_contig,
        )
        # Align the VDS ploidy field names with the ones used on the MatrixTable path.
        ploidy_ht = ploidy_ht.rename(
            {"x_ploidy": "chrX_ploidy", "y_ploidy": "chrY_ploidy"}
        )
        mt = mtds.variant_data
    else:
        mt = mtds
        if is_sparse:
            ploidy_ht = impute_sex_ploidy(
                mt, excluded_intervals, included_intervals, normalization_contig
            )
        else:
            raise NotImplementedError(
                "Imputing sex ploidy does not exist yet for dense data."
            )

    x_contigs = get_reference_genome(mt.locus).x_contigs
    logger.info("Filtering mt to biallelic SNPs in X contigs: %s", x_contigs)
    # If the MT carries a `was_split` row field, use it to drop sites that came from
    # splitting; otherwise fall back to requiring exactly two alleles.
    if "was_split" in list(mt.row):
        mt = mt.filter_rows((~mt.was_split) & hl.is_snp(mt.alleles[0], mt.alleles[1]))
    else:
        mt = mt.filter_rows(
            (hl.len(mt.alleles) == 2) & hl.is_snp(mt.alleles[0], mt.alleles[1])
        )

    # NOTE(review): `x_contigs` comes from the reference genome of `mt.locus`, while the
    # intervals are parsed with the `reference_genome` argument; the two are expected
    # to name the same build.
    mt = hl.filter_intervals(
        mt,
        [
            hl.parse_locus_interval(contig, reference_genome=reference_genome)
            for contig in x_contigs
        ],
        keep=True,
    )

    if sites_ht is not None:
        if aaf_expr is None:
            logger.warning(
                "sites_ht was provided, but aaf_expr is missing. Assuming name of field with alternate allele frequency is 'AF'."
            )
            aaf_expr = "AF"
        logger.info("Filtering to provided sites")
        # Pull all row fields of `sites_ht` onto the MT, then keep rows where the AAF is defined
        # (i.e. rows present in `sites_ht` with a non-missing frequency).
        mt = mt.annotate_rows(**sites_ht[mt.row_key])
        mt = mt.filter_rows(hl.is_defined(mt[aaf_expr]))

    logger.info("Calculating inbreeding coefficient on chrX")
    # The same cutoff is used for both thresholds.
    sex_ht = hl.impute_sex(
        mt[gt_expr],
        aaf_threshold=aaf_threshold,
        male_threshold=f_stat_cutoff,
        female_threshold=f_stat_cutoff,
        aaf=aaf_expr,
    )

    logger.info("Annotating sex ht with sex chromosome ploidies")
    sex_ht = sex_ht.annotate(**ploidy_ht[sex_ht.key])

    logger.info("Inferring sex karyotypes")
    x_ploidy_cutoffs, y_ploidy_cutoffs = get_ploidy_cutoffs(sex_ht, f_stat_cutoff)
    # Record the cutoffs that were used as globals on the output Table.
    sex_ht = sex_ht.annotate_globals(
        x_ploidy_cutoffs=hl.struct(
            upper_cutoff_X=x_ploidy_cutoffs[0],
            lower_cutoff_XX=x_ploidy_cutoffs[1][0],
            upper_cutoff_XX=x_ploidy_cutoffs[1][1],
            lower_cutoff_XXX=x_ploidy_cutoffs[2],
        ),
        y_ploidy_cutoffs=hl.struct(
            lower_cutoff_Y=y_ploidy_cutoffs[0][0],
            upper_cutoff_Y=y_ploidy_cutoffs[0][1],
            lower_cutoff_YY=y_ploidy_cutoffs[1],
        ),
        f_stat_cutoff=f_stat_cutoff,
    )
    return sex_ht.annotate(
        **get_sex_expr(
            sex_ht.chrX_ploidy, sex_ht.chrY_ploidy, x_ploidy_cutoffs, y_ploidy_cutoffs
        )
    )
Exemplo n.º 3
0
def main(args):
    """
    Run the sample QC pipeline steps selected by the flags on `args`.

    Every step is gated by a boolean attribute of `args`; the steps that are enabled
    run in the order they appear below and write their results to the paths of the
    corresponding resources. `args.reannotate_sex` is only considered when
    `args.impute_sex` is not set.

    :param args: Parsed command-line arguments (presumably an `argparse.Namespace` — confirm
        against the caller). Attributes read here: `sample_qc`, `compute_qc_mt`, `impute_sex`,
        `reannotate_sex`, `compute_hard_filters`, `min_cov`, `run_pc_relate`, `run_pca`,
        `kin_threshold`, `include_unreleasable_samples`, `n_pcs`, `assign_pops`,
        `min_pop_prob`, `apply_stratified_filters`, `apply_regressed_filters`,
        `filtering_qc_metrics`, `compute_related_samples_to_drop`, `generate_metadata`
        and `overwrite`.
    :return: None
    """
    hl.init(log='/hail.log', default_reference='GRCh38')

    if args.sample_qc:
        compute_sample_qc().write(get_sample_qc().path,
                                  overwrite=args.overwrite)

    if args.compute_qc_mt:
        compute_qc_mt().write(v3_qc.path, overwrite=args.overwrite)

    if args.impute_sex:
        compute_sex().write(v3_sex.path, overwrite=args.overwrite)
    elif args.reannotate_sex:
        # Copy HT to temp location so the original can be overwritten with the new annotation
        sex_ht = v3_sex.ht().checkpoint(
            'gs://gnomad-tmp/sex_ht_checkpoint.ht',
            overwrite=True)
        x_ploidy_cutoff, y_ploidy_cutoff = get_ploidy_cutoffs(
            sex_ht, f_stat_cutoff=0.5)
        sex_ht = sex_ht.annotate(
            **get_sex_expr(sex_ht.chrX_ploidy, sex_ht.chrY_ploidy,
                           x_ploidy_cutoff, y_ploidy_cutoff))
        sex_ht.write(v3_sex.path, overwrite=args.overwrite)

    if args.compute_hard_filters:
        compute_hard_filters(args.min_cov).write(hard_filtered_samples.path,
                                                 overwrite=args.overwrite)

    if args.run_pc_relate:
        logger.info('Running PC-Relate')
        # `Logger.warn` is a deprecated alias of `Logger.warning`.
        logger.warning(
            "PC-relate requires SSDs and doesn't work with preemptible workers!"
        )
        qc_mt = v3_qc.mt()
        # Only the scores are needed; eigenvalues and loadings are discarded.
        _, scores, _ = hl.hwe_normalized_pca(qc_mt.GT,
                                             k=10,
                                             compute_loadings=False)
        scores = scores.checkpoint(v3_pc_relate_pca_scores.path,
                                   overwrite=args.overwrite,
                                   _read_if_exists=not args.overwrite)
        relatedness_ht = hl.pc_relate(qc_mt.GT,
                                      min_individual_maf=0.01,
                                      scores_expr=scores[qc_mt.col_key].scores,
                                      block_size=4096,
                                      min_kinship=0.05,
                                      statistics='all')
        relatedness_ht.write(v3_relatedness.path, overwrite=args.overwrite)

    if args.run_pca:
        rank_ht = compute_sample_rankings(
            use_qc_metrics_filters=False
        )  # QC metrics filters do not exist at this point
        rank_ht = rank_ht.checkpoint(pca_samples_rankings.path,
                                     overwrite=args.overwrite,
                                     _read_if_exists=not args.overwrite)
        filtered_samples = hl.literal(
            rank_ht.aggregate(
                hl.agg.filter(rank_ht.filtered,
                              hl.agg.collect_as_set(rank_ht.s)))
        )  # TODO: don't localize once hail bug is fixed
        samples_to_drop = compute_related_samples_to_drop(
            v3_relatedness.ht(),
            rank_ht,
            args.kin_threshold,
            filtered_samples=filtered_samples)
        # Hail Tables are immutable: keep the checkpointed Table so that the PCA below
        # reads the written result instead of re-running the computation.
        samples_to_drop = samples_to_drop.checkpoint(
            pca_related_samples_to_drop.path,
            overwrite=args.overwrite,
            _read_if_exists=not args.overwrite)
        pop_pca_eigenvalues, pop_pca_scores_ht, pop_pca_loadings_ht = run_pca(
            args.include_unreleasable_samples, args.n_pcs, samples_to_drop)
        pop_pca_scores_ht.write(
            get_ancestry_pca_scores(args.include_unreleasable_samples).path,
            overwrite=args.overwrite)
        pop_pca_loadings_ht.write(
            get_ancestry_pca_loadings(args.include_unreleasable_samples).path,
            overwrite=args.overwrite)
        # Eigenvalues are stored as a single comma-separated line.
        with hl.utils.hadoop_open(
                get_ancestry_pca_eigenvalues_path(
                    args.include_unreleasable_samples),
                mode='w') as f:
            f.write(",".join([str(x) for x in pop_pca_eigenvalues]))

    if args.assign_pops:
        pop_ht, pops_rf_model = assign_pops(args.min_pop_prob,
                                            args.include_unreleasable_samples)
        pop_ht = pop_ht.checkpoint(pop.path,
                                   overwrite=args.overwrite,
                                   _read_if_exists=not args.overwrite)
        # Export the first 10 PCs as individual PC1..PC10 columns.
        pop_ht.transmute(
            **{f'PC{i + 1}': pop_ht.pca_scores[i]
               for i in range(10)}).export(pop_tsv_path)

        with hl.hadoop_open(pop_rf_path, 'wb') as out:
            pickle.dump(pops_rf_model, out)

    if args.apply_stratified_filters:
        apply_stratified_filters(args.filtering_qc_metrics.split(",")).write(
            stratified_metrics.path, overwrite=args.overwrite)

    if args.apply_regressed_filters:
        apply_regressed_filters(
            args.filtering_qc_metrics.split(","),
            args.include_unreleasable_samples).write(
                regressed_metrics.path, overwrite=args.overwrite)

    if args.compute_related_samples_to_drop:
        rank_ht = compute_sample_rankings(use_qc_metrics_filters=True)
        rank_ht = rank_ht.checkpoint(release_samples_rankings.path,
                                     overwrite=args.overwrite,
                                     _read_if_exists=not args.overwrite)
        filtered_samples = hl.literal(
            rank_ht.aggregate(
                hl.agg.filter(rank_ht.filtered,
                              hl.agg.collect_as_set(rank_ht.s)))
        )  # TODO: don't localize once hail bug is fixed
        print(filtered_samples)
        samples_to_drop = compute_related_samples_to_drop(
            v3_relatedness.ht(),
            rank_ht,
            args.kin_threshold,
            filtered_samples=filtered_samples)
        samples_to_drop.write(release_related_samples_to_drop.path,
                              overwrite=args.overwrite)

    if args.generate_metadata:
        meta_ht = generate_metadata()
        # Keep the checkpointed Table so the TSV export below is derived from exactly
        # what is stored at `meta.path` and the metadata is not recomputed.
        meta_ht = meta_ht.checkpoint(meta.path,
                                     overwrite=args.overwrite,
                                     _read_if_exists=not args.overwrite)
        # Use the smallest number of PCs present on any sample so every PC column is defined.
        n_pcs = meta_ht.aggregate(hl.agg.min(hl.len(meta_ht.pca_scores)))
        meta_ht = meta_ht.transmute(
            **{f'PC{i + 1}': meta_ht.pca_scores[i]
               for i in range(n_pcs)},
            # Empty filter collections are exported as missing rather than as empty strings.
            hard_filters=hl.or_missing(
                hl.len(meta_ht.hard_filters) > 0,
                hl.delimit(meta_ht.hard_filters)),
            qc_metrics_filters=hl.or_missing(
                hl.len(meta_ht.qc_metrics_filters) > 0,
                hl.delimit(meta_ht.qc_metrics_filters)))
        meta_ht.flatten().export(meta_tsv_path)
Exemplo n.º 4
0
def annotate_sex(
    mtds: Union[hl.MatrixTable, hl.vds.VariantDataset],
    is_sparse: bool = True,
    excluded_intervals: Optional[hl.Table] = None,
    included_intervals: Optional[hl.Table] = None,
    normalization_contig: str = "chr20",
    sites_ht: Optional[hl.Table] = None,
    aaf_expr: Optional[str] = None,
    gt_expr: str = "GT",
    f_stat_cutoff: float = 0.5,
    aaf_threshold: float = 0.001,
    variants_only_x_ploidy: bool = False,
    variants_only_y_ploidy: bool = False,
) -> hl.Table:
    """
    Impute sample sex based on X-chromosome heterozygosity and sex chromosome ploidy.

    Return Table with the following fields:
        - s (str): Sample
        - `normalization_contig`_mean_dp (float32): Sample's mean coverage over the specified `normalization_contig`.
        - chrX_mean_dp (float32): Sample's mean coverage over chromosome X.
        - chrY_mean_dp (float32): Sample's mean coverage over chromosome Y.
        - chrX_ploidy (float32): Sample's imputed ploidy over chromosome X.
        - chrY_ploidy (float32): Sample's imputed ploidy over chromosome Y.
        - f_stat (float64): Sample f-stat. Calculated using hl.impute_sex.
        - n_called (int64): Number of variants with a genotype call. Calculated using hl.impute_sex.
        - expected_homs (float64): Expected number of homozygotes. Calculated using hl.impute_sex.
        - observed_homs (int64): Observed number of homozygotes. Calculated using hl.impute_sex.
        - X_karyotype (str): Sample's chromosome X karyotype.
        - Y_karyotype (str): Sample's chromosome Y karyotype.
        - sex_karyotype (str): Sample's sex karyotype.

    When a ploidy estimate is computed from variant data only, the corresponding normalization
    contig mean DP is stored as `var_data_{normalization_contig}_mean_dp`.

    The cutoffs and the `variants_only_*` settings that were used are stored as globals on the
    returned Table.

    :param mtds: Input MatrixTable or VariantDataset
    :param bool is_sparse: Whether input MatrixTable is in sparse data format
    :param excluded_intervals: Optional table of intervals to exclude from the computation. Not supported for a VDS.
    :param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes.
    :param normalization_contig: Which chromosome to use to normalize sex chromosome coverage. Used in determining sex chromosome ploidies.
    :param sites_ht: Optional Table to use. If present, filters input MatrixTable to sites in this Table prior to imputing sex,
                    and pulls alternate allele frequency from this Table.
    :param aaf_expr: Optional. Name of field in input MatrixTable with alternate allele frequency.
                    When `sites_ht` is given and this is None, 'AF' is assumed.
    :param gt_expr: Name of entry field storing the genotype. Default: 'GT'
    :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY are above cutoff.
    :param float aaf_threshold: Minimum alternate allele frequency to be used in f-stat calculations.
    :param variants_only_x_ploidy: Whether to use depth of only variant data for the x ploidy estimation.
    :param variants_only_y_ploidy: Whether to use depth of only variant data for the y ploidy estimation.
    :return: Table of samples and their imputed sex karyotypes.
    :raises NotImplementedError: If `excluded_intervals` is passed together with a VDS, or if a
        MatrixTable is passed with `is_sparse=False`.
    """
    logger.info("Imputing sex chromosome ploidies...")

    is_vds = isinstance(mtds, hl.vds.VariantDataset)
    if is_vds:
        if excluded_intervals is not None:
            raise NotImplementedError(
                "The use of the parameter 'excluded_intervals' is currently not implemented for imputing sex chromosome ploidy on a VDS!"
            )
        # Begin by creating a ploidy estimate HT using the method defined by 'variants_only_x_ploidy'
        ploidy_ht = hl.vds.impute_sex_chromosome_ploidy(
            mtds,
            calling_intervals=included_intervals,
            normalization_contig=normalization_contig,
            use_variant_dataset=variants_only_x_ploidy,
        )
        ploidy_ht = ploidy_ht.rename({
            "x_ploidy": "chrX_ploidy",
            "y_ploidy": "chrY_ploidy",
            "x_mean_dp": "chrX_mean_dp",
            "y_mean_dp": "chrY_mean_dp",
            "autosomal_mean_dp":
            f"var_data_{normalization_contig}_mean_dp"
            if variants_only_x_ploidy else f"{normalization_contig}_mean_dp",
        })
        # If 'variants_only_y_ploidy' is different from 'variants_only_x_ploidy' then re-run the ploidy estimation using
        # the method defined by 'variants_only_y_ploidy' and re-annotate with the modified ploidy estimates.
        if variants_only_y_ploidy != variants_only_x_ploidy:
            y_ploidy_ht = hl.vds.impute_sex_chromosome_ploidy(
                mtds,
                calling_intervals=included_intervals,
                normalization_contig=normalization_contig,
                use_variant_dataset=variants_only_y_ploidy,
            )
            y_ploidy_idx = y_ploidy_ht[ploidy_ht.key]
            ploidy_ht = ploidy_ht.annotate(
                chrY_ploidy=y_ploidy_idx.y_ploidy,
                chrY_mean_dp=y_ploidy_idx.y_mean_dp,
            )

            # If `variants_only_y_ploidy` is True, add the normalization contig mean DP of this run under a name
            # indicating that it is the variant dataset only mean DP (this will have already been added if
            # 'variants_only_x_ploidy' was also True).
            if variants_only_y_ploidy:
                ploidy_ht = ploidy_ht.annotate(
                    **{
                        f"var_data_{normalization_contig}_mean_dp":
                        y_ploidy_idx.autosomal_mean_dp
                    })

        mt = mtds.variant_data
    else:
        mt = mtds
        if is_sparse:
            ploidy_ht = impute_sex_ploidy(
                mt,
                excluded_intervals,
                included_intervals,
                normalization_contig,
                use_only_variants=variants_only_x_ploidy,
            )
            ploidy_ht = ploidy_ht.rename({
                "autosomal_mean_dp":
                f"var_data_{normalization_contig}_mean_dp"
                if variants_only_x_ploidy else f"{normalization_contig}_mean_dp",
            })
            # If 'variants_only_y_ploidy' is different from 'variants_only_x_ploidy' then re-run the ploidy estimation
            # using the method defined by 'variants_only_y_ploidy' and re-annotate with the modified ploidy estimates.
            if variants_only_y_ploidy != variants_only_x_ploidy:
                y_ploidy_ht = impute_sex_ploidy(
                    mt,
                    excluded_intervals,
                    included_intervals,
                    normalization_contig,
                    use_only_variants=variants_only_y_ploidy,
                )
                # Hail Tables are immutable, so the result of `select` must be kept. Without it, the
                # annotate below would also overwrite the chrX ploidy / mean DP with the values of this
                # second run and the 'variants_only_x_ploidy' setting would effectively be ignored.
                # NOTE(review): the rename above expects `impute_sex_ploidy` to emit `autosomal_mean_dp`
                # while this select expects `{normalization_contig}_mean_dp` — confirm against the helper.
                y_ploidy_ht = y_ploidy_ht.select(
                    "chrY_ploidy",
                    "chrY_mean_dp",
                    f"{normalization_contig}_mean_dp",
                )
                # If `variants_only_y_ploidy` is True, rename the normalization contig mean DP of the Y run
                # (not the one of the X run) to indicate that it is the variant data only mean DP (this will
                # have already been added if 'variants_only_x_ploidy' was also True).
                if variants_only_y_ploidy:
                    y_ploidy_ht = y_ploidy_ht.rename({
                        f"{normalization_contig}_mean_dp":
                        f"var_data_{normalization_contig}_mean_dp"
                    })
                # Re-annotate the ploidy HT with modified Y ploidy annotations
                ploidy_ht = ploidy_ht.annotate(**y_ploidy_ht[ploidy_ht.key])

        else:
            raise NotImplementedError(
                "Imputing sex ploidy does not exist yet for dense data.")

    x_contigs = get_reference_genome(mt.locus).x_contigs
    logger.info("Filtering mt to biallelic SNPs in X contigs: %s", x_contigs)
    # If the MT carries a `was_split` row field, use it to drop sites that came from
    # splitting; otherwise fall back to requiring exactly two alleles.
    if "was_split" in list(mt.row):
        mt = mt.filter_rows((~mt.was_split)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1]))
    else:
        mt = mt.filter_rows((hl.len(mt.alleles) == 2)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1]))

    # Parse the X contig intervals against the MT's own reference genome.
    build = get_reference_genome(mt.locus).name
    mt = hl.filter_intervals(
        mt,
        [
            hl.parse_locus_interval(contig, reference_genome=build)
            for contig in x_contigs
        ],
        keep=True,
    )

    if sites_ht is not None:
        if aaf_expr is None:
            logger.warning(
                "sites_ht was provided, but aaf_expr is missing. Assuming name of field with alternate allele frequency is 'AF'."
            )
            aaf_expr = "AF"
        logger.info("Filtering to provided sites")
        # Pull all row fields of `sites_ht` onto the MT, then keep rows where the AAF is defined
        # (i.e. rows present in `sites_ht` with a non-missing frequency).
        mt = mt.annotate_rows(**sites_ht[mt.row_key])
        mt = mt.filter_rows(hl.is_defined(mt[aaf_expr]))

    logger.info("Calculating inbreeding coefficient on chrX")
    # The same cutoff is used for both thresholds.
    sex_ht = hl.impute_sex(
        mt[gt_expr],
        aaf_threshold=aaf_threshold,
        male_threshold=f_stat_cutoff,
        female_threshold=f_stat_cutoff,
        aaf=aaf_expr,
    )

    logger.info("Annotating sex ht with sex chromosome ploidies")
    sex_ht = sex_ht.annotate(**ploidy_ht[sex_ht.key])

    logger.info("Inferring sex karyotypes")
    x_ploidy_cutoffs, y_ploidy_cutoffs = get_ploidy_cutoffs(
        sex_ht, f_stat_cutoff)
    # Record the cutoffs and ploidy-estimation settings that were used as globals on the output Table.
    sex_ht = sex_ht.annotate_globals(
        x_ploidy_cutoffs=hl.struct(
            upper_cutoff_X=x_ploidy_cutoffs[0],
            lower_cutoff_XX=x_ploidy_cutoffs[1][0],
            upper_cutoff_XX=x_ploidy_cutoffs[1][1],
            lower_cutoff_XXX=x_ploidy_cutoffs[2],
        ),
        y_ploidy_cutoffs=hl.struct(
            lower_cutoff_Y=y_ploidy_cutoffs[0][0],
            upper_cutoff_Y=y_ploidy_cutoffs[0][1],
            lower_cutoff_YY=y_ploidy_cutoffs[1],
        ),
        f_stat_cutoff=f_stat_cutoff,
        variants_only_x_ploidy=variants_only_x_ploidy,
        variants_only_y_ploidy=variants_only_y_ploidy,
    )
    return sex_ht.annotate(
        **get_sex_expr(sex_ht.chrX_ploidy, sex_ht.chrY_ploidy,
                       x_ploidy_cutoffs, y_ploidy_cutoffs))