Example #1
def ld_prune(input_mt: hl.MatrixTable, build: str,
             gnomad_ld: bool) -> hl.MatrixTable:
    """
    LD prune the MatrixTable.

    :param input_mt: Input MatrixTable
    :param build: Reference genome build of the input MatrixTable ("GRCh37" or "GRCh38")
    :param gnomad_ld: Whether to use LD data from the gnomAD dataset for the pruning step
    :return: LD-pruned MatrixTable
    """
    if not gnomad_ld:
        mm_pruned = hl.ld_prune(input_mt.GT, r2=0.1)
        input_mt = input_mt.filter_rows(
            hl.is_defined(mm_pruned[input_mt.row_key]))
    else:
        # Borrow from gnomAD ld pruning
        if build == "GRCh37":
            pruned_mt = hl.read_matrix_table(
                qc_mt_path("joint", ld_pruned=True))

        elif build == "GRCh38":
            pruned_mt = hl.read_matrix_table(qc.path)

        input_mt = input_mt.filter_rows(
            hl.is_defined(pruned_mt.index_rows(input_mt.row_key)))

    return input_mt
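# A minimal usage sketch for ld_prune above, assuming Hail is initialized; the
# input path is hypothetical, and gnomad_ld=False runs the local pruning branch.
import hail as hl

mt = hl.read_matrix_table("gs://my-bucket/dataset.mt")  # hypothetical path
mt_pruned = ld_prune(mt, build="GRCh38", gnomad_ld=False)
print(mt_pruned.count_rows())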
def ld_prune_filter(intersect_out, prune_out, overwrite: bool = False):
    """Read the intersected MT, keep variants with 0.001 < AF < 0.999, LD-prune, and write."""
    mt = hl.read_matrix_table(intersect_out)
    print(mt.count())
    mt = hl.variant_qc(mt)
    mt_filt = mt.filter_rows((mt.variant_qc.AF[0] > 0.001)
                             & (mt.variant_qc.AF[0] < 0.999))
    print(mt_filt.count())

    mt_intersect_prune = hl.ld_prune(mt_filt.GT, r2=0.8, bp_window_size=500000)
    mt_intersect_pruned = mt_filt.filter_rows(
        hl.is_defined(mt_intersect_prune[mt_filt.row_key]))
    mt_intersect_pruned.write(prune_out, overwrite)
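# Hedged usage sketch for ld_prune_filter above; both paths are hypothetical
# and overwrite=True allows the step to be re-run.
ld_prune_filter(intersect_out="gs://my-bucket/intersected.mt",
                prune_out="gs://my-bucket/intersected_pruned.mt",
                overwrite=True)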
Example #3
def pca_filter_mt(in_mt: hl.MatrixTable,
                  maf: float = 0.05,
                  hwe: float = 1e-3,
                  call_rate: float = 0.98,
                  ld_cor: float = 0.2,
                  ld_window: int = 250000):
    """
    Filter a MatrixTable for PCA: keep common, well-called, strand-unambiguous
    variants outside the MHC and chr8 inversion regions, then LD-prune.
    """

    print("\nInitial number of SNPs before filtering: {}".format(
        in_mt.count_rows()))
    mt = hl.variant_qc(in_mt)
    print(f'\nFiltering out variants with MAF < {maf}')
    mt_filt = mt.annotate_rows(maf=hl.min(mt.variant_qc.AF))
    mt_filt = mt_filt.filter_rows(mt_filt.maf > maf)

    print(f'\nFiltering out variants with HWE < {hwe:.1e}')
    mt_filt = mt_filt.filter_rows(mt_filt.variant_qc.p_value_hwe > hwe)

    print(f'\nFiltering out variants with Call Rate < {call_rate}')
    mt_filt = mt_filt.filter_rows(mt_filt.variant_qc.call_rate >= call_rate)

    # no strand ambiguity
    print('\nFiltering out strand ambiguous variants')
    mt_filt = mt_filt.filter_rows(
        ~hl.is_strand_ambiguous(mt_filt.alleles[0], mt_filt.alleles[1]))

    # MHC chr6:25-35Mb
    # chr8.inversion chr8:7-13Mb
    print(
        '\nFiltering out variants in MHC [chr6:25M-35M] and chromosome 8 inversions [chr8:7M-13M]'
    )
    intervals = ['chr6:25M-35M', 'chr8:7M-13M']
    mt_filt = hl.filter_intervals(mt_filt, [
        hl.parse_locus_interval(x, reference_genome='GRCh38')
        for x in intervals
    ],
                                  keep=False)

    # This step is expensive (on local machine)
    print(
        f'\nLD pruning using correlation threshold of {ld_cor} and window size of {ld_window}'
    )
    mt_ld_prune = hl.ld_prune(mt_filt.GT, r2=ld_cor, bp_window_size=ld_window)
    mt_ld_pruned = mt_filt.filter_rows(
        hl.is_defined(mt_ld_prune[mt_filt.row_key]))
    print("\nNumber of SNPs after filtering: {}".format(
        mt_ld_pruned.count_rows()))

    return mt_ld_pruned
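# Hedged usage sketch for pca_filter_mt above: the path is hypothetical and the
# keyword values simply restate the defaults from the signature.
import hail as hl

mt = hl.read_matrix_table("gs://my-bucket/genotypes_grch38.mt")  # hypothetical
mt_for_pca = pca_filter_mt(mt, maf=0.05, hwe=1e-3, call_rate=0.98,
                           ld_cor=0.2, ld_window=250000)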
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')

    # filter to loci that are contained in both matrix tables after densifying
    tob_wgs = hl.experimental.densify(tob_wgs)

    # Entries and columns must be identical
    tob_wgs_select = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA))
    hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT)
    hgdp_1kg_select = hgdp_1kg_select.select_cols()
    # Join datasets
    hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select)
    # Add in metadata information
    hgdp_1kg_metadata = hgdp_1kg.cols()
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols(
        hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s])

    # choose variants based off of gnomAD v3 parameters
    hgdp1kg_tobwgs_joined = hl.variant_qc(hgdp1kg_tobwgs_joined)
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_rows(
        IB=hl.agg.inbreeding(hgdp1kg_tobwgs_joined.GT,
                             hgdp1kg_tobwgs_joined.variant_qc.AF[1]))
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.filter_rows(
        (hl.len(hgdp1kg_tobwgs_joined.alleles) == 2)
        & (hgdp1kg_tobwgs_joined.locus.in_autosome())
        & (hgdp1kg_tobwgs_joined.variant_qc.AF[1] > 0.01)
        & (hgdp1kg_tobwgs_joined.variant_qc.call_rate > 0.99)
        & (hgdp1kg_tobwgs_joined.IB.f_stat > -0.25))

    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.cache()
    nrows = hgdp1kg_tobwgs_joined.count_rows()
    print(f'hgdp1kg_tobwgs_joined.count_rows() = {nrows}')
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.sample_rows(
        NUM_ROWS_BEFORE_LD_PRUNE / nrows, seed=12345)

    pruned_variant_table = hl.ld_prune(hgdp1kg_tobwgs_joined.GT,
                                       r2=0.1,
                                       bp_window_size=500000)
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.filter_rows(
        hl.is_defined(pruned_variant_table[hgdp1kg_tobwgs_joined.row_key]))
    mt_path = f'{output}/tob_wgs_hgdp_1kg_filtered_variants.mt'
    hgdp1kg_tobwgs_joined.write(mt_path)
Example #5
def query(output):
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt_path = f'{output}/filtered_mt.mt'
    mt = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    # reproduce gnomAD genotype filtering
    mt = annotate_adj(mt)
    mt = mt.filter_entries(mt.adj)
    mt = hl.variant_qc(mt)
    # Filter to common and biallelic variants
    mt = mt.filter_rows((hl.len(mt.alleles) == 2)
                        & (mt.variant_qc.AF[1] > 0.05))
    pruned_variant_table = hl.ld_prune(mt.GT, r2=0.2, bp_window_size=500000)
    filtered_mt = mt.filter_rows(
        hl.is_defined(pruned_variant_table[mt.row_key]))
    # save filtered mt table
    filtered_mt.write(mt_path, overwrite=True)
Example #6
def ld_prune_filter(mt: hl.MatrixTable, mt_ld: str, overwrite: bool = False):
    """
    Runs variant QC, filters out rare variants and variants with missingness, and LD-prunes to independent variants.
    :param mt: Matrix table to run variant QC on and filter variants from
    :param mt_ld: Path to write intermediate filtered mt
    :param overwrite: if True, overwrites existing data
    :return:
    """
    mt.describe()
    mt = hl.variant_qc(mt)
    # mt_filt = mt.filter_rows((mt.variant_qc.AF[0] > 0.01) & (mt.variant_qc.AF[0] < 0.99))
    mt_filt = mt.filter_rows((mt.variant_qc.AF[0] > 0.05)
                             & (mt.variant_qc.AF[0] < 0.95)
                             & (mt.variant_qc.call_rate > 0.999))

    # pruned = hl.ld_prune(mt_filt.GT, r2=0.2, bp_window_size=500000)
    pruned = hl.ld_prune(mt_filt.GT, r2=0.1, bp_window_size=500000)
    mt_filt = mt_filt.filter_rows(hl.is_defined(pruned[mt_filt.row_key]))
    mt_filt.write(mt_ld, overwrite)
Example #7
def ld_prune(mt, args):
    """
    LD prune a matrix table for calculating kinship and principal components.

    :param mt: matrix table to annotate, should already have related individuals removed.
    :param args: namespace object with threshold arguments
    :return: returns the ld pruned matrix table
    """

    pruned_variant_table = hl.ld_prune(mt.GT,
                                       r2=args.r2,
                                       bp_window_size=args.bp_window_size)
    mt_ldpruned = mt.filter_rows(
        hl.is_defined(pruned_variant_table[mt.row_key]))

    logging.info(
        f"Variant and sample count after LD pruning: {mt_ldpruned.count()}")

    mt_ldpruned = mt_ldpruned.annotate_globals(
        ld_pruning_parameters={
            'r2': args.r2,
            'bp_window_size': args.bp_window_size
        })
    return mt_ldpruned
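# Hedged sketch: ld_prune above only reads args.r2 and args.bp_window_size, so
# an argparse-style Namespace is enough to drive it. The input path is
# hypothetical and the thresholds are illustrative, not the author's values.
from argparse import Namespace

import hail as hl

mt = hl.read_matrix_table("gs://my-bucket/unrelated_samples.mt")  # hypothetical
args = Namespace(r2=0.1, bp_window_size=500000)
mt_ldpruned = ld_prune(mt, args)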
Example #8
def ld_prune_profile_25(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)
    hl.ld_prune(mt.GT)._force_count()
def get_qc_mt(
    mt: hl.MatrixTable,
    adj_only: bool = True,
    min_af: Optional[float] = 0.001,
    min_callrate: Optional[float] = 0.99,
    min_inbreeding_coeff_threshold: Optional[float] = -0.8,
    min_hardy_weinberg_threshold: Optional[float] = 1e-8,
    apply_hard_filters: bool = True,
    ld_r2: Optional[float] = 0.1,
    filter_lcr: bool = True,
    filter_decoy: bool = True,
    filter_segdup: bool = True,
    filter_exome_low_coverage_regions: bool = False,
    high_conf_regions: Optional[List[str]] = None,
) -> hl.MatrixTable:
    """
    Creates a QC-ready MT by keeping:

    - Variants outside known problematic regions
    - Bi-allelic SNVs only
    - Variants passing hard thresholds
    - Variants passing the set call rate and MAF thresholds
    - Genotypes passing gnomAD ADJ criteria (GQ>=20, DP>=10, AB>0.2 for hets)

    In addition, the MT will be LD-pruned if `ld_r2` is set.

    :param mt: Input MT
    :param adj_only: If set, only ADJ genotypes are kept. This filter is applied before the call rate and AF calculation.
    :param min_af: Minimum allele frequency to keep. Not applied if set to ``None``.
    :param min_callrate: Minimum call rate to keep. Not applied if set to ``None``.
    :param min_inbreeding_coeff_threshold: Minimum site inbreeding coefficient to keep. Not applied if set to ``None``.
    :param min_hardy_weinberg_threshold: Minimum site HW test p-value to keep. Not applied if set to ``None``.
    :param apply_hard_filters: Whether to apply standard GATK default site hard filters: QD >= 2, FS <= 60 and MQ >= 30
    :param ld_r2: Minimum r2 to keep when LD-pruning (set to `None` for no LD pruning)
    :param filter_lcr: Filter LCR regions
    :param filter_decoy: Filter decoy regions
    :param filter_segdup: Filter segmental duplication regions
    :param filter_exome_low_coverage_regions: If set, only high coverage exome regions (computed from gnomAD) are kept
    :param high_conf_regions: If given, the data will be filtered to only include variants in those regions
    :return: Filtered MT
    """
    logger.info("Creating QC MatrixTable")
    if ld_r2 is not None:
        logger.warning(
            "The LD-prune step of this function requires non-preemptible workers only!"
        )

    # qc_mt = filter_low_conf_regions(
    #     mt,
    #     filter_lcr=filter_lcr,
    #     filter_decoy=filter_decoy,
    #     filter_segdup=filter_segdup,
    #     filter_exome_low_coverage_regions=filter_exome_low_coverage_regions,
    #     high_conf_regions=high_conf_regions,
    # )

    # if adj_only:
    #     qc_mt = filter_to_adj(
    #         qc_mt
    #     )  # TODO: Make sure that this works fine before call rate filtering

    qc_mt = filter_rows_for_qc(
        mt,
        min_af,
        min_callrate,
        min_inbreeding_coeff_threshold,
        min_hardy_weinberg_threshold,
        apply_hard_filters,
    )

    if ld_r2 is not None:
        qc_mt = qc_mt.persist()
        unfiltered_qc_mt = qc_mt.unfilter_entries()
        pruned_ht = hl.ld_prune(unfiltered_qc_mt.GT, r2=ld_r2)
        qc_mt = qc_mt.filter_rows(hl.is_defined(pruned_ht[qc_mt.row_key]))

    qc_mt = qc_mt.annotate_globals(qc_mt_params=hl.struct(
        adj_only=adj_only,
        min_af=min_af if min_af is not None else hl.null(hl.tfloat32),
        min_callrate=min_callrate
        if min_callrate is not None else hl.null(hl.tfloat32),
        inbreeding_coeff_threshold=min_inbreeding_coeff_threshold
        if min_inbreeding_coeff_threshold is not None else hl.null(hl.tfloat32),
        min_hardy_weinberg_threshold=min_hardy_weinberg_threshold
        if min_hardy_weinberg_threshold is not None else hl.null(hl.tfloat32),
        apply_hard_filters=apply_hard_filters,
        ld_r2=ld_r2 if ld_r2 is not None else hl.null(hl.tfloat32),
        filter_exome_low_coverage_regions=filter_exome_low_coverage_regions,
        high_conf_regions=high_conf_regions
        if high_conf_regions is not None else hl.null(hl.tarray(hl.tstr)),
    ))
    return qc_mt.annotate_cols(
        sample_callrate=hl.agg.fraction(hl.is_defined(qc_mt.GT)))
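# Hedged usage sketch for get_qc_mt above; the input path is hypothetical and
# the keyword values restate the signature's defaults, so LD pruning runs.
import hail as hl

mt = hl.read_matrix_table("gs://my-bucket/raw_genotypes.mt")  # hypothetical
qc_mt = get_qc_mt(mt, min_af=0.001, min_callrate=0.99, ld_r2=0.1)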
    hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
    hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

    bed_to_exclude_pca = hl.import_bed(
        f"{temp_dir}/1000g/price_high_ld.bed.txt", reference_genome='GRCh38')
    project_mt = hl.read_matrix_table(
        f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt"
    )

    mt_vqc_filtered = hl.read_matrix_table(
        f"{temp_dir}/ddd-elgh-ukbb/1000g_chr1_20_snps_filtered.mt")
    # ld pruning
    logger.info("ld pruning and writing to disk")
    #pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.2, bp_window_size=500000)
    pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.1)

    pruned_mt = mt_vqc_filtered.filter_rows(
        hl.is_defined(pruned_ht[mt_vqc_filtered.row_key]))

    pruned_mt = pruned_mt.filter_rows(hl.is_defined(
        bed_to_exclude_pca[pruned_mt.locus]),
                                      keep=False)

    pruned_mt.write(
        f"{tmp_dir}/ddd-elgh-ukbb/1000g_chr1_20_snps_filtered_ldpruned.mt",
        overwrite=True)
    # run pca
    logger.info("run pca")
    pca_evals, pca_scores, loadings_ht = hl.hwe_normalized_pca(
        pruned_mt.GT, k=10, compute_loadings=True)
Example #11
# filter 5% AF
onekg = hl.variant_qc(onekg)
onekg = onekg.filter_rows(onekg.variant_qc.AF > 0.05, keep=True)

# unphase
onekg2 = onekg.annotate_entries(
    GT=hl.call(onekg.GT[0], onekg.GT[1], phased=False))
onekg2.GT.phased.show()

onekg = onekg2
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ld prune
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

onekg = hl.ld_prune(onekg, n_cores=800, r2=0.2)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# write vds
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

onekg.write(onekg_ldpruned_file, overwrite=True)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# write plink
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print('export plink')
hl.export_plink(onekg, onekg_plink_prefix, fam_id=onekg.s, id=onekg.s)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# pca
Example #12
possibleGT

# Show unique possible calls occurring in the entire dataset
unique_allelecalls = mt.aggregate_rows(
    hl.struct(ref=hl.agg.collect_as_set(mt.alleles[0]),
              alt=hl.agg.collect_as_set(mt.alleles[1])))
pprint(unique_allelecalls)

######## 2. QUALITY CONTROL VARIANTS
######## 2.1 Optional: Pruning in Linkage disequilibrium

# Function works only on biallelic data
biallelic_mt = mt.filter_rows(hl.len(mt.alleles) == 2)
# Prune with window size of 44 kb for Caucasian and 22 kb for African
pruned_t = hl.ld_prune(biallelic_mt.GT,
                       r2=0.8,
                       bp_window_size=44000,
                       memory_per_core=128)
mt = mt.filter_rows(hl.is_defined(pruned_t[mt.row_key]))

######### 2.2 Hardy-Weinberg equilibrium
# HWE separately for each ethnic group, on only controls.
mt_NHW = mt.filter_cols(mt.Race == "Caucasian")
mt_NHW = mt_NHW.annotate_rows(hwe_ctrl=hl.agg.filter(
    mt_NHW.Affection == 'Control', hl.agg.hardy_weinberg_test(mt_NHW.GT)))
mt_NHW = mt_NHW.filter_rows(mt_NHW.hwe_ctrl.p_value > 10**-5)
mt_AA = mt.filter_cols(mt.Race == "African American")
mt_AA = mt_AA.annotate_rows(hwe_ctrl=hl.agg.filter(
    mt_AA.Affection == 'Control', hl.agg.hardy_weinberg_test(mt_AA.GT)))
mt_AA = mt_AA.filter_rows(mt_AA.hwe_ctrl.p_value > 10**-5)
# Merge of NHW and AA by columns (samples)
mt = mt_AA.union_cols(mt_NHW)
Example #13
def main(args):
    hl.init(log='/sample_qc.log', tmp_dir='hdfs:///pc_relate.tmp/')

    if not args.load_joint_pruned_qc_mt:
        logger.info('Joining exomes and genomes...')
        exome_qc_mt = read_and_pre_process_data(
            qc_mt_path('exomes'), qc_ht_path('exomes', 'hard_filters'))
        genome_qc_mt = read_and_pre_process_data(
            qc_mt_path('genomes'), qc_ht_path('genomes', 'hard_filters'))

        joint_qc_mt = exome_qc_mt.union_cols(
            genome_qc_mt)  # NOTE: this is an inner join on rows
        joint_qc_mt = joint_qc_mt.filter_rows(
            (hl.agg.mean(joint_qc_mt.GT.n_alt_alleles()) / 2 > 0.001)
            & (hl.agg.fraction(hl.is_defined(joint_qc_mt.GT)) > 0.99))
        joint_qc_mt.write(qc_mt_path('joint'), args.overwrite)

        logger.info('LD-pruning joint mt of exomes and genomes...')
        joint_qc_mt = hl.read_matrix_table(qc_mt_path('joint'))
        variants, samples = joint_qc_mt.count()
        logger.info('Pruning {0} variants in {1} samples'.format(
            variants, samples))
        joint_qc_pruned_ht = hl.ld_prune(joint_qc_mt.GT, r2=0.1)
        # Note writing the LD-pruned MT is probably overkill
        # vs using `filter_rows` to filter sites based on the LD-pruned HT.
        joint_qc_pruned_mt = joint_qc_mt.filter_rows(
            hl.is_defined(joint_qc_pruned_ht[joint_qc_mt.row_key]))
        joint_qc_pruned_mt.write(qc_mt_path('joint', ld_pruned=True),
                                 args.overwrite)

    pruned_mt = hl.read_matrix_table(qc_mt_path('joint', ld_pruned=True))
    variants, samples = pruned_mt.count()
    logger.info('{0} samples, {1} variants found in LD-pruned joint MT'.format(
        samples, variants))

    if not args.skip_pc_relate:
        logger.info('Running PCA for PC-Relate...')
        eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT,
                                               k=10,
                                               compute_loadings=False)
        scores.write(
            qc_temp_data_prefix('joint') + '.pruned.pca_scores.ht',
            args.overwrite)

        logger.info('Running PC-Relate...')
        scores = hl.read_table(
            qc_temp_data_prefix('joint') + '.pruned.pca_scores.ht')
        # NOTE: This needs SSDs on your workers (for the temp files) and no pre-emptibles while the BlockMatrix writes
        relatedness_ht = hl.pc_relate(
            pruned_mt.GT,
            min_individual_maf=0.05,
            scores_expr=scores[pruned_mt.col_key].scores,
            block_size=4096,
            min_kinship=0.05,
            statistics='kin2')
        relatedness_ht.write(relatedness_ht_path, args.overwrite)

    relatedness_ht = hl.read_table(relatedness_ht_path)

    if not args.skip_relatedness:
        infer_ped(GnomADRelatedData('exomes'))
        infer_ped(GnomADRelatedData('genomes'))

        logger.info('Making rank file...')
        rank_table = make_rank_file(rank_annotations_path('joint'))
        logger.info('Finished making rank file...')

        related_samples_to_drop_ranked = get_related_samples_to_drop(
            rank_table, relatedness_ht)
        related_samples_to_drop_ranked.write(
            qc_temp_data_prefix('joint') + '.related_samples_to_drop.ht',
            args.overwrite)

    pca_mt, related_mt = split_mt_by_relatedness(pruned_mt)

    if not args.skip_pop_pca:
        variants, samples = pca_mt.count()
        logger.info('{} samples after removing relateds'.format(samples))
        # TODO: Check that there are no longer any 2nd-degree relateds in the callset by running KING on the output file below
        plink_mt = pca_mt.annotate_cols(
            uid=pca_mt.data_type + '_' +
            pca_mt.s.replace(" ", "_").replace("/", "_")).key_cols_by('uid')
        hl.export_plink(plink_mt,
                        qc_temp_data_prefix('joint') + '.unrelated.plink',
                        fam_id=plink_mt.uid,
                        ind_id=plink_mt.uid)

        logger.info(
            'Computing population PCs and annotating with known population labels...'
        )
        pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
            pca_mt.GT, k=20, compute_loadings=True)
        pca_af_ht = pca_mt.annotate_rows(
            pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2).rows()
        pca_loadings = pca_loadings.annotate(
            pca_af=pca_af_ht[pca_loadings.key].pca_af)
        pca_scores.write(ancestry_pca_scores_ht_path(), args.overwrite)
        pca_loadings.write(ancestry_pca_loadings_ht_path(), args.overwrite)

    pca_scores = hl.read_table(ancestry_pca_scores_ht_path())
    pca_loadings = hl.read_table(ancestry_pca_loadings_ht_path())
    pca_mt = pca_mt.annotate_cols(scores=pca_scores[pca_mt.col_key].scores)

    variants, samples = related_mt.count()
    logger.info(
        'Projecting population PCs for {} related samples...'.format(samples))
    related_scores = pc_project(related_mt, pca_loadings)
    relateds = related_mt.cols()
    relateds = relateds.annotate(scores=related_scores[relateds.key].scores)

    logger.info('Assigning population annotations...')
    pop_colnames = ['related', 'known_pop', 'scores']
    pop_annots_ht = hl.import_table(known_population_annotations,
                                    impute=True).key_by('combined_sample')

    joint_ht = pca_mt.cols().union(relateds)
    joint_ht = joint_ht.annotate(
        known_pop=pop_annots_ht[joint_ht.data_type.replace('s', '') + '_' +
                                joint_ht.s.replace(' ', '_')].known_pop
    )  # FIXME: temporarily doing the underscore thing until known_population_annotations is fixed
    joint_pca_ht = joint_ht.select(*pop_colnames)
    joint_pca_ht, joint_pca_fit = run_assign_population_pcs(
        joint_pca_ht,
        qc_temp_data_prefix('joint') + '.RF_pop_assignments.txt.bgz',
        qc_temp_data_prefix('joint') + '.RF_fit.pkl',
        pcs=list(range(1, 7)))
    joint_ht = joint_ht.annotate(pop=joint_pca_ht[joint_ht.key].pop).select(
        'pop', *pop_colnames)

    # Add special Estonian pop category for genomes
    estonian_ht = (hl.import_table(estonian_batches, impute=True).annotate(
        data_type='genomes').key_by('data_type', 'sample'))
    joint_ht = joint_ht.annotate(batch=estonian_ht[joint_ht.key].batch)
    joint_ht = joint_ht.annotate(qc_pop=hl.case(missing_false=True).when(
        hl.is_defined(joint_ht.pop) & (joint_ht.batch == 1), 'est_b1'
    ).when(hl.is_defined(joint_ht.pop)
           & (joint_ht.batch == 2), 'est_b2').default(joint_ht.pop)).persist()

    # These are keyed by only `s`
    genome_mt = get_gnomad_data('genomes',
                                adj=False,
                                split=False,
                                meta_root=None).select_cols()
    exome_mt = get_gnomad_data('exomes',
                               adj=False,
                               split=False,
                               meta_root=None).select_cols()

    # Population-specific filtering
    if not args.skip_calculate_sample_metrics:
        logger.info(
            'Running mini sample QC for platform- and population-specific filtering...'
        )
        gnomad_sample_qc(exome_mt).cols().select('sample_qc').write(
            qc_temp_data_prefix('exomes') + '.sample_qc.ht', args.overwrite)
        gnomad_sample_qc(genome_mt).cols().select('sample_qc').write(
            qc_temp_data_prefix('genomes') + '.sample_qc.ht', args.overwrite)
        # TODO: check that the pcr_free annotations are complete once samples are updated from Jessica's spreadsheet

    logger.info('Annotating population and platform assignments...')
    platform_ht = hl.read_table(qc_ht_path('exomes', 'platforms'))
    exome_ht = exome_mt.cols()
    exome_ht = exome_ht.annotate(
        qc_platform=platform_ht.key_by('s')[exome_ht.s].qc_platform,
        **joint_ht.filter(
            joint_ht.data_type == 'exomes').key_by('s')[exome_ht.s])

    genome_meta_ht = hl.read_table(qc_ht_path('genomes', 'hard_filters'))
    genome_ht = genome_mt.cols()
    genome_ht = genome_ht.annotate(
        qc_platform=genome_meta_ht.key_by('s')[genome_ht.s].qc_platform,
        **joint_ht.filter(
            joint_ht.data_type == 'genomes').key_by('s')[genome_ht.s])

    exome_sample_qc_ht = hl.read_table(
        qc_temp_data_prefix('exomes') + '.sample_qc.ht')
    genome_sample_qc_ht = hl.read_table(
        qc_temp_data_prefix('genomes') + '.sample_qc.ht')

    exome_ht = exome_ht.annotate(**exome_sample_qc_ht[exome_ht.s])
    genome_ht = genome_ht.annotate(**genome_sample_qc_ht[genome_ht.s])

    # For each population, aggregate sample QC metrics and calculate the MAD/mean/stdev
    logger.info(
        'Calculating platform- and population-specific sample QC thresholds...'
    )
    exome_qc_metrics = [
        'n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion',
        'n_deletion', 'r_het_hom_var'
    ]
    exome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        exome_ht, exome_qc_metrics, ['qc_pop', 'qc_platform'])
    exome_ht = exome_ht.annotate_globals(
        **hl.eval(exome_pop_platform_filter_ht.globals))
    exome_ht = exome_ht.annotate(
        **exome_pop_platform_filter_ht[exome_ht.key]).persist()

    genome_qc_metrics = [
        'n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion',
        'n_deletion', 'r_het_hom_var'
    ]
    genome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        genome_ht, genome_qc_metrics, ['qc_pop', 'qc_platform'])
    genome_ht = genome_ht.annotate_globals(
        **hl.eval(genome_pop_platform_filter_ht.globals))
    genome_ht = genome_ht.annotate(
        **genome_pop_platform_filter_ht[genome_ht.key]).persist()

    # Annotate samples that fail their respective filters
    checkpoint = exome_ht.aggregate(
        hl.agg.count_where(hl.len(exome_ht.pop_platform_filters) == 0))
    logger.info(
        f'{checkpoint} exome samples found passing pop/platform-specific filtering'
    )
    exome_ht.key_by(data_type='exomes',
                    s=exome_ht.s).write(qc_ht_path('exomes', 'pop_platform'),
                                        args.overwrite)

    checkpoint = genome_ht.aggregate(
        hl.agg.count_where(hl.len(genome_ht.pop_platform_filters) == 0))
    logger.info(
        f'{checkpoint} genome samples found passing pop/platform-specific filtering'
    )
    genome_ht.key_by(data_type='genomes', s=genome_ht.s).write(
        qc_ht_path('genomes', 'pop_platform'), args.overwrite)
    mt_vqc_filtered = mt_vqc.filter_rows(
        (mt_vqc.variant_QC_Hail.call_rate >= 0.99)
        & (mt_vqc.variant_QC_Hail.AF[1] >= 0.05)
        & (mt_vqc.variant_QC_Hail.AF[1] <= 0.95))
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(hl.is_defined(
        bed_to_exclude_pca[mt_vqc_filtered.locus]),
                                                  keep=False)
    # overlap AKT dataset:

    mt_1kg_chr1_chr20 = hl.read_matrix_table(
        f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ancestry_work/1000g_chr1_20_AKT_overlap.mt"
    )
    #mt_vqc_filtered1 = mt_vqc_filtered.key_rows_by("locus")
    mt_1kg_chr1_chr20 = mt_1kg_chr1_chr20.key_rows_by("locus")
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(
        hl.is_defined(mt_1kg_chr1_chr20.rows()[mt_vqc_filtered.locus]))
    logger.info("done filtering writing mt")
    # ld pruning
    pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.2, bp_window_size=500000)
    #pruned_ht = hl.ld_prune(mt.GT, r2=0.1)
    pruned_mt = mt_vqc_filtered.filter_rows(
        hl.is_defined(pruned_ht[mt_vqc_filtered.row_key]))
    # remove pruned areas that need to be removed

    # autosomes only:
    pruned_mt = pruned_mt.filter_rows(pruned_mt.locus.in_autosome())

    pruned_mt.write(f"{tmp_dir}/ddd-elgh-ukbb/chr1_chr20_ldpruned_updated.mt",
                    overwrite=True)
    # pruned_mt = hl.read_matrix_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt")

    # related_samples_to_drop = hl.read_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_related_samples_to_remove.ht")
Example #15
    ```

    The last step is to filter the matrix table again by the pruned variants list. For this, <font color='red'>is_defined</font> is useful:

    ```python
    mt = mt.filter_rows(hl.is_defined(pruned_variants[mt.row_key]))
    ```

    Be sure to take a look at how pruning changes the number of variants in your dataset using the <font color='red'>count</font> function.
    """

with herzog.Cell("python"):
    # We added code to help you monitor the time it takes for pruning. We currently estimate over an hour.
    start_prune_write_time = time.time()
    pruned_variant_table = hl.ld_prune(mt.GT,
                                       r2=0.2,
                                       bp_window_size=500000,
                                       block_size=1024)
    elapsed_prune_write_time = time.time() - start_prune_write_time
    print(timedelta(seconds=elapsed_prune_write_time))

with herzog.Cell("python"):
    mt = mt.filter_rows(hl.is_defined(pruned_variant_table[mt.row_key]))

with herzog.Cell("markdown"):
    """
    ## Principal Component Analysis

    In this next section, we'll cover a method for easily visualizing and adjusting for population structure in an association analysis: Principal Component Analysis (PCA).

    You run PCA using the function <font color='red'>hwe_normalized_pca</font>. For this analysis, we are mainly interested in the scores, and can disregard the eigenvalues and loadings. The `k` parameter determines the number of PCs to return -- as `k` grows, so does the computation time.
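    """

with herzog.Cell("python"):
    # Hedged sketch of the PCA step described above: hwe_normalized_pca returns
    # eigenvalues, scores, and loadings; here only the scores table is kept.
    eigenvalues, scores, _ = hl.hwe_normalized_pca(mt.GT, k=10)
    scores.describe()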
Example #16
def determine_pca_variants(
    autosomes_only: bool = True,
    snv_only: bool = True,
    bi_allelic_only: bool = False,
    adj_only: bool = True,
    min_gnomad_v3_ac: Optional[int] = None,
    high_qual_ccdg_exome_interval_only: bool = False,
    high_qual_ukbb_exome_interval_only: bool = False,
    pct_samples_ukbb_exome_interval: float = 0.8,
    min_joint_af: float = 0.0001,  # TODO: Konrad mentioned that he might want to lower this
    min_joint_callrate: float = 0.95,
    min_inbreeding_coeff_threshold: Optional[float] = -0.8,
    min_hardy_weinberg_threshold: Optional[float] = 1e-8,
    min_ccdg_exome_callrate: float = 0.99,  # TODO: What parameter should this start with?
    min_ukbb_exome_callrate: float = 0.99,  # TODO: What parameter should this start with?
    filter_lcr: bool = True,
    filter_segdup: bool = True,
    ld_pruning: bool = True,
    ld_pruning_dataset: str = "ccdg_genomes",
    ld_r2: float = 0.1,
    read_per_dataset_checkpoint_if_exists: bool = False,
    read_pre_ld_prune_ht_checkpoint_if_exists: bool = False,
    read_pre_ld_prune_mt_checkpoint_if_exists: bool = False,
    overwrite: bool = True,
    filter_washu: bool = False,
) -> None:
    """
    Determine a diverse set of variants for relatedness/ancestry PCA using CCDG, gnomAD v3, and UK Biobank.

    :param autosomes_only: Whether to filter to variants in autosomes
    :param snv_only: Whether to filter to SNVs
    :param bi_allelic_only: Whether to filter to variants that are bi-allelic in either CCDG or gnomAD v3
    :param adj_only: If set, only ADJ genotypes (QD >= 2, FS <= 60 and MQ >= 30) are kept. This filter is applied before the call rate and AF calculation
    :param min_gnomad_v3_ac: Optional lower bound of AC for variants in gnomAD v3 genomes
    :param high_qual_ccdg_exome_interval_only: Whether to filter to high quality intervals in CCDG exomes
    :param high_qual_ukbb_exome_interval_only: Whether to filter to high quality intervals in UKBB 455K exomes
    :param pct_samples_ukbb_exome_interval: Minimum fraction of samples with over 20x coverage per interval required to keep the interval
    :param min_joint_af: Lower bound for combined MAF computed from CCDG and gnomAD v3 genomes
    :param min_joint_callrate: Lower bound for combined callrate computed from CCDG and gnomAD v3 genomes
    :param min_inbreeding_coeff_threshold: Minimum site inbreeding coefficient to keep. Not applied if set to `None`
    :param min_hardy_weinberg_threshold: Minimum site HW test p-value to keep. Not applied if set to `None`
    :param min_ccdg_exome_callrate: Lower bound for CCDG exomes callrate
    :param min_ukbb_exome_callrate: Lower bound for UKBB exomes callrate
    :param filter_lcr: Whether to filter LCR regions
    :param filter_segdup: Whether to filter Segdup regions
    :param ld_pruning: Whether to conduct LD pruning
    :param ld_pruning_dataset: Which dataset is used for LD pruning, 'ccdg_genomes' or 'gnomAD_genomes'
    :param ld_r2: LD pruning cutoff
    :param read_per_dataset_checkpoint_if_exists: Whether to read the CCDG exome/genome pre filtered HT if it exists.
        Each dataset possible filtered to: autosomes only, SNVs only, gnomAD v3.1.2 AC filter, CCDG high quality exome
        intervals, and UK Biobank high quality exome intervals
    :param read_pre_ld_prune_ht_checkpoint_if_exists: Whether to read in the PCA variant HT with no LD-pruning if it exists
    :param read_pre_ld_prune_mt_checkpoint_if_exists: Whether to read in the checkpointed MT filtered to variants in the
        PCA variant HT with no LD-pruning if it exists
    :param overwrite: Whether to overwrite the final variant HT
    :param filter_washu: Whether to filter out washU samples
    :return: Table with desired variants for PCA
    """
    if not read_pre_ld_prune_ht_checkpoint_if_exists:
        logger.info(
            "Loading gnomAD v3.1.2 release HT and UK Biobank 455K release HT ..."
        )
        flag = "_without_washu" if filter_washu else ""
        gnomad_ht = gnomad_public_release("genomes").ht()
        gnomad_ht = gnomad_ht.select(
            gnomad_was_split=gnomad_ht.was_split,
            gnomad_AC=gnomad_ht.freq[0].AC,
            gnomad_AN=gnomad_ht.freq[0].AN,
            gnomad_genomes_site_inbreeding_coeff=gnomad_ht.info.InbreedingCoeff,
            gnomad_genomes_homozygote_count=gnomad_ht.freq[0].homozygote_count,
        )
        if min_hardy_weinberg_threshold is not None:
            gnomad_ht = gnomad_ht.annotate(
                gnomad_genomes_hwe=hl.hardy_weinberg_test(
                    hl.int32(
                        (gnomad_ht.gnomad_AN / 2)
                        - gnomad_ht.gnomad_genomes_homozygote_count
                        - (
                            gnomad_ht.gnomad_AC
                            - (gnomad_ht.gnomad_genomes_homozygote_count * 2)
                        )
                    ),  # Num hom ref genotypes
                    hl.int32(
                        (
                            gnomad_ht.gnomad_AC
                            - (gnomad_ht.gnomad_genomes_homozygote_count * 2)
                        )
                    ),  # Num het genotypes
                    gnomad_ht.gnomad_genomes_homozygote_count,  # Num hom alt genotypes
                ),
            )

        ukbb_ht = hl.read_table(ukbb_release_ht_path("broad", 7))
        ukbb_ht = ukbb_ht.select(
            ukbb_AC=ukbb_ht.freq[0].AC,
            ukbb_AN=ukbb_ht.freq[0].AN,
        )
        ukbb_meta_ht = hl.read_table(ukbb_meta_ht_path("broad", 7))

        # Only count samples used in the UK Biobank exome frequency calculations
        ukbb_exome_count = ukbb_meta_ht.filter(
            ukbb_meta_ht.sample_filters.high_quality
            & hl.is_defined(ukbb_meta_ht.ukbb_meta.batch)
            & ~ukbb_meta_ht.sample_filters.related
        ).count()

        logger.info("Getting CCDG genome and exome sample counts...")
        ccdg_genome_count = get_ccdg_vds(
            "genomes", filter_washu=filter_washu
        ).variant_data.count_cols()
        logger.info(f"Number of CCDG genome samples: {ccdg_genome_count}...")
        ccdg_exome_count = get_ccdg_vds("exomes").variant_data.count_cols()
        logger.info(f"Number of CCDG exome samples: {ccdg_exome_count} ...")

        def _initial_filter(data_type):
            """
            Get Table of CCDG variants passing desired filters.

            Possible filters are:
                - Autosomes only
                - SNVs only
                - gnomAD v3.1.2 AC filter
                - CCDG high quality exome intervals
                - UK Biobank high quality exome intervals

            After densification of the VDS, rows are annotated with:
                - ccdg_{data_type}_was_split
                - ccdg_{data_type}_AC
                - ccdg_{data_type}_AN

            The filtered and annotated rows are returned as a Table and are also checkpointed
            :param data_type: Whether data is from genomes or exomes

            :return: Table of CCDG filtered variants
            """
            logger.info(
                "Loading CCDG %s VDS and splitting multi-allelics for initial filtering steps...",
                data_type,
            )
            vds = get_ccdg_vds(data_type, filter_washu=filter_washu)
            logger.info(
                f"{vds.variant_data.count_cols()} CCDG {data_type} samples loaded..."
            )
            vds = hl.vds.split_multi(vds)

            if autosomes_only:
                logger.info("Filtering CCDG %s VDS to autosomes...", data_type)
                vds = hl.vds.filter_chromosomes(vds, keep_autosomes=True)

            ht = vds.variant_data.rows()
            variant_filter_expr = True
            if snv_only:
                logger.info("Filtering CCDG %s VDS to SNVs...", data_type)
                variant_filter_expr &= hl.is_snp(ht.alleles[0], ht.alleles[1])

            if min_gnomad_v3_ac:
                logger.info(
                    "Filtering CCDG %s VDS to gnomAD v3.1.2 variants with adj-filtered AC > %d...",
                    data_type,
                    min_gnomad_v3_ac,
                )
                variant_filter_expr &= gnomad_ht[ht.key].gnomad_AC > min_gnomad_v3_ac

            vds = hl.vds.filter_variants(vds, ht.filter(variant_filter_expr), keep=True)

            if high_qual_ccdg_exome_interval_only:
                logger.info(
                    f"Filtering CCDG %s VDS to high quality (>80%% of samples with %dX coverage) CCDG exome intervals...",
                    data_type,
                    INTERVAL_DP,
                )
                interval_qc_ht = hl.read_table(
                    get_ccdg_results_path(
                        data_type="exomes", result=f"intervals_{INTERVAL_DP}x"
                    )
                )
                interval_qc_ht = interval_qc_ht.filter(interval_qc_ht.to_keep)
                vds = hl.vds.filter_intervals(
                    vds, intervals=interval_qc_ht.interval.collect(), keep=True
                )

            if high_qual_ukbb_exome_interval_only:
                if not autosomes_only:
                    raise ValueError(
                        "UK Biobank interval QC filtering is only available for autosomes!"
                    )

                logger.info(
                    "Filtering CCDG %s VDS to high quality (>80%% of samples with 20X coverage) UK Biobank exome intervals...",
                    data_type,
                )
                interval_qc_ht = hl.read_table(
                    ukbb_interval_qc_path("broad", 7, "autosomes")
                )  # Note: freeze 7 is all included in gnomAD v4
                interval_qc_ht = interval_qc_ht.filter(
                    interval_qc_ht["pct_samples_20x"] > pct_samples_ukbb_exome_interval
                )
                vds = hl.vds.filter_intervals(
                    vds, intervals=interval_qc_ht.interval.collect(), keep=True
                )

            logger.info("Densifying filtered CCDG %s VDS...", data_type)
            mt = hl.vds.to_dense_mt(vds)
            if adj_only:
                mt = filter_to_adj(mt)

            annotation_expr = {
                f"ccdg_{data_type}_was_split": mt.was_split,
                f"ccdg_{data_type}_AC": hl.agg.sum(mt.GT.n_alt_alleles()),
                f"ccdg_{data_type}_AN": hl.agg.count_where(hl.is_defined(mt.GT)) * 2,
            }

            if min_inbreeding_coeff_threshold is not None:
                annotation_expr[
                    f"ccdg_{data_type}_site_inbreeding_coeff"
                ] = bi_allelic_site_inbreeding_expr(mt.GT)
            if min_hardy_weinberg_threshold is not None:
                annotation_expr[f"ccdg_{data_type}_hwe"] = hl.agg.hardy_weinberg_test(
                    mt.GT
                )

            mt = mt.annotate_rows(**annotation_expr)
            ht = mt.rows().checkpoint(
                get_ccdg_results_path(
                    data_type=data_type,
                    mt=False,
                    result=f"pre_filtered_variants_interval{INTERVAL_DP}x{flag}",
                ),
                overwrite=(not read_per_dataset_checkpoint_if_exists),
                _read_if_exists=read_per_dataset_checkpoint_if_exists,
            )

            return ht

        logger.info(
            "Creating Table with joint gnomAD v3.1.2 and CCDG genome allele frequencies and callrate...",
        )
        ccdg_genomes_ht = _initial_filter("genomes")
        ccdg_exomes_ht = _initial_filter("exomes")
        ht = ccdg_exomes_ht.join(ccdg_genomes_ht, how="inner")
        ht = ht.annotate(**gnomad_ht[ht.key], **ukbb_ht[ht.key])
        ht = ht.annotate(
            joint_biallelic=(~ht.ccdg_genomes_was_split) | (~ht.gnomad_was_split),
            joint_AC=ht.ccdg_genomes_AC + ht.gnomad_AC,
            joint_AN=ht.ccdg_genomes_AN + ht.gnomad_AN,
        )
        total_genome_an = hl.eval(
            (gnomad_ht.freq_sample_count[0] + ccdg_genome_count) * 2
        )
        ht = ht.annotate(
            joint_AF=ht.joint_AC / ht.joint_AN,
            joint_callrate=ht.joint_AN / total_genome_an,
        )
        ht = ht.checkpoint(
            f"{get_joint_pca_variants_ht_path(filter_washu=filter_washu)}",
            overwrite=(not read_pre_ld_prune_ht_checkpoint_if_exists),
            _read_if_exists=read_pre_ld_prune_ht_checkpoint_if_exists,
        )

        logger.info(
            "Filtering variants to combined gnomAD v3.1.2 and CCDG genome AF of %.3f and callrate of %.2f, CCDG exome callrate "
            "of %.2f, and UK Biobank exome callrate of %.2f....",
            min_joint_af,
            min_joint_callrate,
            min_ccdg_exome_callrate,
            min_ukbb_exome_callrate,
        )

        variant_filter_expr = True
        if bi_allelic_only:
            variant_filter_expr &= ht.joint_biallelic
        if min_inbreeding_coeff_threshold is not None:
            variant_filter_expr &= (
                ht.ccdg_genomes_site_inbreeding_coeff > min_inbreeding_coeff_threshold
            ) & (
                ht.gnomad_genomes_site_inbreeding_coeff > min_inbreeding_coeff_threshold
            )
        if min_hardy_weinberg_threshold is not None:
            variant_filter_expr &= (
                ht.ccdg_genomes_hwe.p_value > min_hardy_weinberg_threshold
            ) & (ht.gnomad_genomes_hwe.p_value > min_hardy_weinberg_threshold)

        variant_filter_expr &= (
            (ht.joint_AF > min_joint_af)
            & (ht.joint_callrate > min_joint_callrate)
            & (ht.ccdg_exomes_AN / (ccdg_exome_count * 2) > min_ccdg_exome_callrate)
            & (ht.ukbb_AN / (ukbb_exome_count * 2) > min_ukbb_exome_callrate)
        )

        ht = ht.filter(variant_filter_expr)

        ht = ht.annotate_globals(
            autosomes_only=autosomes_only,
            snv_only=snv_only,
            adj_only=adj_only,
            bi_allelic_only=bi_allelic_only,
            min_gnomad_v3_ac=min_gnomad_v3_ac,
            high_qual_ccdg_exome_interval_only=high_qual_ccdg_exome_interval_only,
            high_qual_ukbb_exome_interval_only=high_qual_ukbb_exome_interval_only,
            filter_lcr=filter_lcr,
            filter_segdup=filter_segdup,
            min_af=min_joint_af,
            min_callrate=min_joint_callrate,
            min_ccdg_exome_callrate=min_ccdg_exome_callrate,
            min_ukbb_exome_callrate=min_ukbb_exome_callrate,
            min_inbreeding_coeff_threshold=min_inbreeding_coeff_threshold,
            min_hardy_weinberg_threshold=min_hardy_weinberg_threshold,
        )

        ht = filter_low_conf_regions(
            ht,
            filter_lcr=filter_lcr,
            filter_decoy=False,  # No decoy for GRCh38
            filter_segdup=filter_segdup,
        )

        ht = ht.checkpoint(
            get_pca_variants_path(ld_pruned=False, filter_washu=filter_washu),
            overwrite=True,
        )
    else:
        ht = hl.read_table(
            get_pca_variants_path(
                ld_pruned=False, data=ld_pruning_dataset, filter_washu=filter_washu
            )
        )

    if ld_pruning:
        # Is this step still required?
        logger.warning(
            "The LD-prune step of this function requires non-preemptible workers only!"
        )
        logger.info("Creating Table after LD pruning of %s...", ld_pruning_dataset)
        if ld_pruning_dataset == "ccdg_genomes":
            vds = get_ccdg_vds("genomes")
            vds = hl.vds.split_multi(vds, filter_changed_loci=True)
            vds = hl.vds.filter_variants(vds, ht, keep=True)
            mt = hl.vds.to_dense_mt(vds)
        elif ld_pruning_dataset == "gnomad_genomes":
            mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
            logger.info("Converting gnomAD v3.1 MatrixTable to VDS...")
            mt = mt.select_entries(
                "END", "LA", "LGT", adj=get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD)
            )
            vds = hl.vds.VariantDataset.from_merged_representation(mt)

            logger.info("Performing split-multi and filtering variants...")
            vds = hl.vds.split_multi(vds, filter_changed_loci=True)
            vds = hl.vds.filter_variants(vds, ht)

            logger.info("Densifying data...")
            mt = hl.vds.to_dense_mt(vds)
        else:
            raise ValueError(
                "Only options for LD pruning are `ccdg_genomes` and `gnomad_genomes`"
            )

        hl._set_flags(no_whole_stage_codegen="1")
        mt = mt.checkpoint(
            get_pca_variants_path(ld_pruned=False, data=ld_pruning_dataset, mt=True),
            overwrite=(not read_pre_ld_prune_mt_checkpoint_if_exists),
            _read_if_exists=read_pre_ld_prune_mt_checkpoint_if_exists,
        )
        hl._set_flags(no_whole_stage_codegen=None)
        ht = hl.ld_prune(mt.GT, r2=ld_r2)
        ht = ht.annotate_globals(ld_r2=ld_r2, ld_pruning_dataset=ld_pruning_dataset)
        ht = ht.checkpoint(
            get_pca_variants_path(ld_pruned=True, data=ld_pruning_dataset),
            overwrite=overwrite,
            _read_if_exists=(not overwrite),
        )
        mt = mt.filter_rows(hl.is_defined(ht[mt.row_key]))
        mt.naive_coalesce(1000).write(
            get_pca_variants_path(ld_pruned=True, data=ld_pruning_dataset, mt=True),
            overwrite=overwrite,
        )
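# Hedged usage sketch for determine_pca_variants above, keeping most defaults;
# ld_pruning_dataset and ld_r2 simply restate the signature's default values.
determine_pca_variants(ld_pruning_dataset="ccdg_genomes", ld_r2=0.1,
                       overwrite=True)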
Example #17
vds = vds.filter_entries(
    ((vds.locus.contig != "chrX") | (vds.locus.contig != "chrY")) &
    (((vds.AD[0] + vds.AD[1]) / vds.DP < 0.9) |
     (vds.GT.is_hom_ref() & ((vds.AD[0] / vds.DP < 0.9) | (vds.GQ < 20))) |
     (vds.GT.is_het() & ((vds.AD[1] / vds.DP < 0.20) | (vds.PL[0] < 20))) |
     (vds.GT.is_hom_var() & ((vds.AD[1] / vds.DP < 0.9) |
                             (vds.PL[0] < 20))) | (vds.DP > 200)),
    keep=False)
vds = hl.variant_qc(vds)

vds = vds.filter_rows(
    (vds.locus.contig == "chrX") | (vds.locus.contig == "chrY") |
    ((vds.info.QD > 4) & (vds.variant_qc.callRate > 0.99) &
     (vds.variant_qc.dpMean > 20) & (vds.variant_qc.AF > 0.05) &
     (vds.filters.size() == 0) & (vds.variant_qc.AF < 0.95)),
    keep=True)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# impute sex
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

vds = vds.filter_rows(hl.is_defined(par[vds.locus]), keep=False)
ct = hl.impute_sex(vds.GT, female_threshold=0.6, male_threshold=0.7)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# LD prune
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

vds_ldpruned = hl.ld_prune(vds, 40)
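# The call above follows an older ld_prune signature. A hedged sketch of the
# Hail 0.2 equivalent, assuming an r2 threshold of 0.2 on the GT entry field:
pruned_variants = hl.ld_prune(vds.GT, r2=0.2, bp_window_size=500000)
vds_ldpruned = vds.filter_rows(hl.is_defined(pruned_variants[vds.row_key]))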
Example #18
def main(args):
    hl.init(log='/subpops.log')

    if args.population == 'all':
        pcs = list(range(1, 7))
    elif args.population == 'eur':
        pcs = [1, 2, 3]
    elif args.population == 'eas':
        pcs = [1, 2]
    else:
        pcs = [1, 2]

    if not args.skip_filtering:
        pruned_mt = hl.read_matrix_table(qc_mt_path('joint', ld_pruned=True))
        exome_project_table = hl.read_table(
            qc_ht_path('exomes', 'hard_filters')).select('project_id')
        exome_platform_table = hl.read_table(qc_ht_path(
            'exomes', 'platforms')).select('qc_platform')
        exome_table = exome_project_table.annotate(qc_platform=hl.str(
            exome_platform_table[exome_project_table.key].qc_platform))
        genome_table = hl.read_table(qc_ht_path(
            'genomes', 'hard_filters')).select('project_id', 'qc_platform')
        joint_table = exome_table.union(genome_table)
        exome_pop_table = hl.read_table(qc_ht_path(
            'exomes', 'pop_platform')).select('pop')
        genome_pop_table = hl.read_table(qc_ht_path(
            'genomes', 'pop_platform')).select('pop')
        pop_table = exome_pop_table.union(genome_pop_table)
        pop_table = pop_table.annotate(
            project_id=joint_table[pop_table.key].project_id,
            qc_platform=joint_table[pop_table.key].qc_platform)
        pruned_mt = pruned_mt.annotate_cols(meta=pop_table[pruned_mt.col_key])
        variants, samples = pruned_mt.count()
        logger.info(
            f'{samples} samples, {variants} variants found in original joint MT'
        )

        if args.population == 'all':
            sample_criteria = True
        elif args.population == 'eur':
            sample_criteria = ((pruned_mt.meta.pop == "nfe")
                               | (pruned_mt.meta.pop == "fin"))
        elif args.population == 'eas':
            sample_criteria = ((pruned_mt.meta.pop == "eas")
                               & (pruned_mt.data_type == "exomes"))
        else:
            sample_criteria = pruned_mt.meta.pop == args.population

        pruned_mt = pruned_mt.filter_cols(sample_criteria)
        variants, samples = pruned_mt.count()
        logger.info(
            f'{samples} samples, {variants} variants found in {args.population} in joint MT'
        )

        pca_mt, related_mt = split_mt_by_relatedness(pruned_mt)

        # Filter variants by callrate on each platform
        pca_platforms_mt = pca_mt.group_cols_by(
            pca_mt.meta.qc_platform).aggregate(missing=hl.agg.count_where(
                hl.is_missing(pca_mt.GT)),
                                               total=hl.agg.count())
        # All variants must have a callrate at least .999 in each platform, or no more than 1 missing sample if platform <= 1000 samples
        pca_platforms_mt = pca_platforms_mt.annotate_entries(
            remove_variant=(hl.case().when(
                pca_platforms_mt.total > 1000, pca_platforms_mt.missing /
                pca_platforms_mt.total > 0.001).default(
                    pca_platforms_mt.missing > 1)))
        pca_platforms_mt = pca_platforms_mt.filter_rows(hl.agg.any(
            pca_platforms_mt.remove_variant),
                                                        keep=False)
        pca_mt = pca_mt.filter_rows(
            (hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2 > 0.001)
            & hl.is_defined(pca_platforms_mt.rows()[pca_mt.row_key]))
        variants, samples = pca_mt.count()
        logger.info(
            f'{samples} samples, {variants} variants found in {args.population} in PCA MT after filtering variants by AF and platform callrate'
        )

        pca_pruned = hl.ld_prune(pca_mt.GT, r2=0.1)
        pca_mt = pca_mt.filter_rows(hl.is_defined(pca_pruned[pca_mt.row_key]))
        related_mt = related_mt.filter_rows(
            hl.is_defined(pca_mt.rows()[related_mt.row_key]))
        pca_mt.write(
            f"{qc_temp_data_prefix('joint')}.{args.population}.unrelated.filtered.mt",
            args.overwrite)
        related_mt.write(
            f"{qc_temp_data_prefix('joint')}.{args.population}.related.filtered.mt",
            args.overwrite)

    pca_mt = hl.read_matrix_table(
        f"{qc_temp_data_prefix('joint')}.{args.population}.unrelated.filtered.mt"
    )
    related_mt = hl.read_matrix_table(
        f"{qc_temp_data_prefix('joint')}.{args.population}.related.filtered.mt"
    )

    variants, samples = pca_mt.count()
    logger.info(
        f'{samples} samples after removing relateds, {variants} variants after filtering and LD pruning'
    )

    if not args.skip_pop_pca:
        pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
            pca_mt.GT, k=10, compute_loadings=True)
        pca_mt = pca_mt.annotate_rows(
            pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2)
        pca_loadings = pca_loadings.annotate(
            pca_af=pca_mt.rows()[pca_loadings.key].pca_af)
        pca_scores.write(ancestry_pca_scores_ht_path(args.population),
                         args.overwrite)
        pca_loadings.write(ancestry_pca_loadings_ht_path(args.population),
                           args.overwrite)

    pca_scores = hl.read_table(ancestry_pca_scores_ht_path(args.population))
    pca_loadings = hl.read_table(ancestry_pca_loadings_ht_path(
        args.population))
    pca_mt = pca_mt.annotate_cols(scores=pca_scores[pca_mt.col_key].scores)

    variants, samples = related_mt.count()
    logger.info(f'Projecting population PCs for {samples} related samples...')
    related_ht = pc_project(related_mt, pca_loadings)
    related_mt = related_mt.annotate_cols(
        scores=related_ht[related_mt.col_key].scores)

    logger.info('Assigning population annotations...')
    pop_colnames = ['related', 'known_pop', 'scores']
    # Join MTs, then annotate with known pops, then select out columns we care about, then assign pop pcs
    joint_ht = pca_mt.cols().union(related_mt.cols())
    joint_ht = get_known_populations(joint_ht, args.population)
    joint_ht = joint_ht.select(
        *pop_colnames, **{f"PC{i + 1}": joint_ht.scores[i]
                          for i in range(10)})
    joint_pca_ht, joint_pca_fit = run_assign_population_pcs(
        joint_ht,
        f'{qc_temp_data_prefix("joint")}.RF_pop_assignments.{args.population}.txt.bgz',
        f'{qc_temp_data_prefix("joint")}.RF_fit.{args.population}.pkl',
        pcs=pcs)
    joint_pca_ht = joint_pca_ht.annotate(
        pop=hl.cond(joint_pca_ht.pop == 'oth',
                    hl.literal(f'o{args.population[:2]}'), joint_pca_ht.pop))
    joint_ht = joint_ht.select(**{
        f"subpop_{args.population}_PC{i}": joint_ht[f"PC{i}"]
        for i in range(1, 11)
    },
                               subpop=joint_pca_ht[joint_ht.key].pop,
                               known_subpop=joint_pca_ht[
                                   joint_ht.key].known_pop)
    joint_ht.write(subpop_ht_path(args.population), args.overwrite)
Example #19
vdsx = vdsnopar.filter_rows((vdsnopar.locus.contig == "chrX")
                            & (vdsnopar.variant_qc.AF >= 0.05)
                            & (vdsnopar.variant_qc.AF <= 0.95))
ct = hl.impute_sex(vdsx.GT, female_threshold=0.6, male_threshold=0.7)
vdsct = vdsnopar.cols()
ct = ct.annotate(ydp=vdsct[ct.s].ydp)

(ct.select(ID=ct.s, sexFstat=ct.f_stat, isFemale=ct.is_female,
           ydp=ct.ydp).export(sample_sex_fstat_file))
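
# The 0.6/0.7 F-statistic thresholds above are looser than Hail's defaults (0.2/0.8).
# A quick sanity check is to plot the chrX F-statistic and confirm the cohort is
# bimodal before committing to thresholds (sketch; assumes the `ct` table from above):
p_fstat = hl.plot.histogram(ct.f_stat, range=(-0.5, 1.1), bins=50,
                            title='chrX inbreeding coefficient (F)')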

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ld pruning
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

print("LD pruning...")
# Hail 0.2: ld_prune takes a call expression and returns a table of variants to
# keep (the old n_cores argument is gone)
pruned_common_ht = hl.ld_prune(vds5.GT, r2=0.1)
vds5_ldp = vds5.filter_rows(hl.is_defined(pruned_common_ht[vds5.row_key]))
#pruned_common_ht = hl.ld_prune(vds5.GT, r2=0.2, bp_window_size=1000000, memory_per_core=512)

print("writing LD pruned VDS...")
vds5_ldp.write(vds_ldpruned_common_file, overwrite=True)
hl.export_plink(vds5_ldp,
                vds_ldpruned_common_plink,
                fam_id=vds5_ldp.s,
                ind_id=vds5_ldp.s)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# IBD analysis
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# use KING until pc_relate works
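# Hail ships a KING implementation that can serve as that stop-gap. A minimal
# sketch, assuming an LD-pruned MatrixTable `pruned_mt` with a GT entry field;
# the suffixed sample key name `s_1` in the entries table is an assumption:
kinship = hl.king(pruned_mt.GT)  # sample-by-sample MT with a 'phi' kinship entry
kin_pairs = kinship.entries()
# drop self-pairs (phi ~ 0.5) and keep pairs related at 3rd degree or closer
kin_pairs = kin_pairs.filter((kin_pairs.s != kin_pairs.s_1)
                             & (kin_pairs.phi > 0.125))
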
def main(args):

    bed_to_exclude_pca = hl.import_bed(locations_exclude_from_pca,
                                       reference_genome='GRCh38')
    cohorts_pop = hl.import_table(cohorts_populations,
                                  delimiter="\t").key_by('s')

    # overlap AKT dataset
    overlap_1kg_AKT = hl.import_matrix_table(AKT_overlap)
    # drop cohorts
    # annotate with cohorts and populations from s3 table.
    # save matrixtable
    mt = hl.read_matrix_table(args.matrixtable)
    mt = mt.annotate_cols(cohort=cohorts_pop[mt.s].cohort)
    mt = mt.annotate_cols(original_pop=cohorts_pop[mt.s].known_population)
    mt = mt.annotate_cols(known_pop=cohorts_pop[mt.s].known_population_updated)
    # mt = mt.annotate_cols(superpopulation=cohorts_pop[mt.s].superpopulation)
    mt = mt.annotate_cols(gVCF=cohorts_pop[mt.s].gVCF_ID)
    mt.write(
        f"{args.output_dir}/ddd-elgh-ukbb/Sanger_chr1-20-XY_new_cohorts_split_multi_pops.mt",
        overwrite=True)
    logger.info("wrote mt")
    # filter the MatrixTable to SNPs only
    mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_mnp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_indel(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_complex(mt.alleles[0], mt.alleles[1]))
    mt_vqc = hl.variant_qc(mt, name='variant_QC_Hail')
    # HWE filter (variant_QC_Hail.p_value_hwe >= 1e-6) deliberately omitted, per hcm.
    mt_vqc_filtered = mt_vqc.filter_rows(
        (mt_vqc.variant_QC_Hail.call_rate >= 0.99)
        & (mt_vqc.variant_QC_Hail.AF[1] >= 0.05)
        & (mt_vqc.variant_QC_Hail.AF[1] <= 0.95))
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(hl.is_defined(
        bed_to_exclude_pca[mt_vqc_filtered.locus]),
                                                  keep=False)
    # overlap AKT dataset:
    # overlap_1kg_AKT
    # mt_1kg_chr1_chr20 = hl.read_matrix_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ancestry_work/1000g_chr1_20_AKT_overlap.mt")
    overlap_1kg_AKT = overlap_1kg_AKT.key_rows_by("locus")
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(
        hl.is_defined(overlap_1kg_AKT.rows()[mt_vqc_filtered.locus]))
    logger.info("done filtering writing mt")
    # ld pruning
    pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.2, bp_window_size=500000)
    #pruned_ht = hl.ld_prune(mt.GT, r2=0.1)
    pruned_mt = mt_vqc_filtered.filter_rows(
        hl.is_defined(pruned_ht[mt_vqc_filtered.row_key]))

    # restrict the pruned variants to autosomes:
    pruned_mt = pruned_mt.filter_rows(pruned_mt.locus.in_autosome())

    pruned_mt.write(
        f"{args.output_dir}/ddd-elgh-ukbb/chr1_chr20_ldpruned_updated.mt",
        overwrite=True)
    # pruned_mt = hl.read_matrix_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt")

    # related_samples_to_drop = hl.read_table(
    #    f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_related_samples_to_remove.ht")

    logger.info("run_pca_with_relateds")
    # pca_evals, pca_scores, pca_loadings = run_pca_with_relateds(
    #    pruned_mt, related_samples_to_drop, autosomes_only=True)
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        pruned_mt.GT, k=10, compute_loadings=True)
    pca_scores = pca_scores.annotate(
        known_pop=pruned_mt.cols()[pca_scores.s].known_pop)
    # mt = mt.annotate_cols(
    #    loadings=pca_loadings[mt_vqc_filtered.col_key].loadings)
    # mt = mt.annotate_cols(known_pop="unk")
    # pca_scores = pca_scores.annotate(known_pop="unk")
    pca_scores.write(
        f"{args.output_dir}/ddd-elgh-ukbb/pca_scores_after_pruning.ht",
        overwrite=True)
    pca_loadings.write(
        f"{args.output_dir}/ddd-elgh-ukbb/pca_loadings_after_pruning.ht",
        overwrite=True)
    with open(f"{args.output_dir}/ddd-elgh-ukbb/pca_evals_after_pruning.txt",
              'w') as f:
        for val in pca_evals:
            f.write(f"{val}\n")  # one eigenvalue per line

    logger.info("assign population pcs")

    pop_ht, pop_clf = assign_population_pcs(pca_scores,
                                            pca_scores.scores,
                                            known_col="known_pop",
                                            n_estimators=100,
                                            prop_train=0.8,
                                            min_prob=0.5)
    pop_ht.write(f"{args.output_dir}/ddd-elgh-ukbb/pop_assignments_updated.ht",
                 overwrite=True)
    pop_ht.export(
        f"{args.output_dir}/ddd-elgh-ukbb/pop_assignments_updated.tsv.gz")
Example No. 21
def main(args):

    # Start Hail
    hl.init(default_reference=args.default_reference)

    if not args.skip_filter_step:
        logger.info("Importing data...")

        # import unfiltered MT
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=args.exome_cohort,
                           part='unphase_adj_genotypes',
                           split=True))

        # filter to samples passing QC filters
        logger.info(
            "Filtering MT to samples passing QC filters (hard filters, relatedness, European ancestries)..."
        )
        sample_qc_ht = hl.read_table(get_sample_qc_ht_path(part='final_qc'))
        sample_qc_ht = (sample_qc_ht.filter(sample_qc_ht.pass_filters))
        mt = (mt.filter_cols(hl.is_defined(sample_qc_ht[mt.col_key])))

        logger.info(
            "Filtering joint MT to bi-allelic, high-callrate, common SNPs...")
        maf = args.maf_threshold
        mt = (mt.filter_rows(
            bi_allelic_expr(mt) & hl.is_snp(mt.alleles[0], mt.alleles[1])
            & (hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > maf)
            & (hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99)).naive_coalesce(
                500))

        logger.info("Checkpoint: writing filtered MT before LD pruning...")
        mt = mt.checkpoint(get_mt_checkpoint_path(
            dataset=args.exome_cohort,
            part='high_callrate_common_snp_biallelic'),
                           overwrite=args.overwrite)

        logger.info(
            f"Running ld_prune with r2 = {args.ld_prune_r2} on MT with {mt.count_rows()} variants..."
        )
        # remove correlated variants
        pruned_variant_table = hl.ld_prune(mt.GT,
                                           r2=args.ld_prune_r2,
                                           bp_window_size=500000,
                                           memory_per_core=512)
        mt = (mt.filter_rows(hl.is_defined(pruned_variant_table[mt.row_key])))

        logger.info("Writing filtered MT with ld-pruned variants...")
        (mt.write(get_qc_mt_path(dataset=args.exome_cohort,
                                 part='high_callrate_common_snp_biallelic',
                                 split=True,
                                 ld_pruned=True),
                  overwrite=args.overwrite))

    logger.info("Importing filtered ld-pruned MT...")
    mt = hl.read_matrix_table(
        get_qc_mt_path(dataset=args.exome_cohort,
                       part='high_callrate_common_snp_biallelic',
                       split=True,
                       ld_pruned=True))

    logger.info(f"Running PCA on {mt.count_rows()} variants...")
    # run pca on merged dataset
    eigenvalues, pc_scores, _ = hl.hwe_normalized_pca(mt.GT, k=args.n_pcs)

    logger.info(f"Eigenvalues: {eigenvalues}")

    # Annotate eigenvalues as global field
    pc_scores = (pc_scores.annotate_globals(**{'eigenvalues': eigenvalues}))

    # Annotate PC array as independent fields.
    pca_table = (pc_scores.annotate(
        **{'PC' + str(k + 1): pc_scores.scores[k]
           for k in range(args.n_pcs)}).drop('scores'))

    logger.info(f"Writing HT with PCA results...")
    # write as HT
    output_ht_path = args.output_ht
    pca_table = (pca_table.checkpoint(output=output_ht_path,
                                      overwrite=args.overwrite))

    if args.write_to_file:
        (pca_table.export(f'{output_ht_path}.tsv.bgz'))

    # Stop Hail
    hl.stop()

    print("PCA pipeline finalised...")
def main(args):
    mt = hl.read_matrix_table(args.matrixtable)
    # ld pruning
    pruned_ht = hl.ld_prune(mt.GT, r2=0.1)
    pruned_mt = mt.filter_rows(hl.is_defined(pruned_ht[mt.row_key]))
    pruned_mt.write(f"{args.output_dir}/mt_ldpruned.mt", overwrite=True)

    # PC relate
    pruned_mt = pruned_mt.select_entries(
        GT=hl.unphased_diploid_gt_index_call(pruned_mt.GT.n_alt_alleles()))

    eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT,
                                           k=10,
                                           compute_loadings=False)
    scores.write(f"{args.output_dir}/mt_pruned.pca_scores.ht", overwrite=True)

    relatedness_ht = hl.pc_relate(pruned_mt.GT,
                                  min_individual_maf=0.05,
                                  scores_expr=scores[pruned_mt.col_key].scores,
                                  block_size=4096,
                                  min_kinship=0.05,
                                  statistics='kin2')
    relatedness_ht.write(f"{args.output_dir}/mt_relatedness.ht",
                         overwrite=True)
    pairs = relatedness_ht.filter(relatedness_ht['kin'] > 0.125)
    related_samples_to_remove = hl.maximal_independent_set(pairs.i,
                                                           pairs.j,
                                                           keep=False)
    related_samples_to_remove.write(
        f"{args.output_dir}/mt_related_samples_to_remove.ht", overwrite=True)

    pca_mt = pruned_mt.filter_cols(hl.is_defined(
        related_samples_to_remove[pruned_mt.col_key]),
                                   keep=False)
    related_mt = pruned_mt.filter_cols(hl.is_defined(
        related_samples_to_remove[pruned_mt.col_key]),
                                       keep=True)

    variants, samples = pca_mt.count()

    print(f"{samples} samples after relatedness step.")

    # Population pca

    plink_mt = pca_mt.annotate_cols(uid=pca_mt.s).key_cols_by('uid')
    hl.export_plink(plink_mt,
                    f"{args.output_dir}/mt_unrelated.plink",
                    fam_id=plink_mt.uid,
                    ind_id=plink_mt.uid)
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        pca_mt.GT, k=20, compute_loadings=True)
    pca_af_ht = pca_mt.annotate_rows(
        pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2).rows()
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_af_ht[pca_loadings.key].pca_af)
    pca_scores.write(f"{args.output_dir}/mt_pca_scores.ht", overwrite=True)
    pca_loadings.write(f"{args.output_dir}/mt_pca_loadings.ht", overwrite=True)

    pca_mt = pca_mt.annotate_cols(scores=pca_scores[pca_mt.col_key].scores)

    variants, samples = related_mt.count()
    print(
        'Projecting population PCs for {} related samples...'.format(samples))
    #related_scores = pc_project(related_mt, pca_loadings)
    #relateds = related_mt.cols()
    #relateds = relateds.annotate(scores=related_scores[relateds.key].scores)

    pca_mt.write(f"{args.output_dir}/mt_pca.mt", overwrite=True)
    p = hl.plot.scatter(pca_mt.scores[0],
                        pca_mt.scores[1],
                        title='PCA',
                        xlabel='PC1',
                        ylabel='PC2')
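    # output_file/save are bokeh helpers; assumes `from bokeh.io import output_file, save`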
    output_file(f"{args.plot_dir}/pca.html")
    save(p)
Example No. 23
def main(args):
    # Start Hail
    hl.init(default_reference=args.default_reference)

    if not args.skip_filter_step:
        logger.info("Importing data...")

        # import unfiltered MT
        mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered')

        # Read MT from 1kgenome and keep only locus defined in interval
        mt_1kg = get_1kg_mt(args.default_reference)

        # Joining dataset (inner join). Keep only 'GT' entry field
        mt_joint = (mt.select_entries('GT').union_cols(
            mt_1kg.select_entries('GT'), row_join_type='inner'))

        logger.info(
            "Filtering joint MT to bi-allelic, high-callrate, common SNPs...")
        mt_joint = (mt_joint.filter_rows(
            bi_allelic_expr(mt_joint)
            & hl.is_snp(mt_joint.alleles[0], mt_joint.alleles[1])
            & (hl.agg.mean(mt_joint.GT.n_alt_alleles()) / 2 > 0.001)
            & (hl.agg.fraction(hl.is_defined(mt_joint.GT)) > 0.99)).
                    naive_coalesce(1000))

        logger.info(
            "Checkpoint: writing joint filtered MT before LD pruning...")
        mt_joint = mt_joint.checkpoint(get_mt_checkpoint_path(
            dataset=args.exome_cohort,
            part='joint_1kg_high_callrate_common_snp_biallelic'),
                                       overwrite=True)

        logger.info(
            f"Running ld_prune with r2 = {args.ld_prune_r2} on MT with {mt_joint.count_rows()} variants..."
        )
        # remove correlated variants
        pruned_variant_table = hl.ld_prune(mt_joint.GT,
                                           r2=args.ld_prune_r2,
                                           bp_window_size=500000,
                                           memory_per_core=512)
        mt_joint = (mt_joint.filter_rows(
            hl.is_defined(pruned_variant_table[mt_joint.row_key])))

        logger.info("Writing filtered joint MT with variants in LD pruned...")
        (mt_joint.write(get_qc_mt_path(
            dataset=args.exome_cohort + '_1kg',
            part='joint_high_callrate_common_snp_biallelic',
            split=True,
            ld_pruned=True),
                        overwrite=args.overwrite))

    logger.info("Importing filtered joint MT...")
    mt_joint = hl.read_matrix_table(
        get_qc_mt_path(dataset=args.exome_cohort + '_1kg',
                       part='joint_high_callrate_common_snp_biallelic',
                       split=True,
                       ld_pruned=True))

    logger.info(f"Running PCA with {mt_joint.count_rows()} variants...")
    # run pca on merged dataset
    eigenvalues, pc_scores, _ = hl.hwe_normalized_pca(mt_joint.GT,
                                                      k=args.n_pcs)

    logger.info(f"Eigenvalues: {eigenvalues}")  # TODO: save eigenvalues?

    # Annotate PC array as independent fields.
    pca_table = (pc_scores.annotate(
        **{'PC' + str(k + 1): pc_scores.scores[k]
           for k in range(args.n_pcs)}).drop('scores'))

    logger.info(f"Writing HT with PCA results...")
    # write as HT
    output_ht_path = get_sample_qc_ht_path(dataset=args.exome_cohort,
                                           part='joint_pca_1kg')
    pca_table.write(output=output_ht_path)

    if args.write_to_file:
        (pca_table.export(f'{output_ht_path}.tsv.bgz'))

    # Stop Hail
    hl.stop()

    print("Done!")
Example No. 24
def test_ld_prune(self):
    ds = hl.split_multi_hts(
        hl.import_vcf(resource('sample.vcf')))
    # ld_prune takes a call expression and returns a Table of kept variants
    pruned_ht = hl.ld_prune(ds.GT, r2=0.2, bp_window_size=1000000)
    pruned_ht.count()
Example No. 25
def main(args):

    # Init Hail
    hl.init(default_reference=args.default_reference)

    if not args.skip_compute_pc_relate:

        if not args.skip_filter_data:
            # Read MatrixTable
            mt = hl.read_matrix_table(args.mt_input_path)

            # filter variants (bi-allelic, high-callrate, common SNPs)
            logger.info(
                f"Filtering to bi-allelic, high-callrate, common SNPs ({args.maf_threshold}) for pc_relate..."
            )

            mt = (mt.filter_rows(
                (hl.len(mt.alleles) == 2)
                & hl.is_snp(mt.alleles[0], mt.alleles[1])
                & (hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > args.maf_threshold)
                & (hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99)
                & ~mt.was_split).repartition(500, shuffle=False))

            # keep only GT entry field and force to evaluate expression
            (mt.select_entries(mt.GT).write(
                f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.filtered_high_confidence_variants.mt',
                overwrite=args.overwrite))

        mt = hl.read_matrix_table(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.filtered_high_confidence_variants.mt'
        )

        if not args.skip_prune_ld:
            # LD pruning
            # Avoid filtering out entries (genotypes) with missingness before running LD pruning
            # Zulip Hail support issue -> "BlockMatrix trouble when running pc_relate"
            # mt = mt.unfilter_entries()

            # Prune variants in linkage disequilibrium;
            # ld_prune returns a table of nearly uncorrelated variants

            logger.info(
                f'Pruning variants in LD from MT with {mt.count_rows()} variants...'
            )

            pruned_variant_table = hl.ld_prune(mt.GT, r2=args.r2)

            # Keep LD-pruned variants
            pruned_mt = (mt.filter_rows(hl.is_defined(
                pruned_variant_table[mt.row_key]),
                                        keep=True))
            pruned_mt.write(
                f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.ld_pruned.mt',
                overwrite=args.overwrite)

        pruned_mt = hl.read_matrix_table(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.ld_pruned.mt')
        v, s = pruned_mt.count()
        logger.info(f'{s} samples, {v} variants found in LD-pruned MT')

        pruned_mt = pruned_mt.select_entries(
            GT=hl.unphased_diploid_gt_index_call(pruned_mt.GT.n_alt_alleles()))

        # run pc_relate method...compute all stats
        logger.info('Running PCA for PC-Relate...')
        eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT,
                                               k=10,
                                               compute_loadings=False)
        scores.write(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.pruned.pca_scores_for_pc_relate.ht',
            overwrite=args.overwrite)

        logger.info('Running PC-Relate...')
        scores = hl.read_table(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.pruned.pca_scores_for_pc_relate.ht'
        )
        relatedness_ht = hl.pc_relate(
            call_expr=pruned_mt.GT,
            min_individual_maf=args.min_individual_maf,
            scores_expr=scores[pruned_mt.col_key].scores,
            block_size=4096,
            min_kinship=args.min_kinship,
            statistics='all')

        logger.info('Writing relatedness table...')
        # Write/export table to file
        relatedness_ht.write(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.relatedness_kinship.ht',
            overwrite=args.overwrite)

        # Write PCs table to file (if specified)
        # if args.write_to_file:
        #    # Export table to file
        #    relatedness_ht.export(output=f'{args.ht_output_path}.tsv.bgz')

    # retrieve maximal independent set of related samples
    logger.info('Getting optimal set of related samples to prune...')

    relatedness_ht = hl.read_table(
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.relatedness_kinship.ht')

    relatedness_ht = (relatedness_ht.flatten().rename({
        'i.s': 'i',
        'j.s': 'j'
    }).repartition(100))

    # import trios info
    fam = import_fam_ht()
    mat_ids = hl.set(fam.mat_id.collect())
    fat_ids = hl.set(fam.pat_id.collect())

    # rank samples by retention priority (e.g. cases over controls)
    tb_rank = make_sample_rank_table(get_sample_meta_data())

    # apply min kinship to consider related pairs
    relatedness_ht = (relatedness_ht.filter(relatedness_ht.kin > MIN_KINSHIP))

    # run maximal_independent_set stratified by groups
    # Note: running it on all pairs at once removes most of the trio index cases; we want
    # to keep the index cases, since they are mostly affected individuals rather than parents.

    # defining pairs group
    # TODO: check groups with updated fam file
    relatedness_ht = (relatedness_ht.annotate(pairs_group=hl.case().when(
        relatedness_ht.kin > 0.40, 'twins_or_dups').when(
            mat_ids.contains(relatedness_ht.i)
            | mat_ids.contains(relatedness_ht.j), 'pairs_child_mat').when(
                fat_ids.contains(relatedness_ht.i)
                | fat_ids.contains(relatedness_ht.j),
                'pairs_child_fat').default('pairs_others')))

    groups = (relatedness_ht.aggregate(
        hl.agg.collect_as_set(relatedness_ht['pairs_group'])))
    tbs = []
    for pair_group in groups:
        pair_ht = relatedness_ht.filter(
            relatedness_ht.pairs_group == pair_group)
        tb = get_related_samples_to_drop(rank_table=tb_rank,
                                         relatedness_ht=pair_ht)
        tbs.append(tb)

    related_samples_to_remove = hl.Table.union(*tbs)

    related_samples_to_remove.describe()

    related_samples_to_remove = related_samples_to_remove.checkpoint(
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.related_samples_to_remove.ht',
        overwrite=args.overwrite)

    if args.write_to_file:
        (related_samples_to_remove.flatten().export(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.related_samples_to_remove.tsv'
        ))

    hl.stop()
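
make_sample_rank_table and get_related_samples_to_drop are likewise external helpers. A minimal sketch of the per-group pruning step in the usual gnomAD style, assuming rank_table is keyed by s with an integer rank field where a lower rank means higher retention priority (names and fields are assumptions):

import hail as hl

def get_related_samples_to_drop(rank_table: hl.Table,
                                relatedness_ht: hl.Table) -> hl.Table:
    # attach each pair member's retention rank, then take the maximal
    # independent set, breaking ties in favour of lower-ranked samples
    pairs = relatedness_ht.select(
        node_i=hl.struct(s=relatedness_ht.i,
                         rank=rank_table[relatedness_ht.i].rank),
        node_j=hl.struct(s=relatedness_ht.j,
                         rank=rank_table[relatedness_ht.j].rank))
    to_drop = hl.maximal_independent_set(
        pairs.node_i, pairs.node_j, keep=False,
        tie_breaker=lambda l, r: l.rank - r.rank)
    return to_drop.key_by(s=to_drop.node.s).select()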