Example #1
def test_hwe_normalized_pca():
    mt = hl.balding_nichols_model(3, 100, 50)
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(mt.GT, k=2, compute_loadings=True)

    assert len(eigenvalues) == 2
    assert isinstance(scores, hl.Table)
    assert scores.count() == 100
    assert isinstance(loadings, hl.Table)

    _, _, loadings = hl.hwe_normalized_pca(mt.GT, k=2, compute_loadings=False)
    assert loadings is None
Example #2
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    tob_wgs = hl.read_matrix_table(TOB_WGS)
    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)

    # keep loci that are contained in the densified, filtered tob-wgs mt
    hgdp_1kg = hgdp_1kg.semi_join_rows(tob_wgs.rows())

    # Entries and columns must be identical
    tob_wgs_select = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA)).select_cols()
    hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT).select_cols()
    # Join datasets
    hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select)
    # Add in metadata information
    hgdp_1kg_metadata = hgdp_1kg.cols()
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols(
        hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s])
    # save this for population-level PCAs
    mt_path = output_path('hgdp1kg_tobwgs_joined_all_samples.mt')
    if not hl.hadoop_exists(mt_path):
        hgdp1kg_tobwgs_joined.write(mt_path)

    # Perform PCA
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        hgdp1kg_tobwgs_joined.GT, compute_loadings=True, k=20)
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
Example #3
def query(output, pop):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    if pop:
        # Get samples from the specified population only
        mt = mt.filter_cols(
            (mt.hgdp_1kg_metadata.population_inference.pop == pop.lower())
            | (mt.s.contains('TOB'))
        )
    else:
        mt = mt.filter_cols(mt.s.contains('TOB'))

    # Perform PCA
    eigenvalues_path = f'{output}/eigenvalues.ht'
    scores_path = f'{output}/scores.ht'
    loadings_path = f'{output}/loadings.ht'
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        mt.GT, compute_loadings=True, k=20
    )
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
Example #4
def hwe_normalized_pca(
        qc_mt: hl.MatrixTable,
        related_samples_to_drop: Optional[hl.Table] = None,
        n_pcs: int = 10
) -> Tuple[List[float], hl.Table, hl.Table]:
    """
    First runs PCA excluding the given related samples,
    then projects these samples in the PC space to return scores for all samples.
    The `related_samples_to_drop` Table must be keyed by the sample ID; all samples present in this
    table will be excluded from the PCA.
    The loadings Table returned also contains a `pca_af` annotation which is the allele frequency
    used for PCA. This is useful to project other samples in the PC space.
    :param qc_mt: Input QC MT
    :param related_samples_to_drop: Optional table of related samples to drop
    :param n_pcs: Number of PCs to compute
    :return: eigenvalues, scores and loadings
    """
    unrelated_mt = qc_mt

    if related_samples_to_drop:
        unrelated_mt = qc_mt.filter_cols(hl.is_missing(related_samples_to_drop[qc_mt.col_key]))

    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(unrelated_mt.GT, k=n_pcs, compute_loadings=True)
    pca_af_ht = unrelated_mt.annotate_rows(pca_af=hl.agg.mean(unrelated_mt.GT.n_alt_alleles()) / 2).rows()
    pca_loadings = pca_loadings.annotate(pca_af=pca_af_ht[pca_loadings.key].pca_af)

    if not related_samples_to_drop:
        return pca_evals, pca_scores, pca_loadings
    else:
        related_mt = qc_mt.filter_cols(hl.is_defined(related_samples_to_drop[qc_mt.col_key]))
        related_scores = pc_project(related_mt, pca_loadings)
        pca_scores = pca_scores.union(related_scores)
        return pca_evals, pca_scores, pca_loadings
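# A minimal usage sketch (hypothetical paths; assumes `qc_mt` is an LD-pruned QC
# MatrixTable and `related_ht` is keyed by the sample ID, as the docstring requires):
qc_mt = hl.read_matrix_table('gs://my-bucket/qc.mt')  # hypothetical path
related_ht = hl.read_table('gs://my-bucket/related_samples.ht')  # hypothetical path
evals, scores_ht, loadings_ht = hwe_normalized_pca(
    qc_mt, related_samples_to_drop=related_ht, n_pcs=10)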
Example #5
def compute_relatedness(
    data_type: str = "genomes",
    overwrite: bool = False,
) -> hl.Table:
    """
    Compute a relatedness Table on the CCDG VDS of the given data type using `pc_relate`.
    :param data_type: Whether data is from genomes or exomes, default is genomes
    :param overwrite: Whether to overwrite the file
    :return: Table of pc_relate results
    :rtype: hl.Table
    """
    logger.info("Computing relatedness table on CCDG %s VDS", data_type)
    pca_var_ht = hl.read_table(get_pca_variants_path())
    mt = hl.vds.to_dense_mt(get_qc_vds(data_type))
    mt = mt.filter_rows(hl.is_defined(pca_var_ht[mt.row_key]))
    eig, scores, _ = hl.hwe_normalized_pca(mt.GT, k=10, compute_loadings=False)
    scores = scores.checkpoint(
        get_ccdg_results_path(data_type=data_type, result="pc_scores"),
        overwrite=overwrite,
        _read_if_exists=not overwrite,
    )
    relatedness_ht = hl.pc_relate(
        mt.GT,
        min_individual_maf=0.01,
        scores_expr=scores[mt.col_key].scores,
        block_size=4096,
        min_kinship=0.05,
        statistics="all",
    )
    return relatedness_ht.checkpoint(
        get_ccdg_results_path(data_type=data_type, result="relatedness"),
        overwrite=overwrite,
        _read_if_exists=(not overwrite),
    )
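# A typical follow-up sketch: derive the set of samples to drop from the relatedness
# Table, as done in a later example. The kinship threshold of 0.125 (2nd degree)
# is illustrative:
rel = compute_relatedness()
pairs = rel.filter(rel.kin > 0.125)
related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, keep=False)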
Example #6
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')
    snp_chip = hl.read_matrix_table(SNP_CHIP).key_rows_by('locus', 'alleles')

    # filter to loci that are contained in snp-chip data after densifying
    tob_wgs = hl.experimental.densify(tob_wgs)
    tob_wgs = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA)
    ).select_cols()
    snp_chip = snp_chip.select_entries(snp_chip.GT).select_cols()
    snp_chip = snp_chip.key_cols_by(s=snp_chip.s + '_snp_chip')
    tob_combined = tob_wgs.union_cols(snp_chip)
    tob_combined = tob_combined.cache()
    print(tob_combined.count_rows())

    # Perform PCA
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        tob_combined.GT, compute_loadings=True, k=20
    )
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
Example #7
def run_gwas(vcf_file, phenotypes_file, output_file):
    table = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')

    mt = mt.annotate_cols(pheno=table[mt.s])

    downsampled = mt.sample_rows(0.01, seed=11223344)
    eigenvalues, pcs, _ = hl.hwe_normalized_pca(downsampled.GT)

    mt = mt.annotate_cols(scores=pcs[mt.s].scores)

    gwas = hl.linear_regression_rows(
        y=mt.pheno.CaffeineConsumption,
        x=mt.GT.n_alt_alleles(),
        covariates=[1.0, mt.scores[0], mt.scores[1], mt.scores[2]])

    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles),
                       P=gwas.p_value)
    gwas = gwas.key_by(gwas.SNP)
    gwas = gwas.select(gwas.P)
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
Example #8
def test_pcrelate_paths():
    mt = hl.balding_nichols_model(3, 50, 100)
    _, scores3, _ = hl.hwe_normalized_pca(mt.GT, k=3, compute_loadings=False)

    kin1 = hl.pc_relate(mt.GT, 0.10, k=2, statistics='kin', block_size=64)
    kin2 = hl.pc_relate(mt.GT,
                        0.05,
                        k=2,
                        min_kinship=0.01,
                        statistics='kin2',
                        block_size=128).cache()
    kin3 = hl.pc_relate(mt.GT,
                        0.02,
                        k=3,
                        min_kinship=0.1,
                        statistics='kin20',
                        block_size=64).cache()
    kin_s1 = hl.pc_relate(mt.GT,
                          0.10,
                          scores_expr=scores3[mt.col_key].scores[:2],
                          statistics='kin',
                          block_size=32)

    assert kin1._same(kin_s1, tolerance=1e-4)

    assert kin1.count() == 50 * 49 / 2

    assert kin2.count() > 0
    assert kin2.filter(kin2.kin < 0.01).count() == 0

    assert kin3.count() > 0
    assert kin3.filter(kin3.kin < 0.1).count() == 0
Example #9
def query(output):
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    eigenvalues_path = f'{output}/eigenvalues_10k.csv'
    scores_path = f'{output}/scores_10k.ht'
    loadings_path = f'{output}/loadings_10k.ht'
    downsampled_mt_path = f'{output}/downsampled_mt.mt'

    # filter out variants with a call rate <0.99 and variants where there
    # is no non-reference allele called.
    mt_qc = hl.variant_qc(mt)
    filt_mt = mt_qc.filter_rows((mt_qc.variant_qc.call_rate >= 0.99)
                                & (mt_qc.variant_qc.n_non_ref >= 1))
    nrows = filt_mt.count_rows()
    # Downsample the dataset to approximately 10k randomly-selected rows
    # (the input must be a proportion)
    downsampled_mt = filt_mt.sample_rows(10000 / nrows, seed=12345)

    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        downsampled_mt.GT, compute_loadings=True, k=20)
    # save the list of eigenvalues
    eigenvalues_df = pd.DataFrame(eigenvalues)
    eigenvalues_df.to_csv(eigenvalues_path, index=False)
    # save the scores, loadings, and downsampled matrix table
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
    downsampled_mt.write(downsampled_mt_path, overwrite=True)
Example #10
def test_pc_project(self):
    mt = hl.balding_nichols_model(3, 100, 50)
    _, _, loadings_ht = hl.hwe_normalized_pca(mt.GT, k=10, compute_loadings=True)
    mt = mt.annotate_rows(af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    loadings_ht = loadings_ht.annotate(af=mt.rows()[loadings_ht.key].af)
    mt_to_project = hl.balding_nichols_model(3, 100, 50)
    ht = hl.experimental.pc_project(mt_to_project.GT, loadings_ht.loadings, loadings_ht.af)
    assert ht._force_count() == 100
Example #11
def joint_pca(
        ref_dirname: str = 'gs://hgdp-1kg/hgdp_tgp/datasets_for_others/lindo/ds_without_outliers/',
        ref_basename: str = 'unrelated',
        in_mt: hl.MatrixTable = None,
        data_basename: str = None,
        npcs: int = 20,
        out_dir: str = None):
    """
    Merges input dataset with ref by [locus, alleles] and runs PCA on merged dataset
    :param ref_dirname: directory name where reference data is
    :param ref_basename: base filename for reference data
    :param in_mt: input data MatrixTable
    :param data_basename: base filename for input data
    :param npcs: number of principal components to be used in PCA
    :param out_dir: output directory where files are going to be saved to
    :return:
    """
    print('\nReading reference data mt')
    ref_mt = hl.read_matrix_table(f'{ref_dirname}{ref_basename}.mt')

    # We need to unkey the datasets and keep only the fields common to both in order to merge them in Hail
    ref_mt = ref_mt.key_cols_by().key_rows_by()
    ref_downsampled = ref_mt.select_cols('s').select_rows(
        'locus', 'alleles').select_entries('GT')
    ref_downsampled = ref_downsampled.key_cols_by('s').key_rows_by(
        'locus', 'alleles')

    data_mt = in_mt.key_cols_by().key_rows_by()
    data_downsampled = data_mt.select_cols('s').select_rows(
        'locus', 'alleles').select_entries('GT')
    data_downsampled = data_downsampled.key_cols_by('s').key_rows_by(
        'locus', 'alleles')

    print('\nJoining Data with Ref by locus & alleles')
    joined = ref_downsampled.union_cols(data_downsampled)

    pca_snps = joined.count_rows()
    if pca_snps > 1000000:
        import warnings
        warnings.warn(
            f'Too many SNPs to be used in PCA: {pca_snps}. This will make PCA run longer'
        )

    print(f'\nRunning PCA with {npcs} principal components')
    pca_evals, pca_scores, _ = hl.hwe_normalized_pca(joined.GT, k=npcs)

    pca_scores = pca_scores.transmute(
        **{f'PC{i}': pca_scores.scores[i - 1]
           for i in range(1, npcs + 1)})
    print(f'\nExporting PCA scores to {out_dir}')
    pca_scores.export(
        f'{out_dir}GWASpy/PCA/{data_basename}/pca_joint/{data_basename}.1kg_hgdp.joint.pca.scores.txt.bgz'
    )
Example #12
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    # Get samples from the specified population only
    mt = mt.filter_cols((
        mt.hgdp_1kg_metadata.population_inference.pop == 'nfe')
                        | (mt.s.contains('TOB')))
    # remove outlier samples, as identified by PCA
    outliers = [
        'TOB1734',
        'TOB1714',
        'TOB1126',
        'TOB1653',
        'TOB1668',
        'TOB1681',
        'TOB1116',
        'TOB1107',
        'TOB1635',
        'HG01628',
        'TOB1675',
        'TOB1125',
        'TOB1762',
        'TOB1263',
        'TOB1640',
        'HG01669',
        'TOB1795',
        'TOB1707',
        'HG01695',
        'HG01694',
        'TOB1673',
        'HG01630',
    ]

    mt = mt.filter_cols(hl.literal(outliers).contains(mt.s), keep=False)

    # Remove related samples at the 2nd degree or closer, as indicated by gnomAD
    mt = mt.filter_cols(mt.hgdp_1kg_metadata.gnomad_release
                        | mt.s.startswith('TOB'))

    # Perform PCA
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        mt.GT, compute_loadings=True, k=20)
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
Example #13
def query(output, pop):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    if pop:
        # Get samples from the specified population only
        mt = mt.filter_cols((
            mt.hgdp_1kg_metadata.population_inference.pop == pop.lower())
                            | (mt.s.contains('TOB')))
    else:
        mt = mt.filter_cols(mt.s.contains('TOB'))

    # Perform PCA
    eigenvalues_path = f'{output}/eigenvalues.csv'
    scores_path = f'{output}/scores.ht'
    loadings_path = f'{output}/loadings.ht'
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        mt.GT, compute_loadings=True, k=20)
    eigenvalues_df = pd.DataFrame(eigenvalues)
    eigenvalues_df.to_csv(eigenvalues_path, index=False)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)

    # get TOB-WGS allele frequencies
    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')
    tob_wgs = tob_wgs.annotate_entries(GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA))
    tob_wgs = tob_wgs.annotate_rows(
        gt_stats=hl.agg.call_stats(tob_wgs.GT, tob_wgs.alleles))

    # Get gnomAD allele frequency of variants that aren't in TOB-WGS
    loadings_gnomad = hl.read_table(GNOMAD_LIFTOVER_LOADINGS).key_by(
        'locus', 'alleles')
    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    hgdp_1kg_row = hgdp_1kg.rows()[loadings_gnomad.locus,
                                   loadings_gnomad.alleles]
    tob_wgs_row = tob_wgs.rows()[loadings_gnomad.locus,
                                 loadings_gnomad.alleles]
    loadings_gnomad = loadings_gnomad.annotate(
        gnomad_AF=hgdp_1kg_row.gnomad_freq.AF,
        gnomad_popmax_AF=hgdp_1kg_row.gnomad_popmax.AF,
        TOB_WGS_AF=tob_wgs_row.gt_stats.AF,
    )
    population_af_metadata = hgdp_1kg.gnomad_freq_meta.collect()
    loadings_gnomad = loadings_gnomad.annotate_globals(
        gnomad_freq_meta=population_af_metadata)
    gnomad_variants = loadings_gnomad.drop('loadings')
    gnomad_variants_path = f'{output}/gnomad_annotated_variants.mt'
    gnomad_variants.write(gnomad_variants_path)
Example #14
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    snp_chip = hl.read_matrix_table(SNP_CHIP)
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    # Perform PCA
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        snp_chip.GT, compute_loadings=True, k=5)
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
Example #15
def project_pcs_relateds(mt_ldpruned, mt, covar_pc_num):
    """
    Takes an LD-pruned matrix table, calculates PCs, and projects those PCs back to related individuals included in mt
    :param mt_ldpruned: matrix table with relatives removed, MAF-filtered and LD-pruned
    :param mt: matrix table with relatives included
    :param covar_pc_num: Number of principal components as covariates to calculate
    :return: returns matrix table with relatives, with PCs annotated
    """
    logging.info('Calculating principal components, annotating main dataset.')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        mt_ldpruned.GT, k=covar_pc_num, compute_loadings=True)

    # Project PCs to related individuals
    # mt of related individuals only, not pop outliers or failing samples QC
    related_mt = mt.filter_cols(
        (mt.related_to_remove == True) & (mt.pop_outlier_sample == False) &
        (hl.len(mt.failing_samples_qc) == 0),
        keep=True)
    mt_ldpruned = mt_ldpruned.annotate_rows(
        pca_af=hl.agg.mean(mt_ldpruned.GT.n_alt_alleles()) / 2)
    mtrows = mt_ldpruned.rows()
    loadings = loadings.annotate(pca_af=mtrows[loadings.locus,
                                               loadings.alleles].pca_af)
    related_scores = pc_project(related_mt, loadings)

    # Add pcs as annotations to main table
    mt = mt.annotate_cols(**{
        'pc' + str(k + 1): scores[mt.s].scores[k]
        for k in range(covar_pc_num)
    })
    # Explanation: for k principal components in range 0 to covar_pc_num-1,
    # make pc k+1 (to start at pc1 instead of pc0) be the corresponding score (keyed by mt.s) from the table scores

    # Add pcs for related individuals
    mt = mt.annotate_cols(**{
        'pc' + str(k + 1): hl.or_else(mt['pc' + str(k + 1)],
                                      related_scores[mt.s].scores[k])
        for k in range(covar_pc_num)
    })
    # Explanation: for k principal components in range from 0 to (covar_pc_num-1)
    # give either the existing pcX, or if missing give the corresponding score (keyed by mt.s)
    # from the table related_scores

    return mt
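# The two annotation steps above, combined into a single sketch: fall back to the
# projected score whenever the direct score is missing (argument names mirror the
# function above; `annotate_pcs_with_fallback` is a hypothetical helper):
def annotate_pcs_with_fallback(mt, scores, related_scores, covar_pc_num):
    return mt.annotate_cols(**{
        'pc' + str(k + 1): hl.or_else(scores[mt.s].scores[k],
                                      related_scores[mt.s].scores[k])
        for k in range(covar_pc_num)
    })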
Example #16
def run_pca(prune_out: str,
            pca_prefix: str,
            overwrite: bool = False):
    """
    Run PCA on a dataset
    :param prune_out: path to the LD-pruned MatrixTable to run PCA on
    :param pca_prefix: directory and filename prefix for where to put PCA output
    :param overwrite: whether to overwrite existing PCA output
    :return:
    """

    mt = hl.read_matrix_table(prune_out)

    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        mt.GT, k=20, compute_loadings=True)
    pca_mt = mt.annotate_rows(pca_af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_mt.rows()[pca_loadings.key].pca_af)

    pca_scores.write(pca_prefix + 'scores.ht', overwrite)
    pca_scores = hl.read_table(pca_prefix + 'scores.ht')
    pca_scores = pca_scores.transmute(
        **{f'PC{i}': pca_scores.scores[i - 1]
           for i in range(1, 21)})
    pca_scores.export(pca_prefix + 'scores.txt.bgz')  # individual-level PCs

    pca_loadings.export(pca_prefix + 'loadings.txt.bgz')

    pca_loadings.write(pca_prefix + 'loadings.ht', overwrite)  # PCA loadings

    #export loadings in plink format
    ht = hl.read_table(pca_prefix + 'loadings.ht')
    ht = ht.key_by()
    ht_loadings = ht.select(
        ID=hl.variant_str(ht.locus, ht.alleles),
        ALT=ht.alleles[1],
        **{f"PC{i}": ht.loadings[i - 1]
           for i in range(1, 21)})
    ht_afreq = ht.select(
        **{
            "#ID": hl.variant_str(ht.locus, ht.alleles),
            "REF": ht.alleles[0],
            "ALT": ht.alleles[1],
            "ALT1_FREQ": ht.pca_af
        })
    ht_loadings.export(pca_prefix + 'loadings.plink.tsv')
    ht_afreq.export(pca_prefix + 'loadings.plink.afreq')
Example #17
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')
    loadings = hl.read_table(GNOMAD_LIFTOVER_LOADINGS).key_by(
        'locus', 'alleles')

    # filter to loci that are contained in both tables and the loadings after densifying
    tob_wgs = hl.experimental.densify(tob_wgs)
    hgdp_1kg = hgdp_1kg.filter_rows(
        hl.is_defined(loadings.index(hgdp_1kg['locus'], hgdp_1kg['alleles']))
        & hl.is_defined(
            tob_wgs.index_rows(hgdp_1kg['locus'], hgdp_1kg['alleles'])))
    tob_wgs = tob_wgs.semi_join_rows(hgdp_1kg.rows())

    # Entries and columns must be identical
    tob_wgs_select = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA))
    hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT)
    hgdp_1kg_select = hgdp_1kg_select.select_cols()
    # Join datasets
    hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select)
    # Add in metadata information
    hgdp_1kg_metadata = hgdp_1kg.cols()
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols(
        hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s])
    mt_path = f'{output}/hgdp1kg_tobwgs_joined_all_samples.mt'
    if not hl.hadoop_exists(mt_path):
        hgdp1kg_tobwgs_joined.write(mt_path)
    hgdp1kg_tobwgs_joined = hl.read_matrix_table(mt_path)

    # Perform PCA
    eigenvalues_path = f'{output}/eigenvalues.csv'
    scores_path = f'{output}/scores.ht'
    loadings_path = f'{output}/loadings.ht'
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        hgdp1kg_tobwgs_joined.GT, compute_loadings=True, k=20)
    # save the list of eigenvalues
    eigenvalues_df = pd.DataFrame(eigenvalues)
    eigenvalues_df.to_csv(eigenvalues_path, index=False)
    # save the scores and loadings as a hail table
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
Example #18
def run_pca(my_data, out_prefix):
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        my_data.GT, k=20, compute_loadings=True)
    pca_mt = my_data.annotate_rows(
        pca_af=hl.agg.mean(my_data.GT.n_alt_alleles()) / 2)
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_mt.rows()[pca_loadings.key].pca_af)

    pca_scores.write(out_prefix + 'scores.ht', args.overwrite)  # `args` is assumed to be parsed CLI arguments from the enclosing scope
    pca_scores = hl.read_table(out_prefix + 'scores.ht')
    pca_scores = pca_scores.transmute(
        **{f'PC{i}': pca_scores.scores[i - 1]
           for i in range(1, 21)})
    pca_scores.export(out_prefix + 'scores.txt.bgz')  # individual-level PCs

    pca_loadings.write(out_prefix + 'loadings.ht',
                       args.overwrite)  # PCA loadings
Example #19
def query(output):
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    eigenvalues_path = f'{output}/eigenvalues.csv'
    scores_path = f'{output}/scores.ht'
    loadings_path = f'{output}/loadings.ht'
    mt = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    # subset to the first 100 samples for testing (all rows are kept)
    mt_head = mt.head(n=mt.count_rows(), n_cols=100)
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        mt_head.GT, compute_loadings=True, k=20
    )
    # save the list of eigenvalues
    eigenvalues_df = pd.DataFrame(eigenvalues)
    eigenvalues_df.to_csv(eigenvalues_path, index=False)
    # save the scores and loadings as a hail table
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
Example #20
def run_pca(mt: hl.MatrixTable, out_prefix: str, overwrite: bool = False):
    """
    Run PCA on a dataset
    :param mt: dataset to run PCA on
    :param out_prefix: directory and filename prefix for where to put PCA output
    :return:
    """
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        mt.GT, k=20, compute_loadings=True)
    pca_mt = mt.annotate_rows(pca_af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_mt.rows()[pca_loadings.key].pca_af)

    pca_scores.write(out_prefix + 'scores.ht', overwrite)
    pca_scores = hl.read_table(out_prefix + 'scores.ht')
    pca_scores = pca_scores.transmute(
        **{f'PC{i}': pca_scores.scores[i - 1]
           for i in range(1, 21)})
    pca_scores.export(out_prefix + 'scores.txt.bgz')  # individual-level PCs

    pca_loadings.write(out_prefix + 'loadings.ht', overwrite)  # PCA loadings
Example #21
def main(args):
    if args.join_qc_mt:
        v2_qc_mt_liftover = get_liftover_v2_qc_mt('exomes', ld_pruned=True, release_only=True)
        v2_qc_mt_liftover = v2_qc_mt_liftover.key_cols_by(s=v2_qc_mt_liftover.s, data_type="v2_exomes")
        v3_qc_mt = qc.mt()
        v3_qc_mt = v3_qc_mt.filter_cols(meta.ht()[v3_qc_mt.col_key].release)
        v3_qc_mt = v3_qc_mt.select_rows().select_cols()
        v3_qc_mt = v3_qc_mt.key_cols_by(s=v3_qc_mt.s, data_type="v3_genomes")
        joint_qc_mt = v2_qc_mt_liftover.union_cols(v3_qc_mt)
        joint_qc_mt.write("gs://gnomad-tmp/v2_exomes_v3_joint_qc.mt", overwrite=args.overwrite)

    if args.run_pc_relate:
        logger.info('Running PC-Relate')
        logger.warning("PC-relate requires SSDs and doesn't work with preemptible workers!")
        joint_qc_mt = hl.read_matrix_table("gs://gnomad-tmp/v2_exomes_v3_joint_qc.mt")
        joint_qc_mt = joint_qc_mt.sample_rows(0.1)
        eig, scores, _ = hl.hwe_normalized_pca(joint_qc_mt.GT, k=10, compute_loadings=False)
        scores = scores.checkpoint(v2_v3_pc_relate_pca_scores.path, overwrite=args.overwrite, _read_if_exists=not args.overwrite)
        relatedness_ht = hl.pc_relate(joint_qc_mt.GT, min_individual_maf=0.01, scores_expr=scores[joint_qc_mt.col_key].scores,
                                      block_size=4096, min_kinship=0.1, statistics='all')
        relatedness_ht.write(v2_v3_relatedness.path, args.overwrite)
Example #22
def run_ref_pca(
        mt: hl.MatrixTable = None,
        npcs: int = 20,
        data_basename: str = None,
        out_dir: str = None):
    """
    Run PCA on a dataset
    :param mt: dataset to run PCA on
    :param npcs: number of principal components to be used in PCA
    :param data_basename: input data basename so outputs can be saved in correct dir
    :param out_dir: directory and filename prefix for where to put PCA output
    :return:
    """
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(mt.GT, k=npcs, compute_loadings=True)
    pca_mt = mt.annotate_rows(pca_af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    pca_loadings = pca_loadings.annotate(pca_af=pca_mt.rows()[pca_loadings.key].pca_af)

    # pca_scores.write(out_dir + 'GWASpy/PCA/' + '1000G_scores.ht', overwrite=True)
    # pca_scores = hl.read_table(out_dir + 'GWASpy/PCA/' + '1000G_scores.ht')
    pca_scores = pca_scores.transmute(**{f'PC{i}': pca_scores.scores[i - 1] for i in range(1, npcs+1)})
    pca_scores.export(f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/1kg_hgdp.project.pca.scores.txt.bgz')  # individual-level PCs

    pca_loadings.write(f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/1kg_hgdp_loadings.ht', overwrite=True)  # PCA loadings
Example #23
def run_gwas(vcf_file, phenotypes_file, output_file):
    table = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')

    mt = mt.annotate_cols(pheno=table[mt.s])
    mt = hl.sample_qc(mt)
    mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4)
                        & (mt.sample_qc.call_rate >= 0.97))
    ab = mt.AD[1] / hl.sum(mt.AD)
    filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1))
                           | (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75))
                           | (mt.GT.is_hom_var() & (ab >= 0.9)))
    mt = mt.filter_entries(filter_condition_ab)
    mt = hl.variant_qc(mt)
    mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

    eigenvalues, pcs, _ = hl.hwe_normalized_pca(mt.GT)

    mt = mt.annotate_cols(scores=pcs[mt.s].scores)

    gwas = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                                     x=mt.GT.n_alt_alleles(),
                                     covariates=[
                                         1.0, mt.pheno.isFemale, mt.scores[0],
                                         mt.scores[1], mt.scores[2]
                                     ])

    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles),
                       P=gwas.p_value)
    gwas = gwas.key_by(gwas.SNP)
    gwas = gwas.select(gwas.P)
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
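# Optional visualization sketch (`plot_gwas` is a hypothetical helper; assumes Bokeh).
# hl.plot.manhattan expects the table still keyed by locus and alleles, so it should
# be applied to the regression output before the re-keying by SNP done above:
def plot_gwas(gwas_ht, html_path):
    from bokeh.io import output_file, save
    p = hl.plot.manhattan(gwas_ht.p_value)
    output_file(html_path)
    save(p)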
Example #24
print(num_rows)

prob = min(1, 80000 / num_rows)
vds = vds.sample_rows(prob)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# pca
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

print('filter VDS...')
vds = vds.filter_cols(hl.is_defined(rel_exclusion[vds.s]), keep=False)
vds = vds.filter_rows(hl.is_defined(mhc_chr8inv[vds.locus]), keep=False)
vds = vds.filter_rows(
    (vds.locus.contig == "chrX") | (vds.locus.contig == "chrY"), keep=False)

print('PCA...')
eigenvalues, scores, loadings = hl.hwe_normalized_pca(vds.GT, k=10)  # PCA takes the call (GT) entry expression, not the whole dataset

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# write output
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

with hl.utils.hadoop_open(pca_value_file, 'w') as f:
    for val in eigenvalues:
        f.write(str(val) + '\n')
scores.flatten().export(pca_score_file)

# print runtime
stop = timeit.default_timer()
print("runtime: " + str(stop - start) + " seconds")
Example #25
#print(mt.count()) (12194564, 1092)

#annotate MT file
table = (hl.import_table('gs://ines-work/KG-annotation-with-sexencoder.csv',
                         delimiter=',',
                         missing='',
                         quote='"',
                         types={
                             'Gender_Classification': hl.tfloat64
                         }).key_by('Sample'))
mt = mt.annotate_cols(**table[mt.s])

#print(mt.aggregate_cols(agg.counter(mt.Gender_Classification))) {'0.0': 567, '1.0': 525}

#pca
pca_eigenvalues, pca_scores, _ = hl.hwe_normalized_pca(mt.GT, k=2)

mt = mt.annotate_cols(pca=pca_scores[mt.s])
x = pca_scores.scores[0]
y = pca_scores.scores[1]
label = mt.cols()[pca_scores.s].Super_Population
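# A minimal sketch of the intended scatter plot (assumes Bokeh; `x`, `y` and `label`
# as defined above):
from bokeh.io import output_file, save

p = hl.plot.scatter(x, y, label=label, title='PCA', xlabel='PC1', ylabel='PC2')
output_file('pca.html')  # hypothetical output path
save(p)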
Example #26
def main(args):
    mt = hl.read_matrix_table(args.matrixtable)
    # ld pruning
    pruned_ht = hl.ld_prune(mt.GT, r2=0.1)
    pruned_mt = mt.filter_rows(hl.is_defined(pruned_ht[mt.row_key]))
    pruned_mt.write(f"{args.output_dir}/mt_ldpruned.mt", overwrite=True)

    # PC relate
    pruned_mt = pruned_mt.select_entries(
        GT=hl.unphased_diploid_gt_index_call(pruned_mt.GT.n_alt_alleles()))

    eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT,
                                           k=10,
                                           compute_loadings=False)
    scores.write(f"{args.output_dir}/mt_pruned.pca_scores.ht", overwrite=True)

    relatedness_ht = hl.pc_relate(pruned_mt.GT,
                                  min_individual_maf=0.05,
                                  scores_expr=scores[pruned_mt.col_key].scores,
                                  block_size=4096,
                                  min_kinship=0.05,
                                  statistics='kin2')
    relatedness_ht.write(f"{args.output_dir}/mt_relatedness.ht",
                         overwrite=True)
    pairs = relatedness_ht.filter(relatedness_ht['kin'] > 0.125)
    related_samples_to_remove = hl.maximal_independent_set(pairs.i,
                                                           pairs.j,
                                                           keep=False)
    related_samples_to_remove.write(
        f"{args.output_dir}/mt_related_samples_to_remove.ht", overwrite=True)

    pca_mt = pruned_mt.filter_cols(hl.is_defined(
        related_samples_to_remove[pruned_mt.col_key]),
                                   keep=False)
    related_mt = pruned_mt.filter_cols(hl.is_defined(
        related_samples_to_remove[pruned_mt.col_key]),
                                       keep=True)

    variants, samples = pca_mt.count()

    print(f"{samples} samples after relatedness step.")

    # Population pca

    plink_mt = pca_mt.annotate_cols(uid=pca_mt.s).key_cols_by('uid')
    hl.export_plink(plink_mt,
                    f"{args.output_dir}/mt_unrelated.plink",
                    fam_id=plink_mt.uid,
                    ind_id=plink_mt.uid)
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        pca_mt.GT, k=20, compute_loadings=True)
    pca_af_ht = pca_mt.annotate_rows(
        pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2).rows()
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_af_ht[pca_loadings.key].pca_af)
    pca_scores.write(f"{args.output_dir}/mt_pca_scores.ht", overwrite=True)
    pca_loadings.write(f"{args.output_dir}/mt_pca_loadings.ht", overwrite=True)

    pca_mt = pca_mt.annotate_cols(scores=pca_scores[pca_mt.col_key].scores)

    variants, samples = related_mt.count()
    print(
        'Projecting population PCs for {} related samples...'.format(samples))
    #related_scores = pc_project(related_mt, pca_loadings)
    #relateds = related_mt.cols()
    #relateds = relateds.annotate(scores=related_scores[relateds.key].scores)

    pca_mt.write(f"{args.output_dir}/mt_pca.mt", overwrite=True)
    p = hl.plot.scatter(pca_mt.scores[0],
                        pca_mt.scores[1],
                        title='PCA',
                        xlabel='PC1',
                        ylabel='PC2')
    output_file(f"{args.plot_dir}/pca.html")
    save(p)
Example #27
def hwe_normalized_pca():
    mt = get_mt()
    mt = mt.filter_rows(mt.info.AF[0] > 0.01)
    hl.hwe_normalized_pca(mt.GT)
Example #28
mt = mt.annotate_rows(qc=mt.qc.annotate(p_value_hwe=hl.case().when(
    mt.locus.in_autosome(), mt.qc.het_freq_hwe).default(
        hl.agg.filter(mt.imputesex.impute_sex.is_female,
                      hl.agg.hardy_weinberg_test(mt.GT).het_freq_hwe))))

mt = mt.annotate_rows(annotation=mt.annotation.annotate(
    info=mt.annotation.info.annotate(
        AC=mt.annotation.info.AC[mt.annotation.a_index - 1],
        AF=mt.annotation.info.AF[mt.annotation.a_index - 1],
    )))

mt = hl.sample_qc(mt)

mt_pca = mt.filter_rows(hl.is_defined(ht_final_pruned_variants[mt.row_key]))

pca_output = hl.hwe_normalized_pca(mt_pca.GT, k=10)
pca_output = pca_output[1].key_by('s')
pca_output = pca_output.annotate(PC1=pca_output.scores[0],
                                 PC2=pca_output.scores[1],
                                 PC3=pca_output.scores[2],
                                 PC4=pca_output.scores[3],
                                 PC5=pca_output.scores[4],
                                 PC6=pca_output.scores[5],
                                 PC7=pca_output.scores[6],
                                 PC8=pca_output.scores[7],
                                 PC9=pca_output.scores[8],
                                 PC10=pca_output.scores[9])

mt = mt.annotate_cols(pca=pca_output[mt.s])

n = mt.count()
Example #29
def hwe_normalized_pca():
    mt = get_mt()
    mt = mt.filter_rows(mt.info.AF[0] > 0.01)
    hl.hwe_normalized_pca(mt.GT)
    #pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.2, bp_window_size=500000)
    pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.1)

    pruned_mt = mt_vqc_filtered.filter_rows(
        hl.is_defined(pruned_ht[mt_vqc_filtered.row_key]))

    pruned_mt = pruned_mt.filter_rows(hl.is_defined(
        bed_to_exclude_pca[pruned_mt.locus]),
                                      keep=False)

    pruned_mt.write(
        f"{tmp_dir}/ddd-elgh-ukbb/1000g_chr1_20_snps_filtered_ldpruned.mt",
        overwrite=True)
    # run pca
    logger.info("run pca")
    pca_evals, pca_scores, loadings_ht = hl.hwe_normalized_pca(
        pruned_mt.GT, k=10, compute_loadings=True)
    pruned_mt = pruned_mt.annotate_rows(
        af=hl.agg.mean(pruned_mt.GT.n_alt_alleles()) / 2)
    loadings_ht = loadings_ht.annotate(af=pruned_mt.rows()[loadings_ht.key].af)
    pca_scores.write(f"{tmp_dir}/ddd-elgh-ukbb/100g_pca_scores.ht",
                     overwrite=True)
    loadings_ht.write(f"{tmp_dir}/ddd-elgh-ukbb/1000g_pca_loadings.ht",
                      overwrite=True)
    with open(f"{temp_dir}/ddd-elgh-ukbb/1000g_pca_evals.txt", 'w') as f:
        for val in pca_evals:
            f.write(str(val) + '\n')

    ht = pc_project(project_mt.GT, loadings_ht.loadings, loadings_ht.af)
    ht.write(f"{tmp_dir}/ddd-elgh-ukbb/pc_project_our_data.ht", overwrite=True)
Example #31
                                  10000,  # continuation of a truncated hl.balding_nichols_model(...) call
                                  pop_dist=[0.1, 0.2, 0.3, 0.2, 0.2],
                                  fst=[.02, .06, .04, .12, .08],
                                  af_dist=hl.rand_beta(a=0.01,
                                                       b=2.0,
                                                       lower=0.05,
                                                       upper=1.0),
                                  mixture=True)
    mt = hl.variant_qc(mt)
    mt.write('bn.mt', overwrite=True)

mt = hl.read_matrix_table('bn.mt')

if not hl.hadoop_exists('scores.t'):
    # Generate data for demonstration purposes, this should already exist
    scores = hl.hwe_normalized_pca(mt.GT, k=5)[1]
    scores = scores.annotate(**mt.cols()[scores.sample_idx])
    scores.write('scores.t')

pcs = hl.read_table('scores.t')


@routes.get('')
@routes.get('/')
async def get_sha(request):  # pylint: disable=unused-argument
    arr = pcs.collect()
    pca_plot = px.scatter_3d([{
        'id': x['sample_idx'],
        'pop': np.argmax(x['pop']),
        **{f'PC{i}': x['scores'][i]
           for i in range(5)}